## Handling Numerical Data

### Rescaling a Feature

In [17]:
import numpy as np
from sklearn import preprocessing
# Build a single-column feature: five observations, one value each
feature = np.array([
    [-500.5],
    [-100.1],
    [0],
    [100.1],
    [900.9],
])
# Configure a min-max scaler that targets the interval [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
# First learn the feature's minimum and maximum, then rescale it
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)
print(scaled_feature)


[[0.        ]
 [0.28571429]
 [0.35714286]
 [0.42857143]
 [1.        ]]


#### Scikit-learn’s `MinMaxScaler` offers two options to rescale a feature. One option is to use `fit` to calculate the minimum and maximum values of the feature, then use `transform` to rescale the feature. The second option is to use `fit_transform` to do both operations at once. There is no mathematical difference between the two options, but there is sometimes a practical benefit to keeping the operations separate because it allows us to apply the same transformation to different sets of the data.

### Standardizing a Feature

In [20]:
import numpy as np
from sklearn import preprocessing
# Single-column feature: five observations, one value each
feature = np.array([
    [-500.5],
    [-100.1],
    [0],
    [100.1],
    [900.9],
])
# Instantiate the standard scaler
create_stanadard = preprocessing.StandardScaler()
# Fit the scaler on the feature, then transform that same feature
standardized = create_stanadard.fit(feature).transform(feature)
print(standardized)
# Report the mean and standard deviation of the transformed values
print('standardized mean', standardized.mean())
print('standardized std', standardized.std())

[[-1.26687088]
 [-0.39316683]
 [-0.17474081]
 [ 0.0436852 ]
 [ 1.79109332]]
standardized mean 0.0
standardized std 1.0


#### If our data has significant outliers, they can negatively impact our standardization by affecting the feature’s mean and variance. In this scenario, it is often helpful to instead rescale the feature using the median and interquartile range. In scikit-learn, we do this using the `RobustScaler` class:

In [22]:
# Create the robust scaler. NOTE: this cell has no imports of its own; it
# relies on `preprocessing` having been imported by an earlier cell.
robust_scaler = preprocessing.RobustScaler()
# Fit the scaler to `feature` (defined in an earlier cell) and display the
# rescaled values as the cell output
robust_scaler.fit_transform(feature)

array([[-2.5],
       [-0.5],
       [ 0. ],
       [ 0.5],
       [ 4.5]])

### Normalizing Observations

In [30]:
import numpy as np
from sklearn.preprocessing import Normalizer
# Feature matrix: five observations (rows), two features (columns)
features = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])
# Normalizer configured for the L2 norm
normalizer = Normalizer(norm='l2')
# Rescale the feature matrix and show the result as the cell output
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

#### Alternatively, we can specify Manhattan norm (L1):

In [34]:
# Normalize the feature matrix with the L1 norm
# (`features` and `Normalizer` come from the previous cell)
l1_normalizer = Normalizer(norm='l1')
features_l1_norm = l1_normalizer.transform(features)
print(features_l1_norm)
# Add up the two values of the first observation
value_a, value_b = features_l1_norm[0]
print("Sum of the first observation\'s values:", value_a + value_b)

[[0.5        0.5       ]
 [0.24444444 0.75555556]
 [0.06912442 0.93087558]
 [0.04524008 0.95475992]
 [0.76760563 0.23239437]]
Sum of the first observation's values: 1.0


### Generating Polynomial and Interaction Features

In [37]:
# Load libraries
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# Feature matrix: three observations, two features
features = np.array([[2, 4], [2, 5], [2, 3]])
# Degree-2 polynomial expansion without the constant (bias) column
polynomial_interaction = PolynomialFeatures(
    degree=2,
    include_bias=False,
)
# Generate the expanded feature matrix and show it as the cell output
polynomial_interaction.fit_transform(features)

array([[ 2.,  4.,  4.,  8., 16.],
       [ 2.,  5.,  4., 10., 25.],
       [ 2.,  3.,  4.,  6.,  9.]])

### Transforming Features

In [42]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Feature matrix: three observations, two features
features = np.array([
    [2, 4],
    [3, 5],
    [6, 8],
])


def scale_down_mul(x):
    """Return ``x`` multiplied by 0.05.

    ``x`` may be anything that supports multiplication by a float
    (a scalar, a NumPy array, a pandas Series, ...).
    """
    factor = .05
    return x * factor

# Wrap the scaling function in a transformer object
function_transformer = FunctionTransformer(func=scale_down_mul)
# Apply it to the feature matrix and show the result as the cell output
function_transformer.transform(features)


array([[0.1 , 0.2 ],
       [0.15, 0.25],
       [0.3 , 0.4 ]])

### We can create the same transformation in pandas using `apply`:

In [43]:
# Load library
import pandas as pd
# Create a DataFrame from the feature matrix, naming its two columns.
# NOTE: `features` is not defined in this cell; it comes from an earlier cell.
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Apply `scale_down_mul` (also defined in an earlier cell) to the DataFrame
# and display the result as the cell output
df.apply(scale_down_mul)

Unnamed: 0,feature_1,feature_2
0,0.1,0.2
1,0.15,0.25
2,0.3,0.4


### Detecting Outliers

In [52]:
# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
# Create simulated data: ten observations, two features, a single cluster
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)
# Overwrite the first observation with extreme values so it stands out
features[0, 0] = 1000
features[0, 1] = 1000
# Create detector. `contamination` is the assumed proportion of outliers.
# `random_state` is fixed so that the estimator's internal random sampling,
# and therefore this cell's output, is reproducible across runs.
outlier_detector = EllipticEnvelope(contamination=.1, random_state=1)
# Fit detector
outlier_detector.fit(features)
# Predict outliers (in the output, -1 flags an outlier and 1 an inlier)
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [54]:
# Take the first column of `features` as a single 1-D feature.
# NOTE: `features` is not defined in this cell; it comes from an earlier cell.
feature = features[:,0]
# Create a function to return index of outliers
def indicies_of_outliers(x, iqr_multiplier=1.5):
    """Return the indices of values that fall outside the IQR fences.

    A value is reported when it is below ``q1 - iqr_multiplier * iqr`` or
    above ``q3 + iqr_multiplier * iqr``, where ``q1`` and ``q3`` are the 25th
    and 75th percentiles of ``x`` and ``iqr = q3 - q1``.

    Parameters
    ----------
    x : array-like
        Values to inspect. Lists and tuples are accepted; they are converted
        with ``np.asarray`` before any comparison is made.
    iqr_multiplier : float, default 1.5
        How many interquartile ranges beyond the quartiles the fences sit.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x``. The arrays are empty when ``x`` is empty or holds
        no outliers.
    """
    # Convert up front so plain Python sequences support the element-wise
    # comparisons below (a bare ``list > float`` raises TypeError).
    values = np.asarray(x)
    if values.size == 0:
        # Percentiles are undefined for empty input; report "no outliers"
        # in the same tuple-of-index-arrays form as the normal path.
        return np.where(np.zeros(values.shape, dtype=bool))
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * iqr_multiplier)
    upper_bound = q3 + (iqr * iqr_multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))
# Run the IQR-based detector on the single feature selected above; the cell
# output is the tuple of index arrays returned by np.where
indicies_of_outliers(feature)

(array([0], dtype=int64),)

### Discretizing Features

In [3]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer
# Create feature: a single column of ages
age = np.array([[6], [12], [20], [36], [65]])
# Binarizer that uses 18 as the cut-off
binarizer = Binarizer(threshold=18)
# Convert each age to 0 or 1 around the threshold (see the output below)
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

###  Grouping Observations Using Clustering

###  Deleting Observations with Missing Values

In [6]:
# Load library
import numpy as np
# Feature matrix whose last observation is missing its first value
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])
# Select only the rows in which no entry is NaN (~ negates the row mask)
features[~np.any(np.isnan(features), axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [7]:
# Load library
import pandas as pd
# Build a DataFrame from the feature matrix, naming its two columns.
# NOTE: `features` is not defined in this cell; it comes from the previous cell.
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Show the observations that have no missing values. The result is only
# displayed, not assigned, so `dataframe` itself is left as constructed.
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


### Imputing Missing Values

In [18]:
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Simulate 20 observations with two features each
features, _ = make_blobs(n_samples=20, n_features=2, random_state=2)

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

# Remember the real first value, then overwrite it with NaN to simulate
# a missing entry
first_value = scaled_features[0, 0]
scaled_features[0, 0] = np.nan

# KNN imputer configured with k=5
imputer = KNN(k=5, verbose=0)

# Fill in the missing entry
imputed_features = imputer.fit_transform(scaled_features)

# Show the imputed value next to the value that was removed
print('imputed_features', imputed_features[0, 0])
print('first Value', first_value)

imputed_features -0.09640887746815932
first Value 1.0871343089471486
