Chapter 4. Handling Numerical Data


In [1]:
import numpy as np
from sklearn import preprocessing

ModuleNotFoundError: No module named 'sklearn'

4.1 Rescaling a Feature

In [3]:
feature = np.array([[-500.5],
                   [-100.1],
                   [0],
                   [100.1],
                   [900.9]])

In [4]:
#Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

In [5]:
#Scale feature
scaled_feature = minmax_scale.fit_transform(feature)

In [6]:
#Show feature
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

4.2 Standardizing a Feature
We want to transform a feature to have a mean of 0 and a standard deviation of 1.

In [129]:
# Single-column feature matrix (5 observations x 1 feature); the last value
# is far larger than the rest.
x = np.array([
    [-1000.1],
    [-200.2],
    [500.5],
    [600.6],
    [9000.9],
])

In [9]:
scaler = preprocessing.StandardScaler()

In [10]:
standardized = scaler.fit_transform(x)

In [11]:
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [12]:
print("Mean:",round(standardized.mean()))

Mean: 0.0


In [13]:
print("Standard Deviation:",round(standardized.std()))

Standard Deviation: 1.0


If our data has significant outliers, it can negatively impact our standardization by affecting the feature’s mean and variance. In this scenario, it is often helpful to instead rescale the feature using the median and interquartile range. In scikit-learn, we do this using the RobustScaler class:

In [16]:
robust_scaler = preprocessing.RobustScaler()

In [17]:
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

4.3 Normalizing Observations
We want to rescale the feature values of observations to have unit norm (a total length of 1).

In [19]:
from sklearn.preprocessing import Normalizer

In [20]:
features = np.array([[0.5,0.5],
                    [1.1,3.4],
                    [1.5,20.2],
                    [1.63,34.4],
                    [10.9,3.3]])

In [26]:
normalizer = Normalizer(norm="l2")

In [27]:
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [29]:
features_l2_norm = Normalizer(norm="l2").transform(features)

In [30]:
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [31]:
#Alternatively, we can specify Manhattan norm (L1)
features_l1_norm = Normalizer(norm="l1").transform(features)

In [32]:
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [33]:
#notice that norm='l1' rescales an observation’s values so they sum to 1, which can sometimes be a desirable quality:
print("Sum of the first observaton\'s values:",
     features_l1_norm[0,0] + features_l1_norm[0,1])

Sum of the first observaton's values: 1.0


4.4 Generating Polynomial and Interaction Features

In [34]:
from sklearn.preprocessing import PolynomialFeatures

In [35]:
features = np.array([[2,3],
                    [2,3],
                    [2,3]])

In [38]:
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)

In [39]:
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [40]:
interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)

In [41]:
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

4.5 Transforming Features
We want to transform features based on a function.

In [42]:
from sklearn.preprocessing import FunctionTransformer

In [43]:
features = np.array([[2,3],
                    [2,3],
                    [2,3]])

In [44]:
def add_ten(x):
    """Return ``x`` increased by 10.

    The addition is delegated to ``x``'s own ``+`` operator, so NumPy arrays
    and pandas columns are shifted element-wise.
    """
    shifted = x + 10
    return shifted

In [45]:
ten_transformer = FunctionTransformer(add_ten)

In [46]:
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [48]:
import pandas as pd
df =pd.DataFrame(features,columns=['feature_1','feature_2'])

In [49]:
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


4.6 Detecting Outliers
Detecting outliers is unfortunately more of an art than a science. However, a common method is to assume the data is normally distributed and, based on that assumption, “draw” an ellipse around the data, classifying any observation inside the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1).

In [51]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [61]:
features,_ = make_blobs(n_samples = 10,
                       n_features = 2,
                       centers = 1,
                       random_state = 1)
features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [62]:
# create outliers
features[0,0] = 10000
features[0,1] = 10000
features

array([[ 1.00000000e+04,  1.00000000e+04],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

In [54]:
# contamination is our estimate of the proportion of observations that are outliers.
# Use a low value when we are confident there are few outliers, and a higher value
# when we expect many outliers.
outlier_detector = EllipticEnvelope(contamination =0.1)

In [63]:
outlier_detector.fit(features)

EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
                 store_precision=True, support_fraction=None)

In [64]:
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [None]:
# Rather than looking at observations as a whole, we can look at individual features and
# identify extreme values in those features using the interquartile range (IQR):


In [65]:
features = features[:,0]

array([ 1.00000000e+04, -2.76017908e+00, -1.61734616e+00, -5.25790464e-01,
        8.52518583e-02, -7.94152277e-01, -1.34052081e+00, -1.98197711e+00,
       -2.18773166e+00, -1.97451969e-01])

In [68]:
features

array([ 1.00000000e+04, -2.76017908e+00, -1.61734616e+00, -5.25790464e-01,
        8.52518583e-02, -7.94152277e-01, -1.34052081e+00, -1.98197711e+00,
       -2.18773166e+00, -1.97451969e-01])

In [75]:
def indicies_of_outliers(x, multiplier=1.5):
    """Return the indices of the values in ``x`` that lie outside the IQR fences.

    The fences are ``q1 - multiplier * iqr`` and ``q3 + multiplier * iqr``,
    where ``q1`` and ``q3`` are the 25th and 75th percentiles of ``x`` and
    ``iqr = q3 - q1``. Values strictly beyond either fence are reported.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect. Converted with ``np.asarray`` so plain
        Python lists are accepted as well as arrays.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value must lie to be reported.
        The default reproduces the previously hard-coded 1.5.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x``.

    Notes
    -----
    As a diagnostic, prints ``q1``, ``q3`` and then ``"<lower>,<upper>"`` on
    separate lines before returning.
    """
    # Comparison operators below need an array, not a Python list.
    x = np.asarray(x)
    q1,q3 = np.percentile(x,[25,75])
    print(q1)
    print(q3)
    iqr = q3 - q1
    lower_bound = q1 - (iqr * multiplier)
    upper_bound = q3 + (iqr * multiplier)
    print(str(lower_bound) + "," + str(upper_bound))
    return np.where((x > upper_bound) | (x < lower_bound))

In [76]:
indicies_of_outliers(features)

-1.890819372279752
-0.2795365925809296
-4.3077435418279855,2.137387576967304


(array([0], dtype=int32),)

4.7 Handling Outliers

In [78]:
import pandas as pd

# Small housing table; the last row has implausibly large Bathrooms and
# Square_Feet values compared with the other three rows.
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 432032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

In [79]:
# 1. drop them
houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [80]:
# 2. Mark them as outliers (0 where Bathrooms < 20, otherwise 1)
houses['Outlier'] = np.where(houses['Bathrooms'] < 20,0,1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,432032,116.0,48000,1


In [82]:
# 3. Transform the feature that contains the outlier to dampen the outlier's effect.
# np.log is vectorized, so it is applied to the whole column at once instead of
# looping over the values in Python.
houses['Log_Of_Square_Feet'] = np.log(houses['Square_Feet'])
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,432032,116.0,48000,1,10.778956


4.8 Discretizing Features
Breaking up numerical features into discrete bins.

In [86]:
from sklearn.preprocessing import Binarizer

In [88]:
age = np.array([[6],[12],[20],[36],[65]])

In [90]:
# threshold must be passed by keyword in current scikit-learn releases.
# Values greater than the threshold map to 1; all other values map to 0.
binaizer = Binarizer(threshold=18)

In [91]:
binaizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [93]:
np.digitize(age,bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int32)

In [94]:
# Setting right=True makes each bin include its right edge (bins[i-1] < x <= bins[i]),
# so 20 is counted in bin 0. No age lies in (20, 30], so bin 1 is empty;
# 36 falls in bin 2 and 65 falls in bin 3.
np.digitize(age,bins=[20,30,64],right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int32)

In [95]:
np.digitize(age,bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int32)

4.9 Grouping Observations Using Clustering
We want to cluster observations so that similar observations are grouped together. 
If we know that we have k groups, we can use k-means clustering to group similar observations and output a new feature containing each observation’s group membership:


In [105]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [106]:
features,_ = make_blobs(n_samples = 50,
                       n_features = 2,
                       centers = 3,
                       random_state = 1)

In [107]:
dataframe = pd.DataFrame(features,columns=["features_1","feature_2"])

In [108]:
# Name the arguments and pin n_init explicitly: its default differs between
# scikit-learn releases, and 10 is the value this notebook's recorded fit used.
clusterer = KMeans(n_clusters=3, random_state=0, n_init=10)

In [109]:
clusterer.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [110]:
dataframe["group"] = clusterer.predict(features)

In [112]:
dataframe.head(10)

Unnamed: 0,features_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,2
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


4.10 Deleting Observations with Missing Values

In [114]:
features = np.array([[1.1,11.1],
                    [2.2,22.2],
                    [3.3,33.3],
                    [4.4,44.4],
                    [np.nan,55]])

In [115]:
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [116]:
dataframe = pd.DataFrame(features,columns=["features_1","feature_2"])

In [117]:
dataframe.dropna()

Unnamed: 0,features_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


4.11 Imputing Missing Values

In [2]:
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

ModuleNotFoundError: No module named 'fancyimpute'

In [120]:
# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
                         n_features = 2,
                         random_state = 1)

In [121]:
# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

NameError: name 'StandardScaler' is not defined