In [1]:
import pandas as pd
import numpy as np
## Load the housing dataset from the CSV file in the working directory.
housing = pd.read_csv("housing.csv")

## Keep the numeric columns in one frame and the "ocean_proximity" column
## (as a one-column DataFrame) in another.
housing_num = housing.select_dtypes(include=[np.number])
housing_cat = housing.loc[:, ["ocean_proximity"]]

# Scaling

Most ML algorithms don't perform well when the numerical input attributes have very different scales. E.g., the total number of rooms ranges from 6 to 40000, while the median income only ranges from 0 to 15. Without scaling, most models will be biased toward ignoring the median income and focusing more on the number of rooms.

Solution: min-max scaling (also called normalization) or standardization.

In [2]:
## 1. Min-max scaling: every column is shifted and rescaled into a fixed range.
## By default the values end up ranging from 0 to 1; here feature_range is
## adjusted so that they range from -1 to 1 instead.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)
housing_num_min_max_scaled

array([[-0.57768924,  0.13496281,  0.56862745, ..., -0.95888834,
         0.07933684,  0.80453276],
       [-0.57569721,  0.13071201, -0.21568627, ..., -0.62604835,
         0.07605412,  0.41649313],
       [-0.57968127,  0.12858661,  1.        , ..., -0.94211478,
        -0.06794389,  0.39010148],
       ...,
       [-0.37649402,  0.46439957, -0.37254902, ..., -0.85791811,
        -0.83447125, -0.6812343 ],
       [-0.39641434,  0.46439957, -0.33333333, ..., -0.88554514,
        -0.8114095 , -0.71257438],
       [-0.38047809,  0.45164718, -0.41176471, ..., -0.82601546,
        -0.73949325, -0.69319302]])

In [3]:
## 2. Standardization => zero mean and unit standard deviation for every column.
## It does not restrict values to a particular range and is less affected by outliers.
## E.g. suppose a district has a median income of 100 instead of the usual 0-15:
## min-max scaling to 0-1 would map this outlier to 1 and squash all the other values
## into the range 0-0.15, whereas standardization would not be much affected.
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(housing_num)
housing_num_std_scaled


array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.97703285,
         2.34476576,  2.12963148],
       [-1.32284391,  1.04318455, -0.60701891, ...,  1.66996103,
         2.33223796,  1.31415614],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.84363692,
         1.7826994 ,  1.25869341],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.17404163,
        -1.14259331, -0.99274649],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.39375258,
        -1.05458292, -1.05860847],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.07967221,
        -0.78012947, -1.01787803]])

# Transformations

When a feature's distribution has a heavy tail, i.e. when values far from the mean are not exponentially rare, both min-max scaling and standardization will squash most values into a small range, which is not good. So before we scale the feature, we should first transform it to shrink the heavy tail and, if possible, make the distribution roughly symmetrical, i.e. close to a Gaussian distribution.

1. For positive features with a heavy tail to the right: replace the feature with its square root (or raise it to any power between 0 and 1).

2. For features with a really long and heavy tail: replace the feature with its logarithm. E.g. population: districts with 10000 inhabitants are only 10 times less frequent than districts with 1000 inhabitants, not exponentially less frequent.

3. For heavy-tailed features: bucketize the feature.
Chop the distribution into roughly equal-sized buckets and replace each feature value with the index of the bucket it belongs to, e.g. an income category, or replace each value with its percentile. This gives an almost uniform distribution, so no further scaling is needed.

4. For features with a multimodal distribution (two or more clear peaks, called modes): bucketize the feature, but treat the bucket IDs as categories rather than as numerical values. The bucket indices must then be encoded, for example using OneHotEncoder.

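Target values may also need to be transformed. If the target distribution is heavy-tailed, apply a log transformation to it. The model will then predict the log of the target, so apply the exponential to its predictions to get back values on the original scale.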

In [4]:
## Custom transformers
## FunctionTransformer wraps a plain function as a transformer: np.log is the
## forward function and np.exp is supplied as its inverse.
from sklearn.preprocessing import FunctionTransformer

log_transformer = FunctionTransformer(func=np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(housing[["population"]])
log_pop

Unnamed: 0,population
0,5.774552
1,7.783641
2,6.206576
3,6.324359
4,6.336826
...,...
20635,6.739337
20636,5.874931
20637,6.914731
20638,6.608001


# Transformation pipelines

In [6]:
## Sequences of transformations
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline

## Use the Pipeline class when the steps should be given explicit names;
## otherwise make_pipeline is shorter.
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)
## The last step can be a predictor, a transformer or any other type of estimator.
## All the steps before it must be transformers.


In [7]:
## Build the numeric pipeline with make_pipeline (no explicit step names are given).
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
## Calling the pipeline's fit() calls fit_transform() on every step except the last one,
## in order, passing the output of each step as the input to the next.
## The final estimator only gets fit() called on it.
## If the last estimator is a transformer => the pipeline has a transform() method.
## If the last estimator is a predictor => the pipeline has a predict() method.

In [8]:
## Fit the numeric pipeline on the numeric columns and transform them in one call.
housing_num_prepared = num_pipeline.fit_transform(housing_num)
housing_num_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.97703285,
         2.34476576,  2.12963148],
       [-1.32284391,  1.04318455, -0.60701891, ...,  1.66996103,
         2.33223796,  1.31415614],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.84363692,
         1.7826994 ,  1.25869341],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.17404163,
        -1.14259331, -0.99274649],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.39375258,
        -1.05458292, -1.05860847],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.07967221,
        -0.78012947, -1.01787803]])

In [10]:
## Full preprocessing: route each column to a pipeline according to its dtype.
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder

## Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

## Object-dtype columns: fill missing values with the most frequent value, then one-hot encode.
## handle_unknown="ignore" is passed so that categories unseen during fit are ignored.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

housing_prepared = preprocessing.fit_transform(housing)
housing_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])