# Scaling
1. MinMaxScaler (Normalization)
2. StandardScaler (Standardisation)
3. RobustScaler

## A few important tips
* Preferred for data that is normally (Gaussian) distributed
* Scaling should be done after train test split.

In [15]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'gender' : ['M', 'M', 'F', 'F', 'M'],
    'ssc_p' : [82.2,86.2,83.2,72.2,87.2],
    'hsc_p' : [81.2,88.2,87.4,92.6,77.2],
    'age' :[44, 40, 34, 46, 28],
    'height': [6.1, 5.9, 5.4, 4.6, 5.11],
    'salary': [120000, 80000, 210000, 50000, 70000],
    'suffer_from_ds' : ['no', 'no', 'yes', 'yes', 'no']
})

df.head()

Unnamed: 0,gender,ssc_p,hsc_p,age,height,salary,suffer_from_ds
0,M,82.2,81.2,44,6.1,120000,no
1,M,86.2,88.2,40,5.9,80000,no
2,F,83.2,87.4,34,5.4,210000,yes
3,F,72.2,92.6,46,4.6,50000,yes
4,M,87.2,77.2,28,5.11,70000,no


In [16]:
df.describe()

Unnamed: 0,ssc_p,hsc_p,age,height,salary
count,5.0,5.0,5.0,5.0,5.0
mean,82.2,85.32,38.4,5.422,106000.0
std,5.958188,6.093603,7.402702,0.6045,63482.280992
min,72.2,77.2,28.0,4.6,50000.0
25%,82.2,81.2,34.0,5.11,70000.0
50%,83.2,87.4,40.0,5.4,80000.0
75%,86.2,88.2,44.0,5.9,120000.0
max,87.2,92.6,46.0,6.1,210000.0


# Normalization

In [12]:
def normalize(values):
    return (values - values.min())/(values.max() - values.min())

cols = ['ssc_p', 'hsc_p', 'age', 'height', 'salary']
df[cols] = df[cols].apply(normalize)
df.head()

Unnamed: 0,gender,ssc_p,hsc_p,age,height,salary,suffer_from_ds
0,M,0.666667,0.25974,0.888889,1.0,0.4375,no
1,M,0.933333,0.714286,0.666667,0.866667,0.1875,no
2,F,0.733333,0.662338,0.333333,0.533333,1.0,yes
3,F,0.0,1.0,1.0,0.0,0.0,yes
4,M,1.0,0.0,0.0,0.34,0.125,no


# Standardisation


In [17]:
def standardize(values):
    return (values - values.mean())/(values.std())

cols = ['ssc_p', 'hsc_p', 'age', 'height', 'salary']
df[cols] = df[cols].apply(standardize)
df.head()

Unnamed: 0,gender,ssc_p,hsc_p,age,height,salary,suffer_from_ds
0,M,0.0,-0.676119,0.756481,1.121588,0.220534,no
1,M,0.671345,0.472627,0.216137,0.790736,-0.409563,no
2,F,0.167836,0.341342,-0.594378,-0.036394,1.638252,yes
3,F,-1.678363,1.194695,1.026652,-1.359802,-0.882136,yes
4,M,0.839181,-1.332545,-1.404892,-0.516129,-0.567087,no


In [26]:
def normalize(values):
    return (values - values.min())/(values.max() - values.min())
    
def standardize(values):
    return (values - values.mean())/(values.std())

df = pd.DataFrame({
    'gender' : ['M', 'M', 'F', 'F', 'M'],
    'ssc_p' : [82.2,86.2,83.2,72.2,87.2],
    'hsc_p' : [81.2,88.2,87.4,92.6,77.2],
    'age' :[44, 40, 34, 46, 28],
    'height': [6.1, 5.9, 5.4, 4.6, 5.11],
    'salary': [120000, 80000, 210000, 50000, 70000],
    'suffer_from_ds' : ['no', 'no', 'yes', 'yes', 'no']
})

cols = ['ssc_p', 'hsc_p', 'age', 'height', 'salary']
df[cols] = df[cols].apply(standardize)
df[cols] = df[cols].apply(normalize)
df.head()

Unnamed: 0,gender,ssc_p,hsc_p,age,height,salary,suffer_from_ds
0,M,0.666667,0.25974,0.888889,1.0,0.4375,no
1,M,0.933333,0.714286,0.666667,0.866667,0.1875,no
2,F,0.733333,0.662338,0.333333,0.533333,1.0,yes
3,F,0.0,1.0,1.0,0.0,0.0,yes
4,M,1.0,0.0,0.0,0.34,0.125,no


In [18]:
df.describe()

Unnamed: 0,ssc_p,hsc_p,age,height,salary
count,5.0,5.0,5.0,5.0,5.0
mean,-2.2204460000000003e-17,1.376677e-15,1.776357e-16,5.995204e-16,0.0
std,1.0,1.0,1.0,1.0,1.0
min,-1.678363,-1.332545,-1.404892,-1.359802,-0.882136
25%,0.0,-0.6761189,-0.5943775,-0.5161292,-0.567087
50%,0.1678363,0.3413416,0.2161373,-0.03639373,-0.409563
75%,0.6713451,0.4726268,0.7564805,0.7907364,0.220534
max,0.8391814,1.194695,1.026652,1.121588,1.638252


# MinMax Scaler on Iris dataset

In [24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.fit_transform(X_test)

print(X_train[0:5])
print(X_train_norm[0:5])

[[5.2 3.4 1.4 0.2]
 [6.4 2.8 5.6 2.1]
 [7.7 2.6 6.9 2.3]
 [6.9 3.1 5.1 2.3]
 [7.7 2.8 6.7 2. ]]
[[0.25       0.63636364 0.05172414 0.04166667]
 [0.58333333 0.36363636 0.77586207 0.83333333]
 [0.94444444 0.27272727 1.         0.91666667]
 [0.72222222 0.5        0.68965517 0.91666667]
 [0.94444444 0.36363636 0.96551724 0.79166667]]


In [25]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

mms = StandardScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.fit_transform(X_test)

print(X_train[0:5])
print(X_train_norm[0:5])

[[5.1 3.8 1.9 0.4]
 [7.2 3.6 6.1 2.5]
 [5.7 4.4 1.5 0.4]
 [4.9 3.6 1.4 0.1]
 [5.7 3.8 1.7 0.3]]
[[-0.896638    1.56300584 -0.98787809 -0.98148768]
 [ 1.55670376  1.11854921  1.2970717   1.64693841]
 [-0.19568321  2.89637576 -1.20549236 -0.98148768]
 [-1.1302896   1.11854921 -1.25989593 -1.35697712]
 [-0.19568321  1.56300584 -1.09668523 -1.10665083]]


# Side by side comparison of all scaling methodologies

In [31]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

x = pd.DataFrame({
    'x1': np.concatenate([np.random.normal(20,2,1000), np.random.normal(1,2,25)]),
    'x2': np.concatenate([np.random.normal(30,2,1000), np.random.normal(50,2,25)]),
})

scaler = preprocessing.MinMaxScaler()
mms_df = scaler.fit_transform(x)
mms_df = pd.DataFrame(mms_df, columns=['x1', 'x2'])

scaler = preprocessing.StandardScaler()
std_df = scaler.fit_transform(x)
std_df = pd.DataFrame(std_df, columns=['x1', 'x2'])

scaler = preprocessing.RobustScaler()
robust_df = scaler.fit_transform(x)
robust_df = pd.DataFrame(robust_df, columns=['x1', 'x2'])





Unnamed: 0,x1,x2
0,0.339496,1.660856
1,-0.014726,-0.256845
2,0.456811,0.483052
3,1.143906,-0.349368
4,0.627544,-1.097197
...,...,...
1020,-6.685785,7.362535
1021,-6.268173,5.714901
1022,-6.858579,8.770293
1023,-6.765236,7.642710
