### Standardization 

In [1]:
import numpy as np;
import pandas as pd;
import matplotlib as plt;

In [2]:
df = pd.read_csv("CSV_Files\Social_Network_Ads.csv");

In [3]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
df = df.iloc[:, 2:]

In [5]:
df

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


In [6]:
df.sample(5)

Unnamed: 0,Age,EstimatedSalary,Purchased
222,37,144000,1
51,18,44000,0
236,40,57000,0
368,38,71000,0
102,32,86000,0


In [7]:
df.describe()

Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


### Train Test Split

In [8]:
from sklearn.model_selection import train_test_split;

In [9]:
X_train,X_test,y_train, y_test = train_test_split(df.drop('Purchased', axis=1), df['Purchased'], test_size=0.3, random_state=0);

In [10]:
X_train

Unnamed: 0,Age,EstimatedSalary
92,26,15000
223,60,102000
234,38,112000
232,40,107000
377,42,53000
...,...,...
323,48,30000
192,29,43000
117,36,52000
47,27,54000


In [11]:
X_test

Unnamed: 0,Age,EstimatedSalary
132,30,87000
309,38,50000
341,35,75000
196,30,79000
246,35,50000
...,...,...
216,49,65000
259,45,131000
49,31,89000
238,46,82000


In [12]:
y_train

92     0
223    1
234    0
232    1
377    0
      ..
323    1
192    0
117    0
47     0
172    0
Name: Purchased, Length: 280, dtype: int64

In [13]:
y_test

132    0
309    0
341    0
196    0
246    0
      ..
216    0
259    1
49     0
238    0
343    1
Name: Purchased, Length: 120, dtype: int64

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((280, 2), (120, 2), (280,), (120,))

In [15]:
X_train.describe()

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,37.864286,69807.142857
std,10.218201,34641.201654
min,18.0,15000.0
25%,30.0,43000.0
50%,37.0,70500.0
75%,46.0,88000.0
max,60.0,150000.0


In [16]:
X_test.describe()

Unnamed: 0,Age,EstimatedSalary
count,120.0,120.0
mean,37.166667,69591.666667
std,11.104797,32933.724084
min,18.0,15000.0
25%,28.0,47000.0
50%,36.0,64500.0
75%,45.25,87000.0
max,60.0,150000.0


In [17]:
y_train.describe()

count    280.000000
mean       0.364286
std        0.482091
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Purchased, dtype: float64

In [18]:
y_test.describe()

count    120.000000
mean       0.341667
std        0.476257
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Purchased, dtype: float64

### StandardScaler
- StandardScaler object in sklearn perform the following operation on each element of features.
- X = X-(mean of the feature(column))/ Standard diviation
- In Standardization mean of the feature become 0 and standard diviation become 1 for all feature.

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Create a StandardScaler Object
scaler = StandardScaler();

# fit the scaler to the train set, it will learn parameter such as mean, std, etc.
scaler.fit(X_train);

# Transform train and test sets
X_train_scaled = scaler.transform(X_train);
X_test_scaled = scaler.transform(X_test);

In [21]:
# Applying scale to data from give us numpy as we must convert it into data from again if we need.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns);
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns);

In [22]:
X_train_scaled.sample(4)

Unnamed: 0,Age,EstimatedSalary
200,-0.476894,-1.122266
46,0.111345,-0.804156
217,0.209385,-0.37037
246,-0.967093,-0.948751


In [23]:
X_test_scaled.sample(4)

Unnamed: 0,Age,EstimatedSalary
83,-1.163172,-1.006589
44,0.503504,1.220177
66,-1.359252,0.555039
117,-0.672973,0.555039


In [24]:
# Applying the formula of StandardScale to an item of data frame manually (First item of age feature which is 26)

(X_train.iloc[0,0] - X_train.mean(axis=0)[0]) / X_train.std(axis=0)[0]

# Here you can see the return value after applying formula to the first item of age feature in X_train (not scaled yet) 
# dataframe is same as the first value of age in X_train_scaled (scaled) dataframe.

-1.1610934448305064

In [25]:
# Chicking what happen to mean and std after scaling

X_train_scaled.describe()

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,3.489272e-17,6.344132000000001e-17
std,1.001791,1.001791
min,-1.947491,-1.58497
25%,-0.7710131,-0.775237
50%,-0.08473441,0.02003677
75%,0.7976239,0.5261201
max,2.170181,2.319101


In [26]:
X_test_scaled.describe()

Unnamed: 0,Age,EstimatedSalary
count,120.0,120.0
mean,-0.068394,-0.006231
std,1.088712,0.952412
min,-1.947491,-1.58497
25%,-0.967093,-0.659561
50%,-0.182774,-0.153478
75%,0.724094,0.497201
max,2.170181,2.319101


### Effict of Scaling