# Using Scaling - Normalization and Standardization

### Load libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Load data from admissions.csv 

In [2]:
df = pd.read_csv("admission.csv")

### Build Machine Learning Model 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
df.describe()

Unnamed: 0,Sno,Gre,Toefl,Rating,Sop,Lor,Cgpa,Research,Chance
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [5]:
X = df[['Gre','Toefl','Cgpa']]
y = df['Chance'] * 100

In [6]:
# Split data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

## Standardization of DataSet

In [8]:
X = df[['Gre','Toefl','Cgpa']]
y = df['Chance'] * 100

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
ss = StandardScaler()

In [10]:
# Split data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [11]:
# Scale Train data 
X_train_scaled = ss.fit_transform(X_train)

In [12]:
X_train_scaled.shape

(400, 3)

In [13]:
X_train.iloc[:10,0]

107    338
336    319
71     336
474    308
6      321
412    314
113    320
236    325
299    305
155    312
Name: Gre, dtype: int64

In [14]:
X_train_scaled[:10,0]

array([ 1.87138145,  0.19003625,  1.69439775, -0.78337412,  0.36701996,
       -0.25242301,  0.27852811,  0.72098737, -1.04884968, -0.42940671])

In [20]:
X_train_scaled[:10,1]

array([ 1.56490551,  0.43325896,  0.75658655, -0.37506   ,  0.27159517,
       -0.86005137,  0.43325896,  0.75658655,  0.75658655,  0.27159517])

In [21]:
print(f"{X_train_scaled[:,0].mean():f} {X_train_scaled[:,0].std()}")  # Scaled data has 0 mean and 1 std

-0.000000 1.0


In [22]:
print(f"{X_train_scaled[:,1].mean():f} {X_train_scaled[:,1].std()}")  # Scaled data has 0 mean and 1 std

0.000000 1.0


In [15]:
## Fit model or Training Model
model = LinearRegression()
model.fit(X_train_scaled,y_train)

In [16]:
model.score(X_train_scaled,y_train)

0.8122343451664386

In [17]:
ss.mean_, ss.var_

(array([316.8525 , 107.32   ,   8.60215]),
 array([127.70074375,  38.2626    ,   0.37071688]))

In [19]:
model.coef_, model.intercept_

(array([2.85908017, 1.96637816, 8.67893264]), 72.655)

In [27]:
# Scale test data with same scale as train data
X_test_scaled = ss.transform(X_test)

In [28]:
y_pred = model.predict(X_test_scaled)

In [29]:
score = r2_score(y_test,y_pred)
print(f"R2 Score: {score:0.2f}")

R2 Score: 0.76


In [30]:
mse = mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error : {mse:0.2f}")

Mean Squared Error : 41.34


### Scaled vs. non-scaled data

In [31]:
scaled_df = pd.DataFrame(X_train_scaled)

In [32]:
scaled_df.head(10)

Unnamed: 0,0,1,2
0,1.871381,1.564906,1.408932
1,0.190036,0.433259,0.308525
2,1.694398,0.756587,1.901652
3,-0.783374,-0.37506,-1.071091
4,0.36702,0.271595,-0.660491
5,-0.252423,-0.860051,-1.186059
6,0.278528,0.433259,-0.069227
7,0.720987,0.756587,0.932637
8,-1.04885,0.756587,0.078589
9,-0.429407,0.271595,0.144285


In [33]:
X_train.head(10)

Unnamed: 0,Gre,Toefl,Cgpa
107,338,117,9.46
336,319,110,8.79
71,336,112,9.76
474,308,105,7.95
6,321,109,8.2
412,314,102,7.88
113,320,110,8.56
236,325,112,9.17
299,305,112,8.65
155,312,109,8.69


#### Scale input features

In [20]:
# Scale input features using StandardScaler used to transform train data 
chances = model.predict(ss.transform([[320,110,8.5],[310,115,9.2]]))



In [21]:
chances

array([72.84721094, 81.8846113 ])

In [22]:
# Non-scaled data is being used as input 
chances = model.predict([[320,110,8.5]])

In [23]:
chances

array([1277.63318018])