### USING SCALING - NORMALIZATION AND STANDARDIZATION

### LOAD LIBRARIES

In [137]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### LOAD DATA FROM ADMISSION.CSV

In [138]:
df = pd.read_csv("admission.csv")

### BUILD MACHINE LEARNING MODEL

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [140]:
df.describe()

Unnamed: 0,Sno,Gre,Toefl,Rating,Sop,Lor,Cgpa,Research,Chance
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


### MODEL WITH NORMALIZATION

In [141]:
X = df[["Gre", "Toefl", "Cgpa"]]
y = df["Chance"]  * 100

In [142]:
# SPLIT DATA INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [143]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((400, 3), (100, 3), (400,), (100,))

In [144]:
# FIT MODEL OR TRAIN MODEL
model = LinearRegression(normalize = True) # MIN-MAX SCALE
model.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




LinearRegression(normalize=True)

In [145]:
model.coef_, model.intercept_

(array([ 0.2530053 ,  0.31789215, 14.25427517]), -164.24396080709764)

In [146]:
model.score(X_train, y_train)

0.8122343451664387

In [147]:
y_pred = model.predict(X_test)

In [148]:
from sklearn.metrics import mean_squared_error, r2_score

In [149]:
mse = mean_squared_error(y_pred, y_test)
print(f"Mean Squared Error : {mse : 0.3f}")

Mean Squared Error :  41.341


In [150]:
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error : {rmse : 0.3f}")

Root Mean Squared Error :  6.430


In [151]:
r2score = r2_score(y_pred, y_test)
print(f"R2Score : {r2score : 0.3f}")

R2Score :  0.719


### STANDARDIZATION OF DATASET

In [152]:
X = df[["Gre", "Toefl", "Cgpa"]]
y = df["Chance"] * 100

In [153]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [154]:
ss = StandardScaler()

In [155]:
# SPLIT DATA INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [156]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((400, 3), (100, 3), (400,), (100,))

In [157]:
# SCALE TRAIN DATASET
X_train_scaled = ss.fit_transform(X_train)

In [158]:
X_train_scaled.shape

(400, 3)

In [159]:
X_train.iloc[:10, 0] 

107    338
336    319
71     336
474    308
6      321
412    314
113    320
236    325
299    305
155    312
Name: Gre, dtype: int64

In [160]:
X_train_scaled[:10, 0]

array([ 1.87138145,  0.19003625,  1.69439775, -0.78337412,  0.36701996,
       -0.25242301,  0.27852811,  0.72098737, -1.04884968, -0.42940671])

In [161]:
X_train.iloc[:11, 1]

107    117
336    110
71     112
474    105
6      109
412    102
113    110
236    112
299    112
155    109
272     95
Name: Toefl, dtype: int64

In [162]:
X_train_scaled[:11, 1]

array([ 1.56490551,  0.43325896,  0.75658655, -0.37506   ,  0.27159517,
       -0.86005137,  0.43325896,  0.75658655,  0.75658655,  0.27159517,
       -1.99169792])

In [163]:
print(f"{X_train_scaled[:, 0].mean():f}, {X_train_scaled[:, 0].std()}")    # SCALED DATA HAS MEAN 0 AND STD 1

-0.000000, 1.0


In [164]:
print(f"{X_train_scaled[:, 1].mean():f}, {X_train_scaled[:, 1].std()}") 

0.000000, 1.0


In [165]:
print(f"{X_train_scaled[:10, 1].mean():f}, {X_train_scaled[:10, 1].std()}")

0.400926, 0.6294516290063882


In [166]:
print(f"{X_train_scaled[:11, 1].mean():f}, {X_train_scaled[:11, 1].std()}")

0.183415, 0.9128537157274579


In [167]:
# FIT MODEL OR TRAIN MODEL
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [168]:
model.score(X_train, y_train)

0.8122343451664387

In [169]:
ss.mean_, ss.var_

(array([316.8525 , 107.32   ,   8.60215]),
 array([127.70074375,  38.2626    ,   0.37071688]))

In [170]:
model.coef_

array([ 0.2530053 ,  0.31789215, 14.25427517])

In [171]:
model.coef_

array([ 0.2530053 ,  0.31789215, 14.25427517])

In [187]:
# SCALE TESTDATA WITH SAME SCALE AS TRAIN DATA
X_test_scaled = ss.transform(X_test)

In [188]:
y_pred = model.predict(X_test_scaled)



In [189]:
score = r2_score(y_test,y_pred)
print(f"R2 Score: {score:0.2f}")

R2 Score: -322.47


In [190]:
mse = mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error : {mse:0.2f}")

Mean Squared Error : 56493.15


### SCALED VS NON SCALED DATA

In [191]:
scaled_df = pd.DataFrame(X_train_scaled)

In [192]:
scaled_df

Unnamed: 0,0,1,2
0,1.871381,1.564906,1.408932
1,0.190036,0.433259,0.308525
2,1.694398,0.756587,1.901652
3,-0.783374,-0.375060,-1.071091
4,0.367020,0.271595,-0.660491
...,...,...,...
395,-1.048850,-0.860051,-0.693339
396,0.455512,1.079914,0.554885
397,-2.376227,-0.536724,-1.875867
398,1.959873,1.888233,1.803108


In [194]:
scaled_df.head(10)

Unnamed: 0,0,1,2
0,1.871381,1.564906,1.408932
1,0.190036,0.433259,0.308525
2,1.694398,0.756587,1.901652
3,-0.783374,-0.37506,-1.071091
4,0.36702,0.271595,-0.660491
5,-0.252423,-0.860051,-1.186059
6,0.278528,0.433259,-0.069227
7,0.720987,0.756587,0.932637
8,-1.04885,0.756587,0.078589
9,-0.429407,0.271595,0.144285


In [196]:
X_train.head(10)

Unnamed: 0,Gre,Toefl,Cgpa
107,338,117,9.46
336,319,110,8.79
71,336,112,9.76
474,308,105,7.95
6,321,109,8.2
412,314,102,7.88
113,320,110,8.56
236,325,112,9.17
299,305,112,8.65
155,312,109,8.69


### SCALE INPUT FEATURES

In [197]:
# Scale input features using StandardScaler used to transform train data 
chances = model.predict(ss.transform([[320,110,8.5],[310,115,9.2]]))



In [198]:
chances

array([-166.42721774, -150.00629785])

In [199]:
chances = model.predict([[320, 110, 8.5]])



In [200]:
chances

array([72.84721094])

In [201]:
chances

array([72.84721094])

In [203]:
chances = model.predict([[300, 110, 9.0], [320, 90, 9.5]])



In [204]:
chances

array([74.91424251, 80.74364311])