In [7]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm 
from sklearn.metrics import r2_score
import pickle

In [8]:
df =pd.read_csv('Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [9]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [10]:
df.drop(columns=['Serial No.'],inplace=True)

In [11]:
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


## Splitting Features into Dependent & Independent Features, Data into Train, Test

In [12]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [13]:
x.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.0,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0


In [14]:
y.head()

0    0.92
1    0.76
2    0.72
3    0.80
4    0.65
Name: Chance of Admit , dtype: float64

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=45)

## Standardizing the Dependent Features data using Z-Transform

In [16]:
scalar=StandardScaler()
x_train=scalar.fit_transform(x_train)
x_test=scalar.transform(x_test)

In [17]:
pickle.dump(scalar,open("scaling_model.pkl","wb"))

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df=pd.DataFrame()
vif_df['vif']=[variance_inflation_factor(x_train,i) for i in range(x_train.shape[1])]
vif_df['feature']=x.columns
vif_df

Unnamed: 0,vif,feature
0,4.609846,GRE Score
1,4.457171,TOEFL Score
2,3.071257,University Rating
3,3.093621,SOP
4,2.576956,LOR
5,5.16077,CGPA
6,1.505983,Research


In [19]:
linear=LinearRegression()
linear.fit(x_train,y_train)

In [20]:
predict_1=linear.predict(x_test)
predict_1=predict_1.reshape(-1,1)
predict_1

array([[0.69264093],
       [0.51204317],
       [0.73193591],
       [0.96833792],
       [0.6961389 ],
       [0.80896423],
       [0.65401816],
       [0.74222942],
       [0.61723538],
       [0.84045399],
       [0.62270716],
       [0.87367668],
       [0.69061046],
       [0.98122216],
       [0.68338436],
       [0.6352006 ],
       [0.49532814],
       [0.96244479],
       [0.77166362],
       [0.89307479],
       [0.74086036],
       [0.68281416],
       [0.55813106],
       [0.65442459],
       [0.95133775],
       [0.87608186],
       [0.57006913],
       [0.49634946],
       [0.70901898],
       [0.66798761],
       [0.82706863],
       [0.85591726],
       [0.96954803],
       [0.85022733],
       [0.77936033],
       [0.60890881],
       [0.73474673],
       [0.78925785],
       [0.63580307],
       [0.47095921],
       [0.6289702 ],
       [0.70279647],
       [0.78547459],
       [0.58334233],
       [0.71191383],
       [0.77617327],
       [0.72232157],
       [0.723

In [21]:
linear.score(x_test,y_test)

0.8049825387312869

In [22]:
## Adjusted R2_Score

def adjr2_score(x,y):
    r2=linear.score(x,y)
    n=x.shape[0]
    p=x.shape[1]
    adj = 1-(1-r2)*(n-1)/(n-p-1)
    return adj

In [23]:
adjr2_score(x_test,y_test)

0.7860225077746065

In [24]:
linear.intercept_

0.7250624999999998

In [25]:
linear.coef_

array([ 0.02451747,  0.01322677,  0.00614046, -0.00127564,  0.01778   ,
        0.07352879,  0.01144109])

## Regularization

In [26]:
lassocv=LassoCV(alphas=None,cv=100,max_iter=200000)
lassocv.fit(x_train,y_train)

In [27]:
lassocv.alpha_

0.0005494605103436922

## We are able to calculate the hyperparameter alpha by performing cross validation 

In [28]:
lasso_l1=Lasso(alpha=lassocv.alpha_)
lasso_l1.fit(x_train,y_train)


In [29]:
lasso_l1.score(x_test,y_test)

0.8045032883955116

In [30]:
ridgecv=RidgeCV(alphas=np.random.uniform(0,10,50),cv=50)
ridgecv.fit(x_train,y_train)

In [31]:
ridgecv.alpha_

4.2749536502437895

In [32]:
ridge_l2=Ridge(alpha=ridgecv.alpha_)
ridge_l2.fit(x_train,y_train)

In [33]:
ridge_l2.score(x_test,y_test)

0.805628249780225

In [34]:
elasticcv=ElasticNetCV(alphas=None,cv=20)
elasticcv.fit(x_train,y_train)

In [35]:
elasticcv.alpha_

0.0013548011972587893

In [36]:
elasticcv.l1_ratio_

0.5

In [37]:
elastic_lr=ElasticNet(alpha=elasticcv.alpha_,l1_ratio=elasticcv.l1_ratio_)
elastic_lr.fit(x_train,y_train)


In [38]:
elastic_lr.score(x_test,y_test)

0.8045808179493981

In [39]:
pickle.dump(ridge_l2, open("ridge_regression_model.pkl","wb"))