In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from scipy.stats import norm, skew, ttest_ind, f_oneway
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC as SVR
from sklearn.ensemble import VotingRegressor
from sklearn.neural_network import MLPRegressor

In [117]:
from google.colab import files
uploaded = files.upload()

Saving Admission_Dataset.csv to Admission_Dataset (2).csv


In [118]:
df= pd.read_csv('Admission_Dataset.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [119]:
df = df.drop('Serial No.', axis=1)

In [120]:
df.shape

(500, 8)

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          500 non-null    int64  
 1   TOEFL Score        500 non-null    int64  
 2   University Rating  500 non-null    int64  
 3   SOP                500 non-null    float64
 4   LOR                500 non-null    float64
 5   CGPA               500 non-null    float64
 6   Research           500 non-null    int64  
 7   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB


In [122]:
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [123]:
df.isnull().sum()

Unnamed: 0,0
GRE Score,0
TOEFL Score,0
University Rating,0
SOP,0
LOR,0
CGPA,0
Research,0
Chance of Admit,0


In [124]:
df.dropna(inplace=True)

In [125]:
print(df.isnull().sum())

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64



### **DATA PREROCESSING**

In [126]:
numerical_columns =  [ 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research','Chance of Admit ']

REMOVING OUTLIERS

In [127]:
def remove_outliers(df,columns ):       # Outlier detection of the Numeric columns
  for col in columns:

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

  return df

df = remove_outliers(df , numerical_columns)

NORMALIZE NUMERIC FEATURES

In [128]:
def normalize(df,columns):
  pt=PowerTransformer(method='yeo-johnson')
  for col in columns:
    skewness=skew(df[col])
    if abs(skewness)>0.5:
        df[col]=pt.fit_transform(df[col].values.reshape(-1,1))
    return df

data = normalize(df,numerical_columns)

STANDARD SCALER IN NUMERICAL FEATURES

In [129]:
# Scale numerical features
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [130]:
X = df.drop('Chance of Admit ',axis=1)
y = df['Chance of Admit ']

In [131]:
models = {
    'Linear Regression': LinearRegression(),
    'KNN Regressor': KNeighborsRegressor(),
    'Random forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'GD Boosting': GradientBoostingRegressor(),
    'MLP Regressor': MLPRegressor(max_iter=50)
}

In [132]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [133]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

results = []

for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    results.append({
        'Model': model_name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2_score': r2_score(y_test, y_pred),

    })

df_results = pd.DataFrame(results)

df_results.sort_values(by='R2_score', ascending=False, inplace=True)
df_results



Unnamed: 0,Model,MAE,MSE,R2_score
0,Linear Regression,0.31473,0.191054,0.795858
5,MLP Regressor,0.326647,0.20562,0.780295
2,Random forest,0.331278,0.207208,0.778598
4,GD Boosting,0.348949,0.221318,0.763522
1,KNN Regressor,0.363912,0.243977,0.73931
3,Decision Tree,0.454349,0.39591,0.576969


In [134]:
x_train.shape

(397, 7)

In [135]:
y_train.shape

(397,)

In [136]:
lr=LinearRegression()
dt=DecisionTreeRegressor()
rf=RandomForestRegressor()
GD=GradientBoostingRegressor()

In [137]:
estimators = [('lr',lr),('dt',dt),('rf',rf),('GD',GD)]

In [138]:
from sklearn.model_selection import cross_val_score

In [139]:
X = df.drop('Chance of Admit ',axis=1)
y = df['Chance of Admit ']

In [140]:
for estimator in estimators:
    scores = cross_val_score(estimator[1], X, y, scoring='r2',cv=10)
    print(estimator[0], np.round(np.mean(scores), 2))

lr 0.81
dt 0.58
rf 0.77
GD 0.77


In [141]:
from sklearn.ensemble import VotingRegressor

### **HARD VOTING**

In [142]:
vc=VotingRegressor(estimators=estimators)
do = cross_val_score(vc,X,y,cv=10,scoring='r2')
print('Voting Regressor',np.round(np.mean(do),2))

Voting Regressor 0.79


### **WEGHTED VOTING**

in this voting we assign a cetain number of weights to our model

In [143]:
for i in range(1,3):
  for j in range(1,3):
      for k in range(1,3):
        for l in range(1,3):
          Vr=VotingRegressor(estimators,weights=[i,j,k,l])
          sew = cross_val_score(Vr,X,y,scoring='r2',cv=10)
          print(f'for i={i},j={j},k={k},l={l}'.format(i,j,k),np.round(np.mean(sew),2))

for i=1,j=1,k=1,l=1 0.78
for i=1,j=1,k=1,l=2 0.79
for i=1,j=1,k=2,l=1 0.78
for i=1,j=1,k=2,l=2 0.78
for i=1,j=2,k=1,l=1 0.76
for i=1,j=2,k=1,l=2 0.77
for i=1,j=2,k=2,l=1 0.76
for i=1,j=2,k=2,l=2 0.77
for i=2,j=1,k=1,l=1 0.79
for i=2,j=1,k=1,l=2 0.79
for i=2,j=1,k=2,l=1 0.8
for i=2,j=1,k=2,l=2 0.79
for i=2,j=2,k=1,l=1 0.78
for i=2,j=2,k=1,l=2 0.78
for i=2,j=2,k=2,l=1 0.78
for i=2,j=2,k=2,l=2 0.78


USING THE SAME ALGORITHM

DECISIONTREE REGRESSOR

In [144]:
dt1=DecisionTreeRegressor(max_depth=1)
dt2=DecisionTreeRegressor(max_depth=3)
dt3=DecisionTreeRegressor(max_depth=6)
dt4=DecisionTreeRegressor(max_depth=9)

In [145]:
estimators=[('dt1',dt1),('dt2',dt2),('dt3',dt3),('dt4',dt4)]

In [146]:
for estimator in estimators:
    scores = cross_val_score(estimator[1], X, y, scoring='r2',cv=10)
    print(estimator[0], np.round(np.mean(scores), 2))

dt1 0.47
dt2 0.73
dt3 0.7
dt4 0.6


LINEAR REGRESSION

In [147]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

lr1 = make_pipeline(PolynomialFeatures(degree=1),LinearRegression())
lr2 = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
lr3 = make_pipeline(PolynomialFeatures(degree=3),LinearRegression())
lr4 = make_pipeline(PolynomialFeatures(degree=4),LinearRegression())

In [148]:
estimators=[('lr1',lr1),('lr2',lr2),('lr3',lr3),('lr4',lr4)]

In [149]:
for estimator in estimators:
    scores = cross_val_score(estimator[1], X, y, scoring='r2',cv=10)
    print(estimator[0], np.round(np.mean(scores), 2))

lr1 0.81
lr2 0.79
lr3 0.64
lr4 -9.51


### **HARD VOTING**

In [152]:
vc=VotingRegressor(estimators=estimators)
do = cross_val_score(vc,X,y,cv=10,scoring='r2')
print('Voting Regressor',np.round(np.mean(do),2))

Voting Regressor 0.12


### **WEIGHTED VOTING**

In [153]:
for i in range(1,3):
  for j in range(1,3):
      for k in range(1,3):
        for l in range(1,3):
          Vr=VotingRegressor(estimators,weights=[i,j,k,l])
          sew = cross_val_score(Vr,X,y,scoring='r2',cv=10)
          print(f'for i={i},j={j},k={k},l={l}'.format(i,j,k),np.round(np.mean(sew),2))

for i=1,j=1,k=1,l=1 0.12
for i=1,j=1,k=1,l=2 -0.89
for i=1,j=1,k=2,l=1 0.35
for i=1,j=1,k=2,l=2 -0.39
for i=1,j=2,k=1,l=1 0.35
for i=1,j=2,k=1,l=2 -0.39
for i=1,j=2,k=2,l=1 0.48
for i=1,j=2,k=2,l=2 -0.08
for i=2,j=1,k=1,l=1 0.36
for i=2,j=1,k=1,l=2 -0.38
for i=2,j=1,k=2,l=1 0.48
for i=2,j=1,k=2,l=2 -0.08
for i=2,j=2,k=1,l=1 0.49
for i=2,j=2,k=1,l=2 -0.08
for i=2,j=2,k=2,l=1 0.56
for i=2,j=2,k=2,l=2 0.12
