# PREDICT THE CHANCES OF GETTING AN ADMISSION

In [None]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the admissions dataset from the local CSV file and preview its first
# five rows.
df = pd.read_csv(filepath_or_buffer='./Files-main/Admission_Prediction.csv')
df.head(n=5)

understanding data at a higher level

In [None]:
# Summary statistics of the freshly loaded frame, before any cleaning step.
df.describe()

taking care of NaN (missing) values

In [None]:
# Fill missing values in one pass, with one replacement value per column:
#   - 'University Rating' is treated as categorical, so its most frequent
#     value (the mode) is used;
#   - 'TOEFL Score' and 'GRE Score' are filled with their column mean.
fill_values = {
    'University Rating': df['University Rating'].mode()[0],
    'TOEFL Score': df['TOEFL Score'].mean(),
    'GRE Score': df['GRE Score'].mean(),
}
df = df.fillna(value=fill_values)

In [None]:
# Re-check the summary statistics now that the missing values have been filled.
df.describe()

dropping unwanted columns

In [None]:
# Remove the 'Serial No.' column, which is not wanted for the analysis.
# The result is reassigned to `df` instead of using `inplace=True`, and
# `errors='ignore'` is passed, so this cell can be re-run safely: a second run
# finds the column already gone and does nothing instead of raising KeyError.
df = df.drop(columns=['Serial No.'], errors='ignore')

In [None]:
# Preview the first rows to confirm which columns remain in the frame.
df.head()

In [None]:
# Suppress every warning from here on so library notices do not clutter the
# cell output. (`warnings` is imported with the other modules at the top.)
# NOTE(review): a blanket "ignore" also hides warnings that may point at real
# problems; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

visualizing the data 

In [None]:
# Draw the distribution of each column of `df` (at most the first 8) on a
# 2x4 grid. `sns.distplot` is deprecated, so `sns.histplot` with a KDE overlay
# and density scaling is used to produce a comparable density histogram.
fig, axes = plt.subplots(2, 4, figsize=(20, 20), facecolor='white')
panels = axes.ravel()
for ax, column in zip(panels, df.columns):
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=20)
# Hide grid cells that stay empty when `df` has fewer than 8 columns.
for ax in panels[len(df.columns):]:
    ax.set_visible(False)
fig.tight_layout()

the data distributions look good and none of them is noticeably skewed.
now to observe the relationship between the independent variables and the dependent variable

In [None]:
# Separate the frame into the label and the features:
#   y -> the 'Chance of Admit' column (what the model predicts)
#   x -> every other column (what the model predicts from)
y = df.loc[:, 'Chance of Admit']
x = df.loc[:, df.columns != 'Chance of Admit']

In [None]:
# Display the label series to verify the split.
y

In [None]:
# Display the feature frame to verify the label column is no longer in it.
x

In [None]:
# One scatter panel per feature (at most the first 8) against the label,
# laid out on a 4x2 grid; panels are created only for features that exist.
fig = plt.figure(figsize=(20, 35), facecolor='white')
for position, column in enumerate(list(x)[:8], start=1):
    panel = fig.add_subplot(4, 2, position)
    panel.scatter(x[column], y)
    panel.set_xlabel(column, fontsize=20)
    panel.set_ylabel('Chance of Admit', fontsize=20)
fig.tight_layout()

data scaling

In [None]:
# Standardise the feature columns of `x`. The fitted `scaler` object is kept
# because later cells reuse it to transform new rows before predicting.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

split the data to train and test

In [None]:
# Hold out 25% of the rows for testing (fixed seed for reproducibility).
# The split is made on the unscaled features and `scaler` is then (re)fitted
# on the training rows only, so the test rows no longer influence the
# statistics used for standardisation (avoids train/test leakage). The same
# fitted `scaler` is what later cells use to transform new rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=2435324
)
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
y_train.head()

model instantiating and training

In [None]:
# Create the linear regression model and fit it on the training split in a
# single step (`fit` returns the estimator itself); the trailing expression
# displays the fitted estimator as the cell output.
regression = LinearRegression().fit(x_train, y_train)
regression

predicting the chance of admission with the given features

In [None]:
# Show the last two rows of the dataset -- presumably as reference values for
# the hand-written sample predicted in the next cell (TODO confirm).
df.tail(2)

In [None]:
# Predict for one hand-written applicant. The values must be given in the same
# column order as `x`, and they go through the same `scaler` as the training
# data before being passed to the model.
sample = [[327.0, 113.0, 4.0, 4.5, 4.5, 9.04, 0]]
sample_scaled = scaler.transform(sample)
print('Chance of admission is: ', regression.predict(sample_scaled))

saving the model

In [None]:
# Persist the trained model to disk, load it back, and check that the reloaded
# model can still predict. `with` blocks guarantee both file handles are
# closed (the bare `open(...)` calls used before never closed them).
# NOTE(review): only the regression model is saved here; the fitted `scaler`
# is not, yet inputs must be scaled with it before calling `predict`.
# NOTE: only unpickle files you trust -- `pickle.load` can execute arbitrary code.
filename = 'admission_chance_model.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(regression, model_file)
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
a = loaded_model.predict(scaler.transform([[314, 103, 2, 2, 3, 8.21, 0]]))
a

check how well the model fits the training data

In [None]:
# R^2 (coefficient of determination) on the training split.
# Note: `LinearRegression.score` returns the plain R^2, not the adjusted R^2.
regression.score(x_train,y_train)

check how well the model fits the test data

In [None]:
# R^2 on the held-out test split; compare with the training score above to
# judge how well the model generalises.
regression.score(x_test,y_test)

plotting and visualizing

In [None]:
# Predict the chance of admission for every row of the held-out test split.
y_pred= regression.predict(x_test)

In [None]:
# Preview the first few predictions instead of dumping the whole array.
y_pred[:10]

In [None]:
# Scatter the model's predictions against the true test labels; points close
# to the diagonal indicate accurate predictions.
plt.scatter(y_test, y_pred)
plt.xlabel('actual chance of admission')
plt.ylabel('predicted chance of admission')
plt.title('actual vs model predicted')
# `plt.show` must be called: the bare name only evaluates to the function
# object and never renders the figure explicitly.
plt.show()

model evaluation