In [None]:
#import libraries

import warnings; warnings.simplefilter('ignore')

import pandas as pd
import numpy as np 
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
from time import time

import sklearn

# Regression /Classification
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

#Building everything
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches for every later plot.
plt.rcParams['figure.figsize'] = (8, 6)
# Optional global styling, currently disabled:
#plt.rcParams['font.size'] = 10
#plt.style.use("fivethirtyeight")



In [None]:
# Load the admissions CSV into a DataFrame.
csv_path = '../input/Admission_Predict_Ver1.1.csv'
data = pd.read_csv(csv_path)
# Preview the first five rows.
data.head(n=5)

In [None]:
# Remove the row-id column; it carries no predictive information.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
data = data.drop(columns='Serial No.', errors='ignore')
# Print the first 5 rows of the data.
data.head()

In [None]:
# Count missing values per column (the original author notes the data set has none).
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
# Correlation measures whether, and how strongly, two variables move together.
fig, ax = plt.subplots(figsize=(15, 7))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", ax=ax);

In [None]:
# GRE, TOEFL and CGPA look like the three factors that mainly drive the
# chance of admission, e.g.:
#X=data[['GRE Score','TOEFL Score','CGPA']]

# ...but with a data set this small it is better to keep every feature.
# Features are all columns except the last; the target is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]


In [None]:
# Standardise every feature to zero mean / unit variance.
# StandardScaler is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): the scaler is fit on ALL rows before any train/test split, so
# test-set statistics leak into the training features. Fitting it inside a
# pipeline per split would avoid that; left as-is because the cells below
# consume the pre-scaled X.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# "Chance of Admit" is a continuous value in [0, 1], so this is solved as a
# regression problem. Linear regression is the natural first choice, but all
# four regressors below are benchmarked for comparison.
#
# For every model and every held-out fraction, the model is trained `rounds`
# times on different random splits; the mean test score and the total wall
# time of those rounds are appended to accuracy_arr / time_arr
# (model-major order: 4 entries per model, one per held-out fraction).
# For regressors, .score() is the R^2 coefficient, not a classification accuracy.

heldout = [0.90, 0.80, 0.70, 0.60]  # fraction of rows held out for TESTING
time_arr = []
accuracy_arr = []
rounds = 50

print("Regressor Techniques")
# Kept under the name `classifiers` for consistency with the classification
# cell; random_state is pinned on the stochastic models for reproducibility.
classifiers = [
    ("Linear Regression", LinearRegression(), "red"),
    ("Random Forest Regression", RandomForestRegressor(random_state=42), "blue"),
    ("Decision Tree Regression", DecisionTreeRegressor(random_state=42), "green"),
    ("KNN Regression", KNeighborsRegressor(), "yellow"),
]

# Training fraction matching each held-out fraction (x-axis of the plots below).
xx = 1. - np.array(heldout)

for name, clf, color in classifiers:
    print("Training %s" % name)

    for i in heldout:
        t0 = time()
        round_scores = []
        for r in range(rounds):
            # test_size (not train_size): `i` is the held-out fraction, so the
            # model really is trained on the 1 - i share that `xx` reports.
            # Seeding with the round index makes every round a different yet
            # reproducible split; a constant seed would repeat one split 50x.
            X_train, X_test, y_train, y_test = \
                train_test_split(X, y, test_size=i, random_state=r)

            clf.fit(X_train, y_train)
            round_scores.append(clf.score(X_test, y_test))

        time_ = time() - t0
        mean_score = np.mean(round_scores)
        time_arr.append(time_)
        accuracy_arr.append(mean_score)
        print('Testing Accuracy: %f\tTime: %.2fs' % (mean_score, time_))


In [None]:
## Accuracy comparison graph
# accuracy_arr is filled model by model with one entry per training fraction
# in xx, so each of the four models owns a contiguous slice of len(xx) values.
n_points = len(xx)
lr, rf, dt, knnr = (accuracy_arr[k * n_points:(k + 1) * n_points] for k in range(4))

acc = [("LR", lr, "red"),
       ("RF", rf, "blue"),
       ("DT", dt, "green"),
       ("KNNR", knnr, "yellow")]

for name, scores, color in acc:
    # Scores are fractions; plot them as percentages.
    plt.plot(xx, [s * 100 for s in scores], label=name, color=color)

# Build one tick label per x position directly from xx. A hand-written label
# list of a different length makes plt.xticks raise on current matplotlib and
# mislabels the axis on older versions.
my_xticks = ["%.0f%%" % (frac * 100) for frac in xx]
plt.xticks(xx, my_xticks)
plt.legend(bbox_to_anchor=(1, 1),
           bbox_transform=plt.gcf().transFigure)
plt.xlabel("Training data %")
plt.ylabel("Accuracy %")
plt.title("Accuracy Comparison of Classifiers without DR")
plt.show()

In [None]:
# Time comparison graph
# time_arr is filled model by model with one entry per training fraction in
# xx, so each of the four models owns a contiguous slice of len(xx) values.
n_points = len(xx)
lr, rf, dt, knnr = (time_arr[k * n_points:(k + 1) * n_points] for k in range(4))

t = [("LR", lr, "red"),
     ("RF", rf, "blue"),
     ("DT", dt, "green"),
     ("KNNR", knnr, "yellow")]

for name, seconds, color in t:
    plt.plot(xx, seconds, label=name, color=color)

# Build one tick label per x position directly from xx. A hand-written label
# list of a different length makes plt.xticks raise on current matplotlib and
# mislabels the axis on older versions.
my_xticks = ["%.0f%%" % (frac * 100) for frac in xx]
plt.xticks(xx, my_xticks)
plt.legend(bbox_to_anchor=(1, 1),
           bbox_transform=plt.gcf().transFigure)
plt.xlabel("Training data %")
plt.ylabel("Time in seconds")
plt.title("Time Comparison of Classifiers without DR")
plt.show()


In [None]:
# Fixed 70 % train / 30 % test partition (seeded) used for the final model.
split_parts = train_test_split(X, y, train_size=0.70, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Linear regression gave the best score for the least time in the benchmark,
# and for data like this it is advisable to start with something simple.
# Fit it on the 70/30 split as the baseline model.
## scikit-learn's four-step modeling pattern

# 1. Make an instance of the estimator.
# The `normalize` keyword was removed from LinearRegression in scikit-learn
# 1.2 (passing it raises TypeError). Its old default was False, and the
# features are already standardised, so the default constructor is equivalent.
lr = LinearRegression()
# 2. Fit the model with data (aka "model training").
lr.fit(X_train, y_train)

# 3. Predict on the held-out rows.
y_pred = lr.predict(X_test)
# 4. Evaluate: .score() returns R^2 on the test split.
score = lr.score(X_test, y_test)

In [None]:
# Display the value returned by lr.score(X_test, y_test) in the previous cell.
score

In [None]:
# Recast the task as binary classification: label 1 when the chance of
# admission is strictly above 0.8, otherwise 0.
y_ = np.array([int(chance > 0.8) for chance in y])

In [None]:
# Classification benchmark on the binarised target y_.
# For every model and every held-out fraction (list `heldout` from the
# regression benchmark cell), the model is trained `rounds` times on different
# random splits; the mean test accuracy and the total wall time of those
# rounds are appended to accuracy_arr_c / time_arr_c (model-major order).
time_arr_c = []
accuracy_arr_c = []
rounds = 50

print("Classification")
# random_state is pinned on the stochastic models for reproducibility.
classifiers = [
    ("Logistic Regression", LogisticRegression(), "red"),
    ("Random Forest ", RandomForestClassifier(random_state=42), "blue"),
    ("Decision Tree", DecisionTreeClassifier(random_state=42), "green"),
    ("KNN", KNeighborsClassifier(), "yellow"),
]

# Training fraction matching each held-out fraction.
xx = 1. - np.array(heldout)

for name, clf, color in classifiers:
    print("Training %s" % name)

    for i in heldout:
        t0 = time()
        round_scores = []
        for r in range(rounds):
            # test_size (not train_size): `i` is the held-out fraction, so the
            # model really is trained on the 1 - i share that `xx` reports.
            # Seeding with the round index makes every round a different yet
            # reproducible split; a constant seed would repeat one split 50x.
            X_train, X_test, y_train, y_test = \
                train_test_split(X, y_, test_size=i, random_state=r)

            clf.fit(X_train, y_train)
            round_scores.append(clf.score(X_test, y_test))

        time_ = time() - t0
        mean_score = np.mean(round_scores)
        time_arr_c.append(time_)
        accuracy_arr_c.append(mean_score)
        print('Testing Accuracy: %f\tTime: %.2fs' % (mean_score, time_))
