In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load and prepare the data

A critical step in working with neural networks is preparing the data correctly. Variables on different scales make it difficult for the network to efficiently learn the correct weights. Below, we've written the code to load and prepare the data.

In [2]:
data_path = 'Dataset/Admission_Predict_Ver1.1.csv'

admissions = pd.read_csv(data_path)

In [3]:
admissions.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [4]:
admissions.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


## Checking out the data
This dataset has the admission percentage for a university based on various factors like GRE Score, TOEFL Score, University Rankig, SOP, LOR, CGPA and Research. 
Below is a plot showing the number of students getting admitted just basedon the university ranking

In [None]:
data_rank1 = admissions.loc[(admissions["University Rating"]==1)]
data_rank1 = admissions[admissions["University Rating"]==1]
data_rank2 = admissions.loc[(admissions["University Rating"]==2)]
data_rank3 = admissions.loc[(admissions["University Rating"]==3)]
data_rank4 = admissions.loc[(admissions["University Rating"]==4)]
data_rank5 = admissions.loc[(admissions["University Rating"]==5)]

In [None]:
data_rank1.plot.scatter(x='GRE Score', y='Chance of Admit ',)

In [None]:
data_rank2.plot.scatter(x='GRE Score', y='Chance of Admit ', color='red')

In [None]:
data_rank3.plot.scatter(x='GRE Score', y='Chance of Admit ')

In [None]:
data_rank4.plot.scatter(x='GRE Score', y='Chance of Admit ')

In [None]:
data_rank5.plot.scatter(x='GRE Score', y='Chance of Admit ')

### One hot encoding
Here we have some categorical variables like University Rankig and Research. To include these in our model, we'll need to make binary dummy variables (or do one-hot encoding). This is simple to do with Pandas thanks to `get_dummies()`.

In [6]:
dummy_fields = ['University Rating', 'Research']
one_hot_admissions = admissions[:]
for each in dummy_fields:
    dummies = pd.get_dummies(one_hot_admissions[each], prefix=each, drop_first=False)
    one_hot_admissions = pd.concat([one_hot_admissions, dummies], axis=1)

to_be_dropped = ['University Rating', 'Research', 'Serial No.']
one_hot_admissions = one_hot_admissions.drop(to_be_dropped, axis=1)
one_hot_admissions.head()

Unnamed: 0,GRE Score,TOEFL Score,SOP,LOR,CGPA,Chance of Admit,University Rating_1,University Rating_2,University Rating_3,University Rating_4,University Rating_5,Research_0,Research_1
0,337,118,4.5,4.5,9.65,0.92,0,0,0,1,0,0,1
1,324,107,4.0,4.5,8.87,0.76,0,0,0,1,0,0,1
2,316,104,3.0,3.5,8.0,0.72,0,0,1,0,0,0,1
3,322,110,3.5,2.5,8.67,0.8,0,0,1,0,0,0,1
4,314,103,2.0,3.0,8.21,0.65,0,1,0,0,0,1,0


### Normalizing the variables
We could normalize variables like GRE, TOEFL, SOP, LOR and CGPA

In [7]:
data_to_be_normalized = ['GRE Score', 'TOEFL Score', 'SOP', 'LOR ', 'CGPA']

processed_data = one_hot_admissions[:]

for each in data_to_be_normalized:
    processed_data[each] /= processed_data[each].max()
processed_data.head()

Unnamed: 0,GRE Score,TOEFL Score,SOP,LOR,CGPA,Chance of Admit,University Rating_1,University Rating_2,University Rating_3,University Rating_4,University Rating_5,Research_0,Research_1
0,0.991176,0.983333,0.9,0.9,0.972782,0.92,0,0,0,1,0,0,1
1,0.952941,0.891667,0.8,0.9,0.894153,0.76,0,0,0,1,0,0,1
2,0.929412,0.866667,0.6,0.7,0.806452,0.72,0,0,1,0,0,0,1
3,0.947059,0.916667,0.7,0.5,0.873992,0.8,0,0,1,0,0,0,1
4,0.923529,0.858333,0.4,0.6,0.827621,0.65,0,1,0,0,0,1,0


### Splitting the data into training, testing, and validation sets

We'll save the data for the last approximately 10% to use as a test set after we've trained the network. We'll use this set to make predictions and compare them with the actual percentage of admissions.

In [8]:
sample = np.random.choice(processed_data.index, size=int(len(processed_data)*0.9), replace=False)
train_data, test_data = processed_data.iloc[sample], processed_data.drop(sample)

print("Number of training samples is", len(train_data))
print("Number of testing samples is", len(test_data))
print(train_data.shape[1])
#print(test_data[:10])

Number of training samples is 450
Number of testing samples is 50
13


### Splitting the data into features and targets

In [9]:
list_ = list(train_data.columns)
list_.remove('Chance of Admit ')
#print("list: ", list_)
#train_targets = train_data['Chance of Admit ']
train_features = train_data.drop('Chance of Admit ', axis=1)
train_targets = train_data['Chance of Admit '].values
#train_targets = train_data.drop(list_, axis=1)
test_features = test_data.drop('Chance of Admit ', axis=1)
#test_targets = test_data.drop(list_, axis=1)
test_targets =  test_data['Chance of Admit '].values

print("ytsin: ", train_features[:5])

ytsin:       GRE Score  TOEFL Score  SOP  LOR       CGPA  University Rating_1  \
76    0.961765     0.933333  0.6   0.6  0.879032                    0   
475   0.882353     0.841667  0.7   0.5  0.794355                    0   
325   0.958824     0.966667  0.7   0.8  0.921371                    0   
109   0.894118     0.858333  1.0   0.8  0.870968                    0   
345   0.929412     0.816667  0.3   0.4  0.748992                    1   

     University Rating_2  University Rating_3  University Rating_4  \
76                     0                    1                    0   
475                    0                    1                    0   
325                    0                    1                    0   
109                    0                    0                    0   
345                    0                    0                    0   

     University Rating_5  Research_0  Research_1  
76                     0           0           1  
475                    0      

## Finally we have prepared our data. Now it's time to train it with neural nets !!! 

In [10]:
# Imports
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import np_utils

# Building the model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(train_features.shape[1],)))
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(loss = 'mean_squared_error', optimizer='adam')
model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                832       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 897
Trainable params: 897
Non-trainable params: 0
_________________________________________________________________


## Training the model

In [11]:
# Training the model
model.fit(train_features, train_targets, validation_split=0.2, epochs=50, batch_size=8, verbose=0)

<keras.callbacks.History at 0xb2da63f98>

## Evaluating the model


In [12]:
# Evaluating the model on the training and testing set
score = model.evaluate(train_features, train_targets)
print("score: ", score)
print("\n Training Accuracy:", score)
score = model.evaluate(test_features, test_targets)
print("score: ", score)
print("\n Testing Accuracy:", score)

score:  0.0062895193861590495

 Training Accuracy: 0.0062895193861590495
score:  0.008427593410015106

 Testing Accuracy: 0.008427593410015106
