Import Libraries

In [1]:
from sklearn import datasets
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Import Dataset

In [2]:
df = pd.read_csv('original_data.csv')

Exploratory Data Analysis

In [3]:
df.shape

(53644, 26)

In [4]:
df.head()

Unnamed: 0,userName,major,researchExp,industryExp,specialization,toeflScore,program,department,toeflEssay,internExp,...,termAndYear,confPubs,ugCollege,gmatA,cgpa,gmatQ,cgpaScale,gmatV,univName,admit
0,143saf,Systems and Control,0,18,Robotics,112.0,MS,Instrumentation & Control,26.0,5.0,...,Fall - 2015,0.0,Dharamsinh Desai University,,8.5,,10,,Worcester Polytechnic Institute,1
1,7790ashish,Manufacturing Engineering,0,0,,,MS,0,,0.0,...,Fall - 2013,0.0,,,0.0,,0,,Worcester Polytechnic Institute,1
2,AB25,(MIS / MSIM / MSIS / MSIT),0,66,,94.0,MS,Computer Engineering,21.0,0.0,...,Fall - 2015,0.0,IET DAVV,,78.28,,100,,Worcester Polytechnic Institute,1
3,abhijitg,,0,0,,,,0,,0.0,...,,,,,0.0,,0,,Worcester Polytechnic Institute,1
4,abhijitgang,MIS,0,0,,81.0,MS,computer,,0.0,...,Fall - 2011,0.0,Pune University,,57.0,,100,,Worcester Polytechnic Institute,1


In [5]:
#view columns
df.columns

Index(['userName', 'major', 'researchExp', 'industryExp', 'specialization',
       'toeflScore', 'program', 'department', 'toeflEssay', 'internExp',
       'greV', 'greQ', 'userProfileLink', 'journalPubs', 'greA', 'topperCgpa',
       'termAndYear', 'confPubs', 'ugCollege', 'gmatA', 'cgpa', 'gmatQ',
       'cgpaScale', 'gmatV', 'univName', 'admit'],
      dtype='object')

In [6]:
# view summary of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53644 entries, 0 to 53643
Data columns (total 26 columns):
userName           53644 non-null object
major              53257 non-null object
researchExp        53644 non-null int64
industryExp        53644 non-null int64
specialization     31949 non-null object
toeflScore         49230 non-null float64
program            53322 non-null object
department         53643 non-null object
toeflEssay         11874 non-null object
internExp          53630 non-null float64
greV               52388 non-null float64
greQ               52424 non-null float64
userProfileLink    53644 non-null object
journalPubs        53322 non-null object
greA               50786 non-null float64
topperCgpa         53641 non-null float64
termAndYear        53322 non-null object
confPubs           53322 non-null object
ugCollege          51366 non-null object
gmatA              119 non-null float64
cgpa               53644 non-null float64
gmatQ              123 non

In [7]:
df = df.drop(['gmatA','gmatQ','gmatV','specialization','department','program',
              'userProfileLink','topperCgpa','termAndYear','userName',
              'industryExp','internExp','confPubs','journalPubs','ugCollege',
              'major','univName'],1)

In [8]:
df.shape

(53644, 9)

In [9]:
df = df.dropna()

Explore categorical variables

In [10]:
categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 1 categorical variables

The categorical variables are :

 ['toeflEssay']


In [11]:
# view the categorical variables
df[categorical].head()

Unnamed: 0,toeflEssay
0,26
2,21
5,6
6,27
7,22


Missing values in categorical variables

In [12]:
# check missing values in categorical variables
df[categorical].isnull().sum()

toeflEssay    0
dtype: int64

Explore toeflEssay variable

In [13]:
# check labels in toeflEssay variable
df['toeflEssay'].unique()

array(['26', '21', '6', '27', '22', '24', '29', '30', '25', '28', '3',
       '4.5', '5.5', '20', '23', '5', '4', '0', '3.5', ' Chemistry',
       '2.5', '7', ' Information Technology', '18', '6.5', '2', '10',
       '41', '19', '60', '7.5', '45', '4.6', '1.5', '35'], dtype=object)

In [14]:
df = df[df['toeflEssay'] != ' Information Technology']

In [15]:
df = df[df['toeflEssay'] != ' Chemistry']

Explore admit variable

In [16]:
# check labels in admit variable
df['admit'].unique()

array([1, 0], dtype=int64)

Explore researchExp variable

In [17]:
# check labels in researchExp variable
df['researchExp'].unique()

array([ 0, 17, 12,  7, 15, 24, 10, 36,  2,  6,  3, 14,  4, 18,  8, 20,  9,
       42, 29, 16], dtype=int64)

In [18]:
a = df['toeflScore'].unique()
print(a)
print(type(a))

[ 112.   94.  273.  104.   95.  101.   91.  105.  107.  111.  103.   98.
  109.  108.  106.  110.  114.   89.   92.  237.   97.  257.   96.   85.
   93.   86.   79.   81.  117.  277.   84.   82.   83.  102.  116.  297.
  287.  280.  115.  267.  118.  100.  113.  263.  290.  283.   99.  120.
  270.  293.  300.   80.  119.   88.   90.   87.   57.  260.   65.  233.
   11. 1004.    7.  313.  223.  643.  253.  250. 1210.    0.  240.  247.
  275. 1350.  243.    9.  310.  587.  235.]
<class 'numpy.ndarray'>


Check missing values in categorical variables again

In [19]:
df[categorical].isnull().sum()

toeflEssay    0
dtype: int64

Number of labels: cardinality

In [20]:
# check for cardinality in categorical variables
for var in categorical:
    
    print(var, ' contains ', len(df[var].unique()), ' labels')

toeflEssay  contains  33  labels


Explore Numerical Variables

In [21]:
X = df.drop(['admit'], axis=1)

y = df['admit']

Split data into separate training and test set

In [22]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [23]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((8054, 8), (3452, 8))

In [24]:
# check data types in X_train

X_train.dtypes

researchExp      int64
toeflScore     float64
toeflEssay      object
greV           float64
greQ           float64
greA           float64
cgpa           float64
cgpaScale        int64
dtype: object

In [25]:
# display categorical variables
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']

categorical

['toeflEssay']

In [26]:
# print percentage of missing values in the categorical variables in training set

X_train[categorical].isnull().mean()

toeflEssay    0.0
dtype: float64

In [27]:
# print categorical variables with missing data
for col in categorical:
    if X_train[col].isnull().mean()>0:
        print(col, (X_train[col].isnull().mean()))

In [28]:
# impute missing categorical variables with most frequent value
for df2 in [X_train, X_test]:
    df2['toeflEssay'].fillna(X_train['toeflEssay'].mode()[0], inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [29]:
# check missing values in categorical variables in X_test

X_train[categorical].isnull().sum()

toeflEssay    0
dtype: int64

In [30]:
# check missing values in X_train

X_test[categorical].isnull().sum()

toeflEssay    0
dtype: int64

In [31]:
# check missing values in X_test

X_train.isnull().sum()

researchExp    0
toeflScore     0
toeflEssay     0
greV           0
greQ           0
greA           0
cgpa           0
cgpaScale      0
dtype: int64

In [32]:
X_test.isnull().sum()

researchExp    0
toeflScore     0
toeflEssay     0
greV           0
greQ           0
greA           0
cgpa           0
cgpaScale      0
dtype: int64

Encode categorical variables

In [33]:
# print categorical variables
categorical

['toeflEssay']

In [34]:
X_train[categorical].head()

Unnamed: 0,toeflEssay
19111,6
6313,22
46481,23
52653,25
33262,29


Model training 

In [35]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB

In [36]:
# instantiate the model
gnb = GaussianNB()

In [37]:
# fit the model
gnb.fit(X_train, y_train)

GaussianNB(priors=None)

Predict the results

In [38]:
y_pred = gnb.predict(X_test)

y_pred

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

Check accuracy score

In [39]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.5072


Compare the train-set and test-set accuracy

In [40]:
y_pred_train = gnb.predict(X_train)

y_pred_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [41]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 0.5024


Check for overfitting and underfitting

In [42]:
# print the scores on training and test set
print('Training set score: {:.4f}'.format(gnb.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(gnb.score(X_test, y_test)))

Training set score: 0.5024
Test set score: 0.5072


Compare model accuracy with null accuracy

In [43]:
# check class distribution in test set

y_test.value_counts()

0    1743
1    1709
Name: admit, dtype: int64

In [44]:
# check null accuracy score
null_accuracy = (1742/(1742+1712))

print('Null accuracy score: {0:0.4f}'. format(null_accuracy))

Null accuracy score: 0.5043
