#### 1) Importing the relevant libraries :

In [1]:
import pandas as pd
import numpy as np

#### 2) Loading the Dataset

In [2]:
adult_data=pd.read_csv('adult.csv',skipinitialspace=True,na_values='?')

In [3]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          46043 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         46033 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     47985 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


#### 2.1) Dropping the NaN values:

In [5]:
adult_data.dropna(inplace=True)
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
age                45222 non-null int64
workclass          45222 non-null object
fnlwgt             45222 non-null int64
education          45222 non-null object
educational-num    45222 non-null int64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
gender             45222 non-null object
capital-gain       45222 non-null int64
capital-loss       45222 non-null int64
hours-per-week     45222 non-null int64
native-country     45222 non-null object
income             45222 non-null object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


#### 2.2) Dropping the redundant column:

In [6]:
adult_data.drop(labels='education',inplace=True,axis=1)
adult_data.columns

Index(['age', 'workclass', 'fnlwgt', 'educational-num', 'marital-status',
       'occupation', 'relationship', 'race', 'gender', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [7]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [8]:
adult_data['income'].value_counts()

<=50K    34014
>50K     11208
Name: income, dtype: int64

#### 3) Data Preprocessing :

#### 3.1) Label Encoding

In [9]:
#fetching categorical attributes from among input features
categorical_df=adult_data.loc[:,'age':'native-country'].select_dtypes(include='object')
#creating a list of categorical input features
categorical_attributes=categorical_df.columns.tolist()
#extracting the indices of categorical input attributes
categorical_indices=[]
for attributes in categorical_attributes:
    categorical_indices.append(adult_data.columns.get_loc(attributes))

In [10]:
#importing the label encoder class and label encoding categorical attributes
from sklearn.preprocessing import LabelEncoder
encoder_object=LabelEncoder()
for attribute in categorical_attributes:
    adult_data.loc[:,attribute]=encoder_object.fit_transform(adult_data.loc[:,attribute])
adult_data.head()

#labelencoding the target feature
adult_data.loc[:,'income']=encoder_object.fit_transform(adult_data.loc[:,'income'])

#### 3.2) OneHotEncoding

In [11]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder=OneHotEncoder(categorical_features=categorical_indices)
adult_data=hot_encoder.fit_transform(adult_data).toarray()
adult_data=pd.DataFrame(data=adult_data)

#### Checking the splits rendered to each categorical feature:

In [12]:
splits_rendered=pd.DataFrame(data={'feature':categorical_attributes,'indicex':categorical_indices,'splits':hot_encoder.n_values_})
splits_rendered

Unnamed: 0,feature,indicex,splits
0,workclass,1,7
1,marital-status,4,7
2,occupation,5,14
3,relationship,6,6
4,race,7,5
5,gender,8,2
6,native-country,12,41


#### 4) Sampling instances uniformly across the classes:

In [13]:
adult_data[88].value_counts()

0.0    34014
1.0    11208
Name: 88, dtype: int64

In [14]:
adult_negative=adult_data[adult_data[88]==0].sample(n=7500,replace=False)
adult_positive=adult_data[adult_data[88]==1].sample(n=7500,replace=False)
training_data=pd.concat([adult_negative,adult_positive])
training_data=training_data.reindex(np.random.permutation(training_data.index))
testing_data=adult_data.drop(training_data.index)
X_train=training_data.iloc[:,0:88]
Y_train=training_data.iloc[:,88]
X_test=testing_data.iloc[:,0:88]
Y_test=testing_data.iloc[:,88]

#### 5) Standardizing the training data and test data and applying PCA:


In [15]:
#standardixing the data
from sklearn.preprocessing import StandardScaler
standardizer=StandardScaler()
X_train=standardizer.fit_transform(X_train)
X_test=standardizer.transform(X_test)

# applying PrincipalComponentAnalaysis(PCA)
from sklearn.decomposition import PCA
pca_obj=PCA(0.90)
X_train=pca_obj.fit_transform(X_train)
X_test=pca_obj.transform(X_test)

print('Number of components :',pca_obj.n_components_)


Number of components : 66


#### 6) Applying ANN:

In [16]:
#importing deep learning libraries
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
#creating a function that returns a Neural Network
def Neural_Network(node_count,layer_count,optimizer):
    classifier=Sequential()
    input_layer=Dense(input_dim=pca_obj.n_components_,units=node_count,activation='relu',kernel_initializer='uniform')
    classifier.add(input_layer)
    for count in range(layer_count):
        hidden_layer=Dense(units=node_count,activation='relu',kernel_initializer='uniform')
        classifier.add(hidden_layer)
        classifier.add(Dropout(rate=0.45))
    output_layer=Dense(units=1,activation='sigmoid',kernel_initializer='uniform')
    classifier.add(output_layer)
    classifier.compile(optimizer=optimizer,metrics=['accuracy'],loss='binary_crossentropy')
    return classifier

from keras.wrappers.scikit_learn import KerasClassifier
neural_clf=KerasClassifier(build_fn=Neural_Network)

from sklearn.model_selection import GridSearchCV
hyperparams={'node_count':[5,15,25,35,45],
            'layer_count':[1,2,3,4,5],
            'optimizer':['adam','rmsprop']}
grid_object=GridSearchCV(estimator=neural_clf,scoring='accuracy',cv=5,param_grid=hyperparams,verbose=3)
grid_object.fit(X_train,Y_train)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] layer_count=1, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=adam, score=0.805, total=   1.1s
[CV] layer_count=1, node_count=5, optimizer=adam .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=adam, score=0.8023333333333333, total=   1.2s
[CV] layer_count=1, node_count=5, optimizer=adam .....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.6s remaining:    0.0s


Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=adam, score=0.8003333333333333, total=   1.2s
[CV] layer_count=1, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=adam, score=0.793, total=   1.3s
[CV] layer_count=1, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=adam, score=0.8053333333333333, total=   1.3s
[CV] layer_count=1, node_count=5, optimizer=rmsprop ..................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=rmsprop, score=0.8033333333333333, total=   1.3s
[CV] layer_count=1, node_count=5, optimizer=rmsprop ..................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=rmsprop, score=0.7903333333333333, total=   1.2s
[CV] layer_count=1, node_count=5, optimizer=rmsprop ..................
Epoch 1/1
[CV]  layer_count=1, node_count=5, optimizer=rmsprop, score=0.7933333333333333, total=   1.2s
[CV] layer_count=1, node_count=5, optimizer

[CV]  layer_count=1, node_count=35, optimizer=adam, score=0.8026666666666666, total=   2.4s
[CV] layer_count=1, node_count=35, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=1, node_count=35, optimizer=adam, score=0.8023333333333333, total=   2.6s
[CV] layer_count=1, node_count=35, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=1, node_count=35, optimizer=rmsprop, score=0.808, total=   2.4s
[CV] layer_count=1, node_count=35, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=1, node_count=35, optimizer=rmsprop, score=0.8046666666666666, total=   2.4s
[CV] layer_count=1, node_count=35, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=1, node_count=35, optimizer=rmsprop, score=0.8093333333333333, total=   2.4s
[CV] layer_count=1, node_count=35, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=1, node_count=35, optimizer=rmsprop, score=0.8, total=   2.5s
[CV] layer_count=1, node_count=35, optimizer=rmsprop ......

[CV]  layer_count=2, node_count=15, optimizer=rmsprop, score=0.8033333333333333, total=   3.9s
[CV] layer_count=2, node_count=15, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=15, optimizer=rmsprop, score=0.806, total=   3.6s
[CV] layer_count=2, node_count=15, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=15, optimizer=rmsprop, score=0.8063333333333333, total=   3.7s
[CV] layer_count=2, node_count=15, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=15, optimizer=rmsprop, score=0.794, total=   3.7s
[CV] layer_count=2, node_count=15, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=15, optimizer=rmsprop, score=0.8066666666666666, total=   3.7s
[CV] layer_count=2, node_count=25, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=2, node_count=25, optimizer=adam, score=0.8076666666666666, total=   4.0s
[CV] layer_count=2, node_count=25, optimizer=adam ....

[CV]  layer_count=2, node_count=45, optimizer=rmsprop, score=0.801, total=   4.8s
[CV] layer_count=2, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=45, optimizer=rmsprop, score=0.809, total=   4.8s
[CV] layer_count=2, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=45, optimizer=rmsprop, score=0.8, total=   4.8s
[CV] layer_count=2, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=2, node_count=45, optimizer=rmsprop, score=0.8053333333333333, total=   4.9s
[CV] layer_count=3, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=3, node_count=5, optimizer=adam, score=0.792, total=   5.3s
[CV] layer_count=3, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=3, node_count=5, optimizer=adam, score=0.497, total=   5.3s
[CV] layer_count=3, node_count=5, optimizer=adam .....................
Epoch 1/1
[CV]  layer_count=3

Epoch 1/1
[CV]  layer_count=3, node_count=25, optimizer=rmsprop, score=0.798, total=  11.1s
[CV] layer_count=3, node_count=25, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=3, node_count=25, optimizer=rmsprop, score=0.8023333333333333, total=   7.5s
[CV] layer_count=3, node_count=35, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=3, node_count=35, optimizer=adam, score=0.806, total=   7.5s
[CV] layer_count=3, node_count=35, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=3, node_count=35, optimizer=adam, score=0.7996666666666666, total=   7.6s
[CV] layer_count=3, node_count=35, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=3, node_count=35, optimizer=adam, score=0.8093333333333333, total=   7.6s
[CV] layer_count=3, node_count=35, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=3, node_count=35, optimizer=adam, score=0.7993333333333333, total=   7.7s
[CV] layer_count=3, node_count=35, optimizer=adam ...

[CV]  layer_count=4, node_count=5, optimizer=rmsprop, score=0.5036666666666667, total=   9.7s
[CV] layer_count=4, node_count=15, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=15, optimizer=adam, score=0.8083333333333333, total=  10.1s
[CV] layer_count=4, node_count=15, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=15, optimizer=adam, score=0.8, total=  10.2s
[CV] layer_count=4, node_count=15, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=15, optimizer=adam, score=0.49066666666666664, total=  10.3s
[CV] layer_count=4, node_count=15, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=15, optimizer=adam, score=0.793, total=  10.4s
[CV] layer_count=4, node_count=15, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=15, optimizer=adam, score=0.49633333333333335, total=  10.6s
[CV] layer_count=4, node_count=15, optimizer=rmsprop ..............

Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=adam, score=0.8043333333333333, total=  13.6s
[CV] layer_count=4, node_count=45, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=adam, score=0.81, total=  13.7s
[CV] layer_count=4, node_count=45, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=adam, score=0.801, total=  14.3s
[CV] layer_count=4, node_count=45, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=adam, score=0.795, total=  15.5s
[CV] layer_count=4, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=rmsprop, score=0.8073333333333333, total=  14.9s
[CV] layer_count=4, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=4, node_count=45, optimizer=rmsprop, score=0.8, total=  14.3s
[CV] layer_count=4, node_count=45, optimizer=rmsprop .................
Epoch 1/1
[

[CV]  layer_count=5, node_count=25, optimizer=adam, score=0.49066666666666664, total=  25.4s
[CV] layer_count=5, node_count=25, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=5, node_count=25, optimizer=adam, score=0.5023333333333333, total=  25.8s
[CV] layer_count=5, node_count=25, optimizer=adam ....................
Epoch 1/1
[CV]  layer_count=5, node_count=25, optimizer=adam, score=0.777, total=  25.9s
[CV] layer_count=5, node_count=25, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=5, node_count=25, optimizer=rmsprop, score=0.803, total=  22.9s
[CV] layer_count=5, node_count=25, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=5, node_count=25, optimizer=rmsprop, score=0.8033333333333333, total=  23.5s
[CV] layer_count=5, node_count=25, optimizer=rmsprop .................
Epoch 1/1
[CV]  layer_count=5, node_count=25, optimizer=rmsprop, score=0.8053333333333333, total=  22.9s
[CV] layer_count=5, node_count=25, optimizer=rmsprop ......

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 43.5min finished


Epoch 1/1


GridSearchCV(cv=5, error_score='raise',
       estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x00000270F4BA7710>,
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'node_count': [5, 15, 25, 35, 45], 'layer_count': [1, 2, 3, 4, 5], 'optimizer': ['adam', 'rmsprop']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [20]:
grid_object.best_params_


{'layer_count': 2, 'node_count': 35, 'optimizer': 'adam'}

In [21]:
neural_classifier=Neural_Network(node_count=35,layer_count=1,optimizer='adam')
neural_classifier.fit(X_train,Y_train,epochs=100,batch_size=30)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x271bdb57048>

In [33]:
from sklearn.preprocessing import binarize
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import binarize
Y_pred1=neural_classifier.predict(X_train)
Y_pred2=neural_classifier.predict(X_test)

Y_pred1=binarize(Y_pred1,threshold=0.50)
Y_pred2=binarize(Y_pred2,threshold=0.50)

cm1=confusion_matrix(Y_train,Y_pred1)
cm2=confusion_matrix(Y_test,Y_pred2)


print('PERFORMANCE OF THE CLASSIFIER ON THE TRAINING SET:')
print('Accuracy:',100*accuracy_score(Y_train,Y_pred1))
print(cm1)
print('Specificity:',100*(cm1[0,0]/(cm1[0,0]+cm1[0,1])))
print('Sensitivity:',100*(cm1[1,1]/(cm1[1,0]+cm1[1,1])))

print('\n')

print('PERFORMANCE OF THE CLASSIFIER ON THE TESTING SET:')
print('Accuracy:',100*accuracy_score(Y_test,Y_pred2))
print(cm2)
print('Specificity:',100*(cm2[0,0]/(cm2[0,0]+cm2[0,1])))
print('Sensitivity:',100*(cm2[1,1]/(cm1[1,0]+cm2[1,1])))



PERFORMANCE OF THE CLASSIFIER ON THE TRAINING SET:
Accuracy: 84.72
[[5959 1541]
 [ 751 6749]]
Specificity: 79.45333333333333
Sensitivity: 89.98666666666666


PERFORMANCE OF THE CLASSIFIER ON THE TESTING SET:
Accuracy: 77.3575540996625
[[20164  6350]
 [  493  3215]]
Specificity: 76.05038847401373
Sensitivity: 81.06404437720626
