In [185]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense

import eli5
from eli5.sklearn import PermutationImportance

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [167]:
# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
# NOTE(review): this call only seeds NumPy; whether the Keras backend's weight
# initialisation is covered depends on the backend in use — confirm.
seed = 7
np.random.seed(seed)

In [168]:
# load dataset
# header=None makes pandas treat the first CSV row as data, not as column names;
# the next cells promote that row to the header by hand.
# NOTE(review): presumably every column is therefore read as strings, which the later
# comparison against the string '99999' relies on — confirm before switching to header=0.
train_dataset = pd.read_csv("USCensusTraining.csv", header=None)
test_dataset = pd.read_csv("USCensusTest.csv", header=None)

In [169]:
# Promote the first row of the training frame to column names (the CSV was read with header=None).
# The remaining rows keep their original row labels, so the index starts at 1.
new_header = train_dataset.iloc[0] #grab the first row for the header
train_df = train_dataset[1:] #take the data less the header row
train_df.columns = new_header #set the header row as the df header
train_dataset = train_df  # rebind so later cells see the re-headed frame

In [170]:
# Promote the first row of the test frame to column names (the CSV was read with header=None).
# Same steps as for the training frame; new_header is reused as a scratch name.
new_header = test_dataset.iloc[0] #grab the first row for the header
test_df = test_dataset[1:] #take the data less the header row
test_df.columns = new_header #set the header row as the df header
test_dataset = test_df  # rebind so later cells see the re-headed frame

In [171]:
train_dataset.shape

(25000, 15)

In [172]:
test_dataset.shape

(7561, 14)

In [68]:
#dataset = dataset.drop(dataset[dataset['native-country'] == '?'].index)
#dataset = dataset.drop(dataset[dataset['capital-gain'] == '99999'].index)

In [173]:
# function to preprocess the data
def data_pre_process(dataset):
    """One-hot encode the categorical census columns and drop their originals.

    For each categorical column an indicator frame is built with
    ``pd.get_dummies`` and its columns are renamed to ``<prefix><value>``.
    The indicator frames are appended to ``dataset`` in the order listed
    below, after which the original categorical columns and the
    ``education`` column are removed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least the columns 'workclass', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'native-country' and
        'education'.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining original columns followed by the
        indicator columns. ``dataset`` itself is not modified.
    """
    # (source column, prefix given to its indicator columns), in output order
    encoded_columns = [
        ('workclass', 'workclass_'),
        ('marital-status', 'marital_status_'),
        ('occupation', 'occupation_'),
        ('relationship', 'relationship_'),
        ('race', 'race_'),
        ('sex', 'sex_'),
        ('native-country', 'native_country_'),
    ]

    frames = [dataset]
    for source_column, prefix in encoded_columns:
        indicators = pd.get_dummies(dataset[source_column])
        # bind prefix as a default so each rename uses its own prefix
        frames.append(indicators.rename(columns=lambda level, prefix=prefix: prefix + str(level)))

    combined = pd.concat(frames, axis = 1)

    # the raw categoricals are now redundant; 'education' is dropped as well
    redundant = [source_column for source_column, _ in encoded_columns] + ['education']
    return combined.drop(redundant, axis = 1)

In [174]:
train_df_new = data_pre_process(train_dataset)
test_df_new = data_pre_process(test_dataset)

# One-hot encoding is done per frame, so the test frame can lack indicator columns
# that exist in training (here 'native_country_Holand-Netherlands'). Simply appending
# the missing column would put it at the END of the test frame, while in the training
# frame it sits among the other native_country_* columns; once both frames are turned
# into positional arrays the test features would then be misaligned with what the
# model was trained on.
# Re-index the test frame onto the training feature columns instead: missing
# indicators are created as all-zero columns, every feature lands at its training
# position, and any indicator that never occurred in training is discarded.
feature_columns = train_df_new.columns.drop('income')
test_df_new = test_df_new.reindex(columns=feature_columns, fill_value=0)

In [175]:
# Training rows whose capital-gain equals the string '99999' are treated as outliers
# and removed. The comparison is against a string, matching how the column is stored.
is_outlier = train_df_new['capital-gain'] == '99999'
outlier_labels = train_df_new.index[is_outlier.to_numpy()]
train_df_new = train_df_new.drop(outlier_labels)

In [176]:
test_df_new.shape

(7561, 92)

In [177]:
train_df_new.shape

(24874, 93)

In [178]:
# Recode the target in place: '>50K.' becomes 1 and '<=50K.' becomes 0.
# Any other value in the column is left untouched.
for income_label, income_code in (('>50K.', 1), ('<=50K.', 0)):
    train_df_new.loc[train_df_new['income'] == income_label, 'income'] = income_code

In [179]:
train_df_new.head(3)

Unnamed: 0,age,demogweight,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
1,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [180]:
# scale
def normalizeData(dataset):
    """Min-max scale ``dataset`` and return the scaled values.

    A fresh ``MinMaxScaler`` is created and fitted on ``dataset`` on every
    call, so the scaling parameters always come from the data passed in.

    Parameters
    ----------
    dataset : array-like
        Data accepted by ``MinMaxScaler.fit_transform``.

    Returns
    -------
    The result of ``MinMaxScaler().fit_transform(dataset)``.
    """
    return preprocessing.MinMaxScaler().fit_transform(dataset)

In [181]:
# The training frame (features plus the 'income' target) is scaled exactly as before.
train_df_scaled = normalizeData(train_df_new)

# The test features must be scaled with the min/max learned from the TRAINING features.
# Fitting a second scaler on the test frame would map the same raw value to a different
# scaled value than the model saw during training. Min-max scaling works column by
# column, so a scaler fitted on the training feature columns alone uses the same
# parameters that were applied to those columns in train_df_scaled.
# Selecting by the training column list also puts the test features in training order.
# Test values outside the training range (e.g. capital-gain 99999, which is removed
# from the training frame only) fall outside [0, 1] after this transform.
feature_columns = train_df_new.columns.drop('income')
feature_scaler = preprocessing.MinMaxScaler().fit(train_df_new[feature_columns])
test_df_scaled = feature_scaler.transform(test_df_new[feature_columns])

In [184]:
# split predictors from target
# The scaled array has lost its column names, so the target is located by looking up
# the position of 'income' in the frame the array was built from, rather than relying
# on a hard-coded position that silently breaks when the column layout changes.
income_position = train_df_new.columns.get_loc('income')
numpyArr = pd.DataFrame(train_df_scaled)
dfX = numpyArr.drop(columns=income_position).to_numpy()
dfY = train_df_scaled[:, income_position]

In [186]:
# Hold out 40% of the rows for validation; random_state pins the shuffle.
X_train, X_val, Y_train, Y_val = train_test_split(
    dfX,
    dfY,
    test_size=0.4,
    random_state=1,
)

In [187]:
# Show only the dimensions of the training matrix (the recorded output is a shape
# tuple); displaying the array itself would dump its contents.
X_train.shape

(14924, 92)

In [188]:
X_val.shape

(9950, 92)

In [186]:
#Tune number of neurons in hidden layer

In [189]:
# function to create model
def create_model(neurons=1, input_dim=92):
    """Build and compile a network with one hidden layer for binary classification.

    Parameters
    ----------
    neurons : int, default 1
        Number of units in the hidden layer (the value tuned by the grid search).
    input_dim : int, default 92
        Number of input features. The default keeps the previous hard-coded
        width; pass a different value if the feature matrix changes.

    Returns
    -------
    A compiled ``Sequential`` model: a sigmoid hidden layer followed by a single
    sigmoid output unit, trained with binary cross-entropy and the Adam
    optimizer, reporting accuracy.
    """
    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, activation="sigmoid"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [190]:
# Wrap the Keras builder in a scikit-learn style estimator so it can be handed to GridSearchCV.
# epochs/batch_size/verbose given here are stored on the wrapper.
model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=10, verbose=1)

In [194]:
# grid search parameters
# candidate hidden-layer widths, keyed by the create_model argument they are passed to
neurons = [28, 29, 30]
param_grid = {"neurons": neurons}

In [197]:
# Exhaustive search over param_grid; cv and scoring are not passed, so the library defaults apply.
grid = GridSearchCV(estimator = model, param_grid= param_grid)

In [198]:
# Run the search on the training split only; the validation split is not touched here.
grid_result = grid.fit(X_train, Y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [199]:
# Pull the per-candidate summary out of the search results: mean and standard
# deviation of the test score, plus the parameter dict each pair belongs to.
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
parms = grid_result.cv_results_["params"]

In [200]:
# One line per candidate: mean score, (std of score), and the parameters used.
for mean, stdev, param in zip(means, stds, parms):
    print("{:f} ({:f}) with: {!r}".format(mean, stdev, param))

0.845685 (0.002961) with: {'neurons': 28}
0.847226 (0.001970) with: {'neurons': 29}
0.846958 (0.001908) with: {'neurons': 30}


In [206]:
# Build the final classifier with the hidden-layer width selected by the grid search
# (29 in the recorded run).
# The tunable create_model(neurons=...) defined for the search is reused here. Defining
# a second create_model with a hard-coded width would silently replace the tunable
# builder under the same name and would have to be edited by hand whenever the search
# result changes.
best_neurons = grid_result.best_params_["neurons"]

model = KerasClassifier(build_fn=create_model, neurons=best_neurons, epochs=10, batch_size=10, verbose=1)

In [208]:
# Train the final model on the training split. epochs and batch_size repeat the values
# given to the wrapper; verbose=2 is passed here in place of the wrapper's verbose=1.
model.fit(X_train, Y_train, epochs=10, batch_size=10, verbose=2)

Epoch 1/10
 - 2s - loss: 0.4167 - accuracy: 0.8017
Epoch 2/10
 - 2s - loss: 0.3567 - accuracy: 0.8327
Epoch 3/10
 - 2s - loss: 0.3437 - accuracy: 0.8381
Epoch 4/10
 - 2s - loss: 0.3352 - accuracy: 0.8431
Epoch 5/10
 - 2s - loss: 0.3294 - accuracy: 0.8454
Epoch 6/10
 - 2s - loss: 0.3260 - accuracy: 0.8465
Epoch 7/10
 - 2s - loss: 0.3231 - accuracy: 0.8505
Epoch 8/10
 - 2s - loss: 0.3217 - accuracy: 0.8503
Epoch 9/10
 - 2s - loss: 0.3208 - accuracy: 0.8513
Epoch 10/10
 - 2s - loss: 0.3201 - accuracy: 0.8506


<keras.callbacks.callbacks.History at 0x1983ab38108>

In [209]:
# Permutation importance of each input column, scored by accuracy.
# NOTE(review): the importances are computed on the training split (X_train, Y_train);
# consider X_val / Y_val if importance on held-out data is what is wanted — confirm intent.
perm = PermutationImportance(model, scoring="accuracy", random_state=1).fit(X_train, Y_train)









In [210]:
# Label the importances with the training frame's column names, minus the target.
# NOTE(review): assumes these names are in the same order as the columns of X_train — confirm.
eli5.show_weights(perm, feature_names = train_df_new.drop('income', axis=1).columns.tolist())

Weight,Feature
0.0284  ± 0.0029,capital-gain
0.0259  ± 0.0033,education-num
0.0131  ± 0.0028,marital_status_Never-married
0.0094  ± 0.0027,sex_Female
0.0063  ± 0.0010,hours-per-week
0.0058  ± 0.0014,relationship_Wife
0.0053  ± 0.0019,capital-loss
0.0049  ± 0.0024,relationship_Own-child
0.0042  ± 0.0014,marital_status_Divorced
0.0038  ± 0.0009,occupation_Other-service


In [211]:
# Predict on the held-out validation split.
val_predictions = model.predict(X_val)
# Take the first element of each prediction row and round it, giving a flat list.
# NOTE(review): assumes predict returns one row per sample with the label first — confirm.
val_predictions_rounded = [round(x[0]) for x in val_predictions]



In [212]:
# Import the function under an alias so that storing the result in `confusion_matrix`
# (the name the following cell displays) does not overwrite the function itself.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix
# predictions
# confusion matrix of validation targets versus the rounded validation predictions
confusion_matrix = compute_confusion_matrix(Y_val, val_predictions_rounded)

In [213]:
confusion_matrix

array([[6975,  645],
       [ 856, 1474]], dtype=int64)

In [214]:
from sklearn.metrics import accuracy_score
# Fraction of validation rows whose rounded prediction equals the target.
accuracy_score(Y_val, val_predictions_rounded)

0.8491457286432161

In [217]:
# Predict on the scaled test features, then reduce each prediction row to a plain int:
# first element of the row, rounded, converted with int().
predictions = model.predict(test_df_scaled)
rounded = [int(round(row[0])) for row in predictions]



In [218]:
data = rounded

# Write one predicted label per line.
# str(label) is written as-is. Joining the characters of str(label) with spaces only
# happens to work for single-character values; anything longer (e.g. 10 or -1) would
# be split into "1 0" / "- 1".
with open("Team3predictions.txt", "w") as txt_file:
    for label in data:
        txt_file.write(str(label) + "\n")