In [1]:
import numpy as np
import pandas as pd


POS_STR = ' >50K'

train_file_path='adult_data.csv'
test_file_path='adult_test.csv'



def load_train_data(train_file_path, valid_rate=0.25, is_df=True):
    data_frame = pd.read_csv(train_file_path).sample(frac=1, random_state=11)
    np.random.seed(11)
    mask = np.random.rand(len(data_frame)) < 1 - valid_rate
    train_df, valid_df = data_frame.iloc[mask, :], data_frame.iloc[~mask, :]
    if is_df:
        return train_df, valid_df

    train_labels = [1 if x == POS_STR else 0 for x in train_df.iloc[:, 14].values]
    valid_labels = [1 if x == POS_STR else 0 for x in valid_df.iloc[:, 14].values]
    return train_df.iloc[:, :14].values, np.array(train_labels), valid_df.iloc[:, :14].values, np.array(valid_labels)


def load_test_data(test_file_path, is_df=True):
    data_frame = pd.read_csv(test_file_path)
    if is_df:
        return data_frame

    test_labels = [1 if x == POS_STR else 0 for x in data_frame.iloc[:, 14].values]
    return data_frame.iloc[:, :14].values, np.array(test_labels)


In [4]:
Xtr,Xva=load_train_data(train_file_path)
Xte=load_test_data(test_file_path)
Xtr.corr()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.078252,0.030518,0.080028,0.053824,0.067805
fnlwgt,-0.078252,1.0,-0.040856,-0.000322,-0.012357,-0.018081
education_num,0.030518,-0.040856,1.0,0.124092,0.078441,0.154133
capital_gain,0.080028,-0.000322,0.124092,1.0,-0.032003,0.081663
capital_loss,0.053824,-0.012357,0.078441,-0.032003,1.0,0.051736
hours_per_week,0.067805,-0.018081,0.154133,0.081663,0.051736,1.0


In [5]:
Xtr.drop('education',axis=1,inplace=True)
Xva.drop('education',axis=1,inplace=True)
Xte.drop('education',axis=1,inplace=True)

In [6]:
Xtr.drop('fnlwgt',axis=1,inplace=True)
Xva.drop('fnlwgt',axis=1,inplace=True)
Xte.drop('fnlwgt',axis=1,inplace=True)

In [152]:
def gen_onehot_features(X,featureList):
    for feature in featureList:
        get_dummy_features= pd.get_dummies(X[feature])
        X=pd.concat([X,get_dummy_features],axis=1)
    return X  


In [7]:
Xtr['income'] = Xtr['income'].map( {' <=50K':0, ' >50K':1} )

Xte['income'] = Xte['income'].map( {' <=50K.':0, ' >50K.':1} )

Xva['income'] = Xva['income'].map( {' <=50K':0, ' >50K':1} )

In [8]:
Xtr.columns = [
   "Age", "WorkClass", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]
Xva.columns = [
   "Age", "WorkClass", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]
Xte.columns = [
   "Age", "WorkClass", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]

In [9]:
# Convert income to 0 and 1
Ytr = Xtr["Income"].values
Xtr.drop("Income", axis=1, inplace=True,)
# Convert income to 0 and 1
Yva = Xva["Income"].values
Xva.drop("Income", axis=1, inplace=True,)
# Convert income to 0 and 1
Yte = Xte["Income"].values
Xte.drop("Income", axis=1, inplace=True,)

In [10]:
print(f"Mean of the features {np.mean(Xtr[contiCol],axis=0)}")
print(f"Minimum of the features {np.min(Xtr[contiCol],axis=0)}")
print(f"Maximum of the features {np.max(Xtr[contiCol],axis=0)}")
print(f"Variance of the features {np.var(Xtr[contiCol],axis=0)}")

NameError: name 'contiCol' is not defined

In [11]:
Xtr.Age = Xtr.Age.astype(float)
# Xtr.FnlWgt = Xtr.FnlWgt.astype(float)
Xtr.EducationNum = Xtr.EducationNum.astype(float)
Xtr.HoursPerWeek = Xtr.HoursPerWeek.astype(float)

Xva.Age = Xva.Age.astype(float)
# Xva.FnlWgt = Xva.FnlWgt.astype(float)
Xva.EducationNum = Xva.EducationNum.astype(float)
Xva.HoursPerWeek = Xva.HoursPerWeek.astype(float)

Xte.Age = Xte.Age.astype(float)
# Xte.FnlWgt = Xte.FnlWgt.astype(float)
Xte.EducationNum = Xte.EducationNum.astype(float)
Xte.HoursPerWeek = Xte.HoursPerWeek.astype(float)

In [12]:
Xtr = pd.get_dummies(Xtr, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

Xva = pd.get_dummies(Xva, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

Xte = pd.get_dummies(Xte, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

In [13]:
Xva.insert(loc=65, column='NativeCountry_ Holand_Netherlands', value=0)
Xte.insert(loc=65, column='NativeCountry_ Holand_Netherlands', value=0)

In [14]:
print(Xtr.shape)
print(Xte.shape)

(24413, 91)
(16281, 91)


In [15]:
print(Xtr.columns)
print(Xte.columns)

Index(['Age', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek',
       'WorkClass_ ?', 'WorkClass_ Federal_gov', 'WorkClass_ Local_gov',
       'WorkClass_ Never_worked', 'WorkClass_ Private',
       'WorkClass_ Self_emp_inc', 'WorkClass_ Self_emp_not_inc',
       'WorkClass_ State_gov', 'WorkClass_ Without_pay',
       'MaritalStatus_ Divorced', 'MaritalStatus_ Married_AF_spouse',
       'MaritalStatus_ Married_civ_spouse',
       'MaritalStatus_ Married_spouse_absent', 'MaritalStatus_ Never_married',
       'MaritalStatus_ Separated', 'MaritalStatus_ Widowed', 'Occupation_ ?',
       'Occupation_ Adm_clerical', 'Occupation_ Armed_Forces',
       'Occupation_ Craft_repair', 'Occupation_ Exec_managerial',
       'Occupation_ Farming_fishing', 'Occupation_ Handlers_cleaners',
       'Occupation_ Machine_op_inspct', 'Occupation_ Other_service',
       'Occupation_ Priv_house_serv', 'Occupation_ Prof_specialty',
       'Occupation_ Protective_serv', 'Occupation_ Sales',
      

In [16]:
contiCol = ["Age", "EducationNum", "HoursPerWeek","CapitalGain","CapitalLoss"]
categCol =['WorkClass_ ?', 'WorkClass_ Federal_gov', 'WorkClass_ Local_gov',
       'WorkClass_ Never_worked', 'WorkClass_ Private',
       'WorkClass_ Self_emp_inc', 'WorkClass_ Self_emp_not_inc',
       'WorkClass_ State_gov', 'WorkClass_ Without_pay',
       'MaritalStatus_ Divorced', 'MaritalStatus_ Married_AF_spouse',
       'MaritalStatus_ Married_civ_spouse',
       'MaritalStatus_ Married_spouse_absent', 'MaritalStatus_ Never_married',
       'MaritalStatus_ Separated', 'MaritalStatus_ Widowed', 'Occupation_ ?',
       'Occupation_ Adm_clerical', 'Occupation_ Armed_Forces',
       'Occupation_ Craft_repair', 'Occupation_ Exec_managerial',
       'Occupation_ Farming_fishing', 'Occupation_ Handlers_cleaners',
       'Occupation_ Machine_op_inspct', 'Occupation_ Other_service',
       'Occupation_ Priv_house_serv', 'Occupation_ Prof_specialty',
       'Occupation_ Protective_serv', 'Occupation_ Sales',
       'Occupation_ Tech_support', 'Occupation_ Transport_moving',
       'Relationship_ Husband', 'Relationship_ Not_in_family',
       'Relationship_ Other_relative', 'Relationship_ Own_child',
       'Relationship_ Unmarried', 'Relationship_ Wife',
       'Race_ Amer_Indian_Eskimo', 'Race_ Asian_Pac_Islander', 'Race_ Black',
       'Race_ Other', 'Race_ White', 'Gender_ Female', 'Gender_ Male',
       'NativeCountry_ ?', 'NativeCountry_ Cambodia', 'NativeCountry_ Canada',
       'NativeCountry_ China', 'NativeCountry_ Columbia',
       'NativeCountry_ Cuba', 'NativeCountry_ Dominican_Republic',
       'NativeCountry_ Ecuador', 'NativeCountry_ El_Salvador',
       'NativeCountry_ England', 'NativeCountry_ France',
       'NativeCountry_ Germany', 'NativeCountry_ Greece',
       'NativeCountry_ Guatemala', 'NativeCountry_ Haiti',
       'NativeCountry_ Holand_Netherlands', 'NativeCountry_ Honduras',
       'NativeCountry_ Hong', 'NativeCountry_ Hungary', 'NativeCountry_ India',
       'NativeCountry_ Iran', 'NativeCountry_ Ireland', 'NativeCountry_ Italy',
       'NativeCountry_ Jamaica', 'NativeCountry_ Japan', 'NativeCountry_ Laos',
       'NativeCountry_ Mexico', 'NativeCountry_ Nicaragua',
       'NativeCountry_ Outlying_US(Guam_USVI_etc)', 'NativeCountry_ Peru',
       'NativeCountry_ Philippines', 'NativeCountry_ Poland',
       'NativeCountry_ Portugal', 'NativeCountry_ Puerto_Rico',
       'NativeCountry_ Scotland', 'NativeCountry_ South',
       'NativeCountry_ Taiwan', 'NativeCountry_ Thailand',
       'NativeCountry_ Trinadad&Tobago', 'NativeCountry_ United_States',
       'NativeCountry_ Vietnam', 'NativeCountry_ Yugoslavia']

categCol1 = ['WorkClass_ ?', 'WorkClass_ Federal-gov', 'WorkClass_ Local-gov',
       'WorkClass_ Never-worked', 'WorkClass_ Private',
       'WorkClass_ Self-emp-inc', 'WorkClass_ Self-emp-not-inc',
       'WorkClass_ State-gov', 'WorkClass_ Without-pay',
       'MaritalStatus_ Divorced', 'MaritalStatus_ Married-AF-spouse',
       'MaritalStatus_ Married-civ-spouse',
       'MaritalStatus_ Married-spouse-absent', 'MaritalStatus_ Never-married',
       'MaritalStatus_ Separated', 'MaritalStatus_ Widowed', 'Occupation_ ?',
       'Occupation_ Adm-clerical', 'Occupation_ Armed-Forces',
       'Occupation_ Craft-repair', 'Occupation_ Exec-managerial',
       'Occupation_ Farming-fishing', 'Occupation_ Handlers-cleaners',
       'Occupation_ Machine-op-inspct', 'Occupation_ Other-service',
       'Occupation_ Priv-house-serv', 'Occupation_ Prof-specialty',
       'Occupation_ Protective-serv', 'Occupation_ Sales',
       'Occupation_ Tech-support', 'Occupation_ Transport-moving',
       'Relationship_ Husband', 'Relationship_ Not-in-family',
       'Relationship_ Other-relative', 'Relationship_ Own-child',
       'Relationship_ Unmarried', 'Relationship_ Wife',
       'Race_ Amer-Indian-Eskimo', 'Race_ Asian-Pac-Islander', 'Race_ Black',
       'Race_ Other', 'Race_ White', 'Gender_ Female', 'Gender_ Male',
       'NativeCountry_ ?', 'NativeCountry_ Cambodia', 'NativeCountry_ Canada',
       'NativeCountry_ China', 'NativeCountry_ Columbia',
       'NativeCountry_ Cuba', 'NativeCountry_ Dominican-Republic',
       'NativeCountry_ Ecuador', 'NativeCountry_ El-Salvador',
       'NativeCountry_ England', 'NativeCountry_ France',
       'NativeCountry_ Germany', 'NativeCountry_ Greece',
       'NativeCountry_ Guatemala', 'NativeCountry_ Haiti',
       'NativeCountry_ Honduras', 'NativeCountry_ Holand_Netherlands',
       'NativeCountry_ Hong', 'NativeCountry_ Hungary', 'NativeCountry_ India',
       'NativeCountry_ Iran', 'NativeCountry_ Ireland', 'NativeCountry_ Italy',
       'NativeCountry_ Jamaica', 'NativeCountry_ Japan', 'NativeCountry_ Laos',
       'NativeCountry_ Mexico', 'NativeCountry_ Nicaragua',
       'NativeCountry_ Outlying-US(Guam-USVI-etc)', 'NativeCountry_ Peru',
       'NativeCountry_ Philippines', 'NativeCountry_ Poland',
       'NativeCountry_ Portugal', 'NativeCountry_ Puerto-Rico',
       'NativeCountry_ Scotland', 'NativeCountry_ South',
       'NativeCountry_ Taiwan', 'NativeCountry_ Thailand',
       'NativeCountry_ Trinadad&Tobago', 'NativeCountry_ United-States',
       'NativeCountry_ Vietnam', 'NativeCountry_ Yugoslavia']

In [160]:
Xtr.head()

Unnamed: 0,Age,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,WorkClass_ ?,WorkClass_ Federal_gov,WorkClass_ Local_gov,WorkClass_ Never_worked,WorkClass_ Private,...,NativeCountry_ Portugal,NativeCountry_ Puerto_Rico,NativeCountry_ Scotland,NativeCountry_ South,NativeCountry_ Taiwan,NativeCountry_ Thailand,NativeCountry_ Trinadad&Tobago,NativeCountry_ United_States,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia
24337,62.0,9.0,0,0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
17049,50.0,9.0,0,0,40.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
21016,36.0,9.0,0,0,50.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2790,64.0,10.0,0,0,40.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13511,28.0,11.0,0,0,60.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [146]:
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler
import pandas
import numpy

# df1 = df[['a','b']]
def standardization(X):
    Xscaler = X[contiCol]
    array = Xscaler.values
    scaler = StandardScaler().fit(Xscaler)
    rescaledX = scaler.transform(Xscaler)
    return rescaledX  

Xtr_scaled = standardization(Xtr)
Xtr.Age = Xtr_scaled[:,0]
# Xtr.FnlWgt = Xtr_scaled[:,1]
Xtr.EducationNum = Xtr_scaled[:,1]
Xtr.HoursPerWeek = Xtr_scaled[:,2]
Xtr.CapitalGain = Xtr_scaled[:,3]
Xtr.CapitalLoss = Xtr_scaled[:,4]

Xva_scaled = standardization(Xva)
Xva.Age = Xva_scaled[:,0]
# Xva.FnlWgt = Xva_scaled[:,1]
Xva.EducationNum = Xva_scaled[:,1]
Xva.HoursPerWeek = Xva_scaled[:,2]
Xva.CapitalGain= Xva_scaled[:,3]
Xva.CapitalLoss= Xva_scaled[:,4]


Xte_scaled = standardization(Xte)
Xte.Age = Xte_scaled[:,0]
# Xte.FnlWgt = Xte_scaled[:,1]
Xte.EducationNum = Xte_scaled[:,1]
Xte.HoursPerWeek = Xte_scaled[:,2]
Xte.CapitalGain= Xte_scaled[:,3]
Xte.CapitalLoss= Xte_scaled[:,4]


Xtr.drop('CapitalGain',axis=1,inplace=True)
Xva.drop('CapitalGain',axis=1,inplace=True)
Xte.drop('CapitalGain',axis=1,inplace=True)
Xtr.drop('CapitalLoss',axis=1,inplace=True)
Xva.drop('CapitalLoss',axis=1,inplace=True)
Xte.drop('CapitalLoss',axis=1,inplace=True)

In [161]:
#Logistic regression
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(random_state=0).fit(Xtr, Ytr)
Ytrhat = clf.predict(Xtr)
auc_tr = roc_auc_score(Ytr, Ytrhat)
accuracy_tr = clf.score(Xtr, Ytr)
print(f"AUC for training data {auc_tr}")
print(f"Acuracy for training data {accuracy_tr}")

AUC for training data 0.7632802889071495
Acuracy for training data 0.8509400729119732


In [162]:
Yvahat = clf.predict(Xva)
auc_val = roc_auc_score(Yva, Yvahat)
accuracy_val = clf.score(Xva, Yva)
print(f"AUC for validation data {auc_val}")
print(f"Acuracy for validation data {accuracy_val}")

AUC for validation data 0.7736834459647698
Acuracy for validation data 0.8565292096219931


In [163]:
Ytehat = clf.predict(Xte)
auc_test = roc_auc_score(Yte, Ytehat)
accuracy_test = clf.score(Xte, Yte)
print(f"AUC for test data {auc_test}")
print(f"Acuracy for test data {accuracy_test}")

AUC for test data 0.7634631440746171
Acuracy for test data 0.8519746944290891


In [164]:
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(Xtr[contiCol], Ytr)

YtrhatGNB = gnb.predict(Xtr[contiCol])
auc_tr_gnb = roc_auc_score(Ytr, YtrhatGNB)
accuracy_tr_gnb = gnb.score(Xtr[contiCol], Ytr)
print(f"AUC for training data {auc_tr_gnb}")
print(f"Acuracy for training data {accuracy_tr_gnb}")

AUC for training data 0.63012424107179
Acuracy for training data 0.7962151312825134


In [165]:
YvahatGNB = gnb.predict(Xva[contiCol])
auc_va_gnb = roc_auc_score(Yva, YvahatGNB)
accuracy_va_gnb = gnb.score(Xva[contiCol], Yva)
print(f"AUC for validation data {auc_va_gnb}")
print(f"Acuracy for validation data {accuracy_va_gnb}")

AUC for validation data 0.6275590817552993
Acuracy for validation data 0.7986008836524301


In [166]:
YtehatGNB = gnb.predict(Xte[contiCol])
auc_te_gnb = roc_auc_score(Yte, YtehatGNB)
accuracy_te_gnb = gnb.score(Xte[contiCol], Yte)
print(f"AUC for test data {auc_te_gnb}")
print(f"Acuracy for test data {accuracy_te_gnb}")

AUC for test data 0.6248953215064671
Acuracy for test data 0.796265585651987


In [167]:
Yvahat1=pd.DataFrame(YvahatGNB)
Ytehat1=pd.DataFrame(YtehatGNB)

In [168]:
#MultinomialNB

from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(Xtr[categCol], Ytr)
YtrhatMNB = mnb.predict(Xtr[categCol])
auc_tr_mnb = roc_auc_score(Ytr, YtrhatMNB)
accuracy_tr_mnb = mnb.score(Xtr[categCol], Ytr)
print(f"AUC for training data {auc_tr_mnb}")
print(f"Acuracy for training data {accuracy_tr_mnb}")

AUC for training data 0.7632765156281706
Acuracy for training data 0.7620530045467578


In [169]:
YvahatMNB = mnb.predict(Xva[categCol])
auc_va_mnb = roc_auc_score(Yva, YvahatMNB)
accuracy_va_mnb = mnb.score(Xva[categCol], Yva)
print(f"AUC for validation data {auc_va_mnb}")
print(f"Acuracy for validation data {accuracy_va_mnb}")

AUC for validation data 0.7656534812267674
Acuracy for validation data 0.7622729504172803


In [170]:
YtehatMNB = mnb.predict(Xte[categCol1])
auc_test_mnb = roc_auc_score(Yte, YtehatMNB)
accuracy_test_mnb = mnb.score(Xte[categCol1], Yte)
print(f"AUC for test data {auc_test_mnb}")
print(f"Acuracy for test data {accuracy_test_mnb}")

AUC for test data 0.7673673460810567
Acuracy for test data 0.7667219458264234


In [171]:
Yvahat2=pd.DataFrame(YvahatMNB)
Ytehat2=pd.DataFrame(YtehatMNB)

In [172]:
Xva.reset_index(drop=True, inplace=True)
Yvahat1.reset_index(drop=True, inplace=True)
Yvahat2.reset_index(drop=True, inplace=True)
Xte.reset_index(drop=True, inplace=True)
Ytehat1.reset_index(drop=True, inplace=True)
Ytehat2.reset_index(drop=True, inplace=True)

In [173]:

df_val=pd.concat([Xva, Yvahat1,Yvahat2],axis=1)
df_test=pd.concat([Xte, Ytehat1,Ytehat2],axis=1)
# print(df_val,df_test)

model = LogisticRegression()
model.fit(df_val,Yva)
Ytehaten = model.predict(df_test)
accuracy_test_ensemble = model.score(df_test,Yte)
auc_test_ensemble = roc_auc_score(Yte,Ytehaten)
print(f"AUC for test data {auc_test_ensemble}")
print(f"Acuracy for test data {accuracy_test_ensemble}")

AUC for test data 0.7659694686943087
Acuracy for test data 0.8544315459738345


In [18]:
Xtr.shape

(24413, 91)

In [174]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

mlp.fit(Xtr, Ytr)  
YtrhatNN = mlp.predict(Xtr)
auc_tr_nn= roc_auc_score(Ytr, YtrhatNN)
accuracy_tr_nn = mlp.score(Xtr, Ytr)
print(f"AUC for training data for neural networks  {auc_tr_nn}")
print(f"Acuracy for training data for neural networks {accuracy_tr_nn}")

AUC for training data for neural networks  0.5
Acuracy for training data for neural networks 0.7584483676729611


In [175]:
YvahatNN = mlp.predict(Xva)
auc_va_nn= roc_auc_score(Yva, YvahatNN)
accuracy_va_nn = mlp.score(Xva, Yva)
print(f"AUC for validation data for neural networks  {auc_va_nn}")
print(f"Acuracy for validation data for neural networks {accuracy_va_nn}")

AUC for validation data for neural networks  0.5
Acuracy for validation data for neural networks 0.7614138438880707


In [176]:
YtehatNN = mlp.predict(Xte)
auc_te_nn= roc_auc_score(Yte, YtehatNN)
accuracy_te_nn = mlp.score(Xte, Yte)
print(f"AUC for test data for neural networks  {auc_te_nn}")
print(f"Acuracy for test data for neural networks {accuracy_te_nn}")

AUC for test data for neural networks  0.5
Acuracy for test data for neural networks 0.7637737239727289
