In [1]:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
import numpy as np

### Data Prep

In [2]:
# Column names for the two data files, in file order.
column = [
    "age", "work_class", "gnlwgt", "education", "education-num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "salary",
]
# header=None: the files are read without a header row, so names are supplied.
x_train = pd.read_csv("adult-new.data.txt", names=column, header=None)
x_test = pd.read_csv("adult-new.test", names=column, header=None)

In [3]:
# Binary target: 0 where salary is exactly " <=50K" (leading space included),
# 1 for every other value.
x_train["target"] = np.where(x_train["salary"] != " <=50K", 1, 0)
x_test["target"] = np.where(x_test["salary"] != " <=50K", 1, 0)
y_train = x_train["target"]
y_test = x_test["target"]

# Test labels restricted to each sex (values carry a leading space).
y_test_female = x_test.loc[x_test["sex"] == " Female", "target"]
y_test_male = x_test.loc[x_test["sex"] == " Male", "target"]

# Remove the label columns so only features remain in the frames.
x_train = x_train.drop(columns=["salary", "target"])
x_test = x_test.drop(columns=["salary", "target"])

In [4]:
# One-hot encode test and train together so both end up with the same
# dummy columns.
dfall = pd.concat([x_test, x_train], axis=0)
all_dummy = pd.get_dummies(dfall, dummy_na=False)

# The test rows were stacked first, so they are the leading block of rows;
# everything after them is the training data.
x_test_dum = all_dummy.iloc[:x_test.shape[0]]
x_train_dum = all_dummy.iloc[x_test.shape[0]:]

In [5]:
# Encoded test features per sex, selected through the one-hot sex indicators.
# Rows keep their original test order, the same order used when the
# per-sex label series were filtered from x_test.
female_test_x = x_test_dum.loc[x_test_dum["sex_ Female"] == 1]
male_test_x = x_test_dum.loc[x_test_dum["sex_ Male"] == 1]

In [6]:
# Sanity check: (rows, columns) of the one-hot encoded training frame.
x_train_dum.shape

(32561, 108)

### Affine Classifier

In [7]:
import time

# Wall-clock timing of the whole hyper-parameter search.
start = time.time()

# Scale features, then fit a logistic regression.
# solver="liblinear" is named explicitly because the grid below includes the
# "l1" penalty: the default solver of scikit-learn >= 0.22 (lbfgs) does not
# support l1, so those candidates would fail to fit. liblinear supports both
# l1 and l2 and was the default solver in older releases.
pipe = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(fit_intercept=True, dual=False, solver="liblinear"),
)

# Search over regularisation strength (10 log-spaced values, 1e-3 .. 1e2)
# and penalty type.
param_grid = {
    "logisticregression__C": np.logspace(-3, 2, 10),
    "logisticregression__penalty": ["l1", "l2"],
}

# 10-fold cross-validated grid search on the encoded training data.
grid = GridSearchCV(pipe, cv=10, param_grid=param_grid, verbose=False)
grid.fit(x_train_dum, y_train)
print("Time: ", time.time() - start)

Time:  356.6058702468872


In [27]:
# Show the parameter combination selected by the logistic-regression search.
print(grid.best_params_)

{'logisticregression__C': 100.0, 'logisticregression__penalty': 'l1'}


In [8]:
# Predictions from the fitted logistic-regression search, on the training
# data and on the held-out test data.
train_logistic = grid.predict(x_train_dum)
pred_logistic = grid.predict(x_test_dum)

In [9]:
def acc(y1, ypred):
    """Return the misclassification rate of ``ypred`` against ``y1``.

    Despite the name, the result is an error rate, not an accuracy: it is
    one minus the fraction of positions where the two inputs are equal.

    Parameters
    ----------
    y1 : array-like with ``.shape``
        Reference labels; its first dimension is the denominator.
    ypred : array-like
        Predicted labels, compared element-wise with ``y1``.

    Returns
    -------
    float
        ``1 - (number of equal positions) / y1.shape[0]``.
    """
    return 1 - np.sum(y1 == ypred) / y1.shape[0]

In [10]:
# acc() returns 1 - (share of matching labels), so these are error rates.
print("The train error rate for logistic regression is: ",acc(y_train,train_logistic))
print("The test error rate for logistic regression is: ",acc(y_test,pred_logistic))

The train error rate for logistic regression is:  0.146647830226
The test error rate for logistic regression is:  0.147841041705


In [11]:
# Logistic-regression predictions for the female and the male test subsets.
logistic_f_pred = grid.predict(female_test_x)
logistic_m_pred = grid.predict(male_test_x)

In [12]:
def fp(y1, ypred):
    """Return the share of all predictions that are false positives.

    A position counts when the prediction equals 1 and differs from the
    reference label. The count is divided by the total number of
    predictions, so this is a fraction of all samples rather than a rate
    relative to the actual negatives.

    Parameters
    ----------
    y1 : array-like
        Reference labels.
    ypred : array-like with ``.shape``
        Predicted labels; its first dimension is the denominator.

    Returns
    -------
    float
        ``(number of wrong predictions equal to 1) / ypred.shape[0]``.
    """
    wrongly_positive = (ypred == 1) & (ypred != y1)
    return np.sum(wrongly_positive) / ypred.shape[0]

In [13]:
def fn(y1, ypred):
    """Return the share of all predictions that are false negatives.

    A position counts when the prediction equals 0 and differs from the
    reference label. The count is divided by the total number of
    predictions, so this is a fraction of all samples rather than a rate
    relative to the actual positives.

    Parameters
    ----------
    y1 : array-like
        Reference labels.
    ypred : array-like with ``.shape``
        Predicted labels; its first dimension is the denominator.

    Returns
    -------
    float
        ``(number of wrong predictions equal to 0) / ypred.shape[0]``.
    """
    wrongly_negative = (ypred == 0) & (ypred != y1)
    return np.sum(wrongly_negative) / ypred.shape[0]

In [14]:
# Logistic regression: false-negative / false-positive shares on the
# female test subset ...
fn_logistic_female = fn(y_test_female, logistic_f_pred)
fp_logistic_female = fp(y_test_female, logistic_f_pred)

# ... and on the male test subset.
fn_logistic_male = fn(y_test_male, logistic_m_pred)
fp_logistic_male = fp(y_test_male, logistic_m_pred)

In [15]:
# Each line prints fn then fp; first line is female, second line is male.
print(fn_logistic_female,fp_logistic_female)
print(fn_logistic_male,fp_logistic_male)

0.0521739130435 0.0188714153562
0.115575579257 0.0704303052593


### Random Forest

In [29]:
import time

from sklearn.ensemble import RandomForestClassifier

# Wall-clock timing of the whole hyper-parameter search.
start = time.time()

# Column count of the raw (pre-encoding) feature frame. The search below does
# not use it; the name is kept in case other cells read it.
n = x_train.shape[1]

# random_state fixes the forest's randomness so that re-running the notebook
# reproduces the same fitted model and scores.
# n_estimators and max_features are deliberately not set here: param_grid
# overrides both for every candidate, so constructor values would be dead.
rfc = RandomForestClassifier(random_state=0)

# Candidates: forest size, and fraction of features considered at each split.
param_grid = {
    "n_estimators": [150, 200],
    "max_features": [1.0, 0.5],
}

# 3-fold cross-validated grid search on the encoded training data.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3)
CV_rfc.fit(x_train_dum, y_train)
print("The process took: ", time.time() - start, "seconds")
print("The Best parameters are: ", CV_rfc.best_params_)

The process took:  241.01130032539368 seconds
The Best parameters are:  {'max_features': 0.5, 'n_estimators': 150}


In [30]:
# Predictions from the fitted random-forest search, on the training data and
# on the held-out test data.
train_rf = CV_rfc.predict(x_train_dum)
pred_rf = CV_rfc.predict(x_test_dum)

In [31]:
# acc() returns 1 - (share of matching labels), so these are error rates.
# NOTE(review): the messages say "regression", but the model fitted above is a
# RandomForestClassifier; wording left unchanged to match recorded output.
print("The train error rate for random forest regression is: ",acc(y_train,train_rf))
print("The test error rate for random forest regression is: ",acc(y_test,pred_rf))

The train error rate for random forest regression is:  6.14231749639e-05
The test error rate for random forest regression is:  0.142128861864


In [32]:
# Random forest: predictions and false-negative / false-positive shares on
# the female test subset ...
rf_f_pred = CV_rfc.predict(female_test_x)
fn_rf_female = fn(y_test_female, rf_f_pred)
fp_rf_female = fp(y_test_female, rf_f_pred)

# ... and on the male test subset.
rf_m_pred = CV_rfc.predict(male_test_x)
fn_rf_male = fn(y_test_male, rf_m_pred)
fp_rf_male = fp(y_test_male, rf_m_pred)

In [33]:
# Each line prints fn then fp; first line is female, second line is male.
print(fn_rf_female,fp_rf_female)
print(fn_rf_male,fp_rf_male)

0.0460684551341 0.0212765957447
0.110794409709 0.0684994483266


### Linear SVC

In [21]:
import time

from sklearn.svm import LinearSVC

# Scale features, then fit a linear support-vector classifier (primal form).
pipe1 = make_pipeline(MaxAbsScaler(), LinearSVC(dual=False))

# Wall-clock timing of the hyper-parameter search.
start_time = time.time()

# Candidates for the regularisation parameter: 6 log-spaced values, 1e-2 .. 1e1.
param_grid = {"linearsvc__C": np.logspace(-2, 1, 6)}

# cv is given explicitly: GridSearchCV's default changed from 3 folds to 5 in
# scikit-learn 0.22, so leaving it out makes the result depend on the installed
# version. 3 matches the old default and the random-forest search above.
grid2 = GridSearchCV(pipe1, param_grid=param_grid, cv=3)
grid2.fit(x_train_dum, y_train)
print("The model ran for %s seconds ---" % (time.time() - start_time))
print("The Best parameters are: ", grid2.best_params_)

The model ran for 3.388603925704956 seconds ---
The Best parameters are:  {'linearsvc__C': 2.5118864315095797}


In [22]:
# Predictions from the fitted LinearSVC search, on the held-out test data and
# on the training data.
pred_linearsvc = grid2.predict(x_test_dum)
train_linearsvc = grid2.predict(x_train_dum)

In [23]:
# acc() returns 1 - (share of matching labels), so these are error rates.
# NOTE(review): the messages say "regression", but the model fitted above is a
# LinearSVC classifier; wording left unchanged to match recorded output.
print("The train error rate for SVC regression is: ",acc(y_train,train_linearsvc))
print("The test error rate for SVC regression is: ",acc(y_test,pred_linearsvc))

The train error rate for SVC regression is:  0.146432849114
The test error rate for SVC regression is:  0.147103986242


In [24]:
# LinearSVC: predictions and false-negative / false-positive shares on the
# female test subset ...
svc_f_pred = grid2.predict(female_test_x)
fn_svc_female = fn(y_test_female, svc_f_pred)
fp_svc_female = fp(y_test_female, svc_f_pred)

# ... and on the male test subset.
svc_m_pred = grid2.predict(male_test_x)
fn_svc_male = fn(y_test_male, svc_m_pred)
fp_svc_male = fp(y_test_male, svc_m_pred)

In [26]:
# Each line prints fn then fp; first line is female, second line is male.
print(fn_svc_female,fp_svc_female)
print(fn_svc_male,fp_svc_male)

0.0527289546716 0.0175763182239
0.119253401986 0.0660169179846
