In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel


In [15]:
df=pd.read_csv('adult-new.data', sep=', ', engine='python', header=None)
train=pd.get_dummies(df, columns=[1, 3, 5 ,6, 7, 8, 9, 13, 14], prefix=None)
df2=pd.read_csv('adult-new.test', sep=', ', engine='python', header=None)
test=pd.get_dummies(df2, columns=[1, 3, 5 ,6, 7, 8, 9, 13, 14], prefix=None)
test['13_Holand-Netherlands']=0


In [16]:
columns_train=list(train.columns)
columns_test=list(test.columns)
ytrain =np.asarray(train['14_>50K']).astype(int)
xtrain =np.asarray(train)[:,:-2].astype(float)
ytest =np.asarray(test['14_>50K']).astype(int)
xtest =np.asarray(test)[:,:-2].astype(float)
feature_names=list(test.columns)
female_col=feature_names.index('9_Female')
male_col=feature_names.index('9_Male')


In [17]:
def misclass_rate(x,y, ypred):
    female_idx=np.where(x[:,female_col] ==1)[0]
    male_idx=np.where(x[:,male_col] ==1)[0]
    yf=y[female_idx]
    ym=y[male_idx]
    yf_pred=ypred[female_idx]
    ym_pred=ypred[male_idx]
    FemaleTP_S=np.where(yf ==1)[0]   #True Positive Cases
    FemaleTN_S=np.where(yf ==0)[0]   #True Negative Cases
    FemaleFP_Q=np.where(yf_pred[FemaleTN_S] ==1)[0].shape[0]   #Number of False Positive
    FemaleFN_Q=np.where(yf_pred[FemaleTP_S] ==0)[0].shape[0]  #Number of False Negative  
    MaleTP_S=np.where(ym ==1)[0]
    MaleTN_S=np.where(ym ==0)[0]
    MaleFP_Q=np.where(ym_pred[MaleTN_S] ==1)[0].shape[0]
    MaleFN_Q=np.where(ym_pred[MaleTP_S] ==0)[0].shape[0]
    return {'Total Female':len(female_idx),
            'Total Female misclassified':FemaleFP_Q+FemaleFN_Q, 
            'Female FP rate':FemaleFP_Q/len(female_idx), 
            'Female FN rate':FemaleFN_Q/len(female_idx),
            'Total Male':len(male_idx),'Total Male misclassified':MaleFP_Q+MaleFN_Q, 
            'Male FP rate':MaleFP_Q/len(male_idx), 
            'Male FN rate':MaleFN_Q/len(male_idx)}
    

In [18]:
scaler=StandardScaler()
xtrain_s=scaler.fit(xtrain).transform(xtrain)
xtest_s=scaler.fit(xtest).transform(xtest)
tuned_param={'C': [1, 10, 100, 1000]}
log_reg_gs = GridSearchCV(LogisticRegression(), tuned_param, cv=10,
                       scoring='average_precision')
log_reg_gs.fit(xtrain_s, ytrain)
print("Best parameter:", log_reg_gs.best_params_)
ypredtrain=log_reg_gs.predict(xtrain_s)
ypredtest = log_reg_gs.predict(xtest_s)
trainerror=metrics.zero_one_loss(ypredtrain,ytrain)
testerror=metrics.zero_one_loss(ypredtest,ytest)
print('Logistic Regression training error:',trainerror)
print('Logistic Regression test error:', testerror)


Best parameter: {'C': 100}
Logistic Regression training error: 0.146647830226
Logistic Regression test error: 0.151219212579


In [19]:
result_table_cols=['Algorithm','Female FP rate','Female FN rate','Male FP rate','Male FN rate']
result_table=[]
m_rate=misclass_rate(xtest, ytest, ypredtest)
result_table.append(['Logistic Regression', m_rate['Female FP rate'], m_rate['Female FN rate']
                     , m_rate['Male FP rate'], m_rate['Male FN rate']])
print(result_table)

[['Logistic Regression', 0.01868640148011101, 0.05272895467160037, 0.07162559764619346, 0.11925340198602427]]


In [20]:
#SVM with Stockastic gradiend and alpha = 10/n , alpha_sgd = 1 /(C_svc * n_samples)
svm_sgd = SGDClassifier(alpha=0.01,average=False, class_weight=None, epsilon=0.01,eta0=0.1
                    , fit_intercept=True, l1_ratio=0.15,learning_rate='optimal'
                    , loss='hinge', n_iter=5, n_jobs=1,penalty='l2', power_t=0.5
                    , random_state=None, shuffle=True,verbose=0,warm_start=False)
svm_sgd.fit(xtrain_s, ytrain)
ypredtrain=svm_sgd.predict(xtrain_s)
ypredtest=svm_sgd.predict(xtest_s)
trainerror=metrics.zero_one_loss(ypredtrain,ytrain)
testerror=metrics.zero_one_loss(ypredtest,ytest)
print('SVM training error:',trainerror)
print('SVM test error:', testerror)
m_rate=misclass_rate(xtest, ytest, ypredtest)
result_table.append(['SVM', m_rate['Female FP rate'], m_rate['Female FN rate']
                     , m_rate['Male FP rate'], m_rate['Male FN rate']])

SVM training error: 0.149442584687
SVM test error: 0.14851667588


In [21]:
#Decision Tree with feature selection max_depth=#of features selected
select = SelectFromModel(DecisionTreeClassifier())
select.fit(xtrain,ytrain)
xtrain_s=select.transform(xtrain)
xtest_s=select.transform(xtest)
print('Number of features after transformation:',xtrain_s.shape[1])
f=xtrain_s.shape[1]
tree=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=f,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
tree.fit(xtrain_s, ytrain)
ypredtrain=tree.predict(xtrain_s)
ypredtest=tree.predict(xtest_s)
trainerror=metrics.zero_one_loss(ypredtrain,ytrain)
testerror=metrics.zero_one_loss(ypredtest,ytest)
print('Decision Tree train error:',trainerror)
print('Decision Treetest error:', testerror)


Number of features after transformation: 8
Decision Tree train error: 0.138447836369
Decision Treetest error: 0.142313125729


In [22]:
m_rate=misclass_rate(xtest, ytest, ypredtest);print(m_rate)
result_table.append(['Decision Tree', m_rate['Female FP rate'], m_rate['Female FN rate']
                     , m_rate['Male FP rate'], m_rate['Male FN rate']])


{'Total Female': 5405, 'Total Female misclassified': 374, 'Female FP rate': 0.012210915818686401, 'Female FN rate': 0.05698427382053654, 'Total Male': 10876, 'Total Male misclassified': 1943, 'Male FP rate': 0.06059212945936006, 'Male FN rate': 0.11805810959911732}


In [23]:
result=pd.DataFrame(result_table)

In [24]:
result.columns=['Classifier','Female FP','Female FN','Male FP','Male FN']

In [25]:
result


Unnamed: 0,Classifier,Female FP,Female FN,Male FP,Male FN
0,Logistic Regression,0.018686,0.052729,0.071626,0.119253
1,SVM,0.014616,0.057354,0.061052,0.125506
2,Decision Tree,0.012211,0.056984,0.060592,0.118058
