In [980]:
import pandas as pd
from numpy.random import RandomState
from data_loader import *

In [985]:
# so that same output is produced

def load_train_data(train_file_path, valid_rate=0.1, is_df=True):
    data_frame = pd.read_csv(train_file_path, header=None).sample(frac=1, random_state=11)
    np.random.seed(11)
    mask = np.random.rand(len(data_frame)) < 1 - valid_rate
    print(mask)
    train_df, valid_df = data_frame.iloc[mask, :], data_frame.iloc[~mask, :]
    if is_df:
        return train_df, valid_df

    train_labels = [1 if x == POS_STR else 0 for x in train_df.iloc[:, 14].values]
    valid_labels = [1 if x == POS_STR else 0 for x in valid_df.iloc[:, 14].values]
    return train_df.iloc[:, :14].values, np.array(train_labels), valid_df.iloc[:, :14].values, np.array(valid_labels)

rs = RandomState(130917)
numpy.random.seed()
Xtr,Xva = load_train_data("adult_data.csv")
Xte = load_test_data("adult_test.csv")

# give names to columns
Xtr.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
Xva.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
Xte.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]





[ True  True  True ...  True  True  True]


In [986]:
Xtr.drop("Education",axis=1, inplace=True,)

In [987]:
Xva.drop("Education",axis=1, inplace=True,)

In [988]:
Xte.drop("Education",axis=1, inplace=True,)

In [989]:
Xtr.drop("fnlwgt",axis=1, inplace=True,)

In [990]:
Xva.drop("fnlwgt",axis=1, inplace=True,)

In [991]:
Xte.drop("fnlwgt",axis=1, inplace=True,)

In [992]:
# Convert income to 0 and 1
Xtr["Income"] = Xtr["Income"].map({ " <=50K": 0, " >50K": 1 })
Ytr = Xtr["Income"].values
Xtr.drop("Income", axis=1, inplace=True,)

In [993]:
Xva["Income"] = Xva["Income"].map({ " <=50K": 0, " >50K": 1 })
Yva = Xva["Income"].values
Xva.drop("Income", axis=1, inplace=True,)

In [994]:
Xte["Income"] = Xte["Income"].map({ " <=50K.": 0, " >50K.": 1 })
Yte = Xte["Income"].values
Xte.drop("Income", axis=1, inplace=True,)

In [995]:
Xtr = pd.get_dummies(Xtr, columns=[
    "WorkClass", "MaritalStatus", "Occupation", "Relationship",
    "Race", "Gender", "NativeCountry",
])

In [996]:
Xva = pd.get_dummies(Xva, columns=[
    "WorkClass", "MaritalStatus", "Occupation", "Relationship",
    "Race", "Gender", "NativeCountry",
])

In [997]:
Xte = pd.get_dummies(Xte, columns=[
    "WorkClass", "MaritalStatus", "Occupation", "Relationship",
    "Race", "Gender", "NativeCountry",
])

In [998]:
Xte.insert(loc=65, column='NativeCountry_ Holand-Netherlands', value=0)

In [999]:

Xva.insert(loc=66, column='WorkClass_ Never-worked', value=0)
Xva.insert(loc=67, column='NativeCountry_ Cambodia', value=0)
Xva.insert(loc=68, column='NativeCountry_ Honduras', value=0)
Xva.insert(loc=69, column='NativeCountry_ Hungary', value=0)
Xva.insert(loc=70, column='NativeCountry_ Scotland', value=0)
Xva.insert(loc=71, column='NativeCountry_ Trinadad&Tobago', value=0)
Xva.insert(loc=65, column='NativeCountry_ Holand-Netherlands', value=0)

In [1000]:
Xtr.Age = Xtr.Age.astype(float)
Xtr.EducationNum = Xtr.EducationNum.astype(float)
Xtr.HoursPerWeek = Xtr.HoursPerWeek.astype(float)

Xva.Age = Xva.Age.astype(float)
Xva.EducationNum = Xva.EducationNum.astype(float)
Xva.HoursPerWeek = Xva.HoursPerWeek.astype(float)

Xte.Age = Xte.Age.astype(float)
Xte.EducationNum = Xte.EducationNum.astype(float)
Xte.HoursPerWeek = Xte.HoursPerWeek.astype(float)

In [1001]:
# Standardize data (0 mean, 1 stdev)
# also try without standardizing the results
from sklearn.preprocessing import StandardScaler
import pandas
import numpy

def standardization(X):
    
    Xscaler = X.filter(["Age", "EducationNum", "HoursPerWeek", "CapitalGain","CapitalLoss"],axis=1)
    array = Xscaler.values
    scaler = StandardScaler().fit(Xscaler)
    rescaledX = scaler.transform(Xscaler)
    return rescaledX

Xtr_scaled = standardization(Xtr)
Xtr.Age = Xtr_scaled[:,0]
Xtr.EducationNum = Xtr_scaled[:,1]
Xtr.HoursPerWeek = Xtr_scaled[:,2]
Xtr.CapitalGain = Xtr_scaled[:,3]
Xtr.CapitalLoss = Xtr_scaled[:,4]


Xva_scaled = standardization(Xva)
Xva.Age = Xva_scaled[:,0]
Xva.EducationNum = Xva_scaled[:,1]
Xva.HoursPerWeek = Xva_scaled[:,2]
Xva.CapitalGain = Xva_scaled[:,3]
Xva.CapitalLoss = Xva_scaled[:,4]

Xte_scaled = standardization(Xte)
Xte.Age = Xte_scaled[:,0]
Xte.EducationNum = Xte_scaled[:,1]
Xte.HoursPerWeek = Xte_scaled[:,2]
Xte.CapitalGain = Xte_scaled[:,3]
Xte.CapitalLoss = Xte_scaled[:,4]


In [685]:
# # also try without normalizing the results
# def normalization(X):
#     Xnormal = X.filter(["CapitalGain","CapitalLoss"],axis=1)
#     scaler = Normalizer().fit(Xnormal)
#     normalizedX = scaler.transform(Xnormal)
#     return normalizedX

# Xtr_normal = normalization(Xtr)
# Xtr.CapitalGain = Xtr_normal[:,0]
# Xtr.CapitalLoss = Xtr_normal[:,1]

# Xva_normal = normalization(Xva)
# Xva.CapitalGain= Xva_normal[:,0]
# Xva.CapitalLoss= Xva_normal[:,1]

# Xte_normal = normalization(Xte)
# Xte.CapitalGain= Xte_normal[:,0]
# Xte.CapitalLoss= Xte_normal[:,1]

In [567]:
print(Yte)

[0 0 1 ... 0 0 1]


In [845]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
num_trees = 100
max_features = 3
# kfold = model_selection.KFold(n_splits=5,random_state=7)
randomForest = RandomForestClassifier(n_estimators=100,random_state=42)
# cv_result = model_selection.cross_val_score(randomForest,Xtr,Ytr,cv=kfold,scoring='accuracy')
score=randomForest.fit(Xtr,Ytr)
# YvaHat=score.predict(Xva)
# acc_score=roc_auc_score(Yva, YvaHat)
# YteHat = score.predict(Xte)
# acc_score1=roc_auc_score(Yte, YteHat)
prediction = randomForest.predict(Xva)
acc_score = accuracy_score(Yva,prediction)
prediction1 = randomForest.predict(Xte)
acc_score1 = accuracy_score(Yte,prediction1)
print("For validation")
print(acc_score)
print("For test")
print(acc_score1)
# print("For test")
# print(acc_score1)

For validation
0.8043746149106593
For test
0.8469381487623611


In [791]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,auc

decisionTree = DecisionTreeClassifier(random_state=43)
criterion = ["gini","entropy"]
max_depth = [1,3,5,None]
splitter = ["best","random"]
max_leaf_nodes = [20,40,60]
grid = GridSearchCV(estimator=decisionTree,cv=3,param_grid=dict(criterion=criterion,max_depth=max_depth,splitter=splitter,max_leaf_nodes=max_leaf_nodes))
YvaHat=grid.fit(Xtr,Ytr).predict(Xva)
# kfold = model_selection.KFold(n_splits=5,random_state=7)
# cv_result = model_selection.cross_val_score(decisionTree,Xtr,Ytr,cv=kfold,scoring='accuracy')
# score=decisionTree.fit(Xtr,Ytr)
# YvaHat=score.predict(Xva)
acc_score=accuracy_score(Yva,YvaHat)
print(acc_score)
# acc_score=roc_auc_score(Yva, YvaHat)
YteHat = grid.fit(Xtr,Ytr).predict(Xte)
# acc_score1=roc_auc_score(Yte, YteHat)
false_positive_rate, true_positive_rate, thresholds = roc_curve(Yva, YvaHat)
roc_auc = auc(false_positive_rate, true_positive_rate)
false_positive_rate1, true_positive_rate1, thresholds1 = roc_curve(Yte, YteHat)
roc_auc1 = auc(false_positive_rate1, true_positive_rate1)
print("For validation")
print(roc_auc)
print("For test")
print(roc_auc1)
print(grid.best_params_)

0.8105360443622921
For validation
0.6070226324702706
For test
0.7768107732753218
{'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 60, 'splitter': 'best'}


In [926]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=3, max_depth = 6, random_state = 0)
    gb.fit(Xtr,Ytr)
    prediction = gb.predict(Xva)
    acc_score = accuracy_score(Yva,prediction)
    roc=roc_auc_score(Yva,prediction)
       
    prediction1=gb.predict(Xte)
    acc_score1=accuracy_score(Yte, YteHat)
    roc1=roc_auc_score(Yte,prediction1)
    
    print("Validation accuracy")
    print(acc_score)
    print("Test accuracy")
    print(acc_score1)
    print("Validation auc")
    print(roc)
    print("test auc")
    print(roc1)
    


Validation accuracy
0.7674060382008626
Test accuracy
0.8576868742706222
Validation auc
0.5
test auc
0.5628823078134223
Validation accuracy
0.7766481823783118
Test accuracy
0.8576868742706222
Validation auc
0.5203290787231384
test auc
0.6613268246049504
Validation accuracy
0.7957486136783734
Test accuracy
0.8576868742706222
Validation auc
0.5618502104264091
test auc
0.7407957259183009
Validation accuracy
0.7960566851509551
Test accuracy
0.8576868742706222
Validation auc
0.5643585782990953
test auc
0.7622248798275212
Validation accuracy
0.7658656808379544
Test accuracy
0.8576868742706222
Validation auc
0.7186842168229468
test auc
0.7701979780035592
Validation accuracy
0.8219346888478127
Test accuracy
0.8576868742706222
Validation auc
0.7053705924108247
test auc
0.7681686423066091


In [962]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10,leaf_size=6)

knn.fit(Xtr,Ytr)

prediction = knn.predict(Xva)
acc_score = accuracy_score(Yva,prediction)
roc=roc_auc_score(Yva,prediction)

prediction1=knn.predict(Xte)
acc_score1=accuracy_score(Yte, YteHat)
roc1=roc_auc_score(Yte,prediction1)
    
print("Validation accuracy")
print(acc_score)
print("Test accuracy")
print(acc_score1)
print("Validation auc")
print(roc)
print("test auc")
print(roc1)

Validation accuracy
0.8213185459026494
Test accuracy
0.8576868742706222
Validation auc
0.6694314100297495
test auc
0.7670792645939853


In [969]:
from sklearn.ensemble import VotingClassifier


model1 = RandomForestClassifier(random_state=1,max_depth=4,max_features=3)
model2 = LogisticRegression(random_state=5)
model3 = KNeighborsClassifier(n_neighbors=6)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('kn',model3)], voting='hard')
model.fit(Xtr,Ytr)

prediction = model.predict(Xva)
acc_score = accuracy_score(Yva,prediction)
roc=roc_auc_score(Yva,prediction)

prediction1=model.predict(Xte)
acc_score1=accuracy_score(Yte, YteHat)
roc1=roc_auc_score(Yte,prediction1)
    
print("Validation accuracy")
print(acc_score)
print("Test accuracy")
print(acc_score1)
print("Validation auc")
print(roc)
print("test auc")
print(roc1)

  if diff:


Validation accuracy
0.8179297597042514
Test accuracy
0.8576868742706222
Validation auc
0.6303011370735974
test auc
0.72711061639088


  if diff:


In [970]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(Xtr,Ytr)
model2.fit(Xtr,Ytr)
model3.fit(Xtr,Ytr)

pred1=model1.predict_proba(Xva)
pred2=model2.predict_proba(Xva)
pred3=model3.predict_proba(Xva)

finalpred=(pred1+pred2+pred3)/3
acc_score = accuracy_score(Yva,finalpred)
roc=roc_auc_score(Yva,finalpred)

pred4=model1.predict_proba(Xte)
pred5=model2.predict_proba(Xte)
pred6=model3.predict_proba(Xte)

finalpred1=(pred4+pred5+pred6)/3
acc_score = accuracy_score(Yte,finalpred1)
roc=roc_auc_score(Yte,finalpred1)
    
print("Validation accuracy")
print(acc_score)
print("Test accuracy")
print(acc_score1)
print("Validation auc")
print(roc)
print("test auc")
print(roc1)

ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

In [1003]:
# rbf kernal

from sklearn import svm

def model_eval(actual, pred):
    
    confusion = pd.crosstab(actual, pred, rownames=['Actual'], colnames=['Predicted'])
    TP = confusion.loc['>50K','>50K']
    TN = confusion.loc['<=50K','<=50K']
    FP = confusion.loc['<=50K','>50K']
    FN = confusion.loc['>50K','<=50K']

    accuracy = ((TP+TN))/(TP+FN+FP+TN)
    precision = (TP)/(TP+FP)
    recall = (TP)/(TP+FN)
    f_measure = (2*recall*precision)/(recall+precision)
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    error_rate = 1 - accuracy
    
    out = {}
    out['accuracy'] =  accuracy
    out['precision'] = precision
    out['recall'] = recall
    out['f_measure'] = f_measure
    out['sensitivity'] = sensitivity
    out['specificity'] = specificity
    out['error_rate'] = error_rate
    
    return out

svm_clf_rbf = svm.SVC(kernel = 'rbf', C = 1, tol = 1e-3)
svm_clf_rbf.fit(Xtr, Ytr)
svm_clf_rbf_pred = svm_clf_rbf.predict(Xte)
SVM_rbf = model_eval(Yte, svm_clf_rbf_pred)
print('SVM using rbf kernel : %.2f percent.' % (round(SVM_rbf['accuracy']*100,2)))

# Linear kernel
svm_clf_linear = svm.SVC(kernel = 'linear')
svm_clf_linear.fit(Xtr, Ytr)
svm_clf_linear_pred = svm_clf_linear.predict(Xte)
SVM_linear = model_eval(Yte, svm_clf_linear_pred)
print('SVM using linear kernel : %.2f percent.' % (round(SVM_linear['accuracy']*100,2)))


# Poly kernal
svm_clf_poly = svm.SVC(kernel = 'poly')
svm_clf_poly.fit(Xtr, Ytr)
svm_clf_poly_pred = svm_clf_poly.predict(Xte)
SVM_poly = model_eval(Yte, svm_clf_poly_pred)
print('SVM using poly kernel : %.2f percent.' % (round(SVM_poly['accuracy']*100,2)))


svm_clf_sigmoid = svm.SVC(kernel = 'sigmoid')
svm_clf_sigmoid.fit(Xtr, Ytr)
svm_clf_sigmoid_pred = svm_clf_sigmoid.predict(Xte)
SVM_sigmoid = model_eval(Yte, svm_clf_sigmoid_pred)
print('SVM using sigmoid kernel : %.2f percent.' % (round(SVM_sigmoid['accuracy']*100,2)))



KeyError: 'the label [>50K] is not in the [index]'