# Adult Classification Problem

In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import math
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## 1. Import Data

In [2]:
cols = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country', 'class']
features = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country']
data = pd.read_csv('./ADULT/adult.data', header = None, names = cols)

In [3]:
data.head(50)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## 2. Clean and Preprocess Data

In [3]:
# Types in each column
print(data.dtypes)

# Drop any nulls, shown as question marks
for i in range(len(data)):
    for col in cols:
        if data[col][i] == ' ?':
            data.drop(i, inplace = True)
            break

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object


In [4]:
data.shape

(30162, 15)

In [5]:
# Preprocess last column
def transform_income(income):
    if income == ' <=50K':
        return 0
    else:
        return 1

In [6]:
data['class'] = data['class'].apply(transform_income)

In [7]:
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [8]:
X = data[features]
y = data['class']

In [9]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [10]:
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [9]:
X = pd.get_dummies(X, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
 'sex','native-country'])

In [10]:
# Types in each column
print(X.dtypes)
print(y.dtypes)

age                                int64
fnlwgt                             int64
education-num                      int64
capital-gain                       int64
capital-loss                       int64
                                   ...  
native-country_ Thailand           uint8
native-country_ Trinadad&Tobago    uint8
native-country_ United-States      uint8
native-country_ Vietnam            uint8
native-country_ Yugoslavia         uint8
Length: 104, dtype: object
int64


## Algorithm 1: Logistic Regression

In [14]:
def draw_heatmap(training_accuracy, C_list, label):
    # training_accuracy: A NumPy array with the shape (len(C_list))
    # C_list: List of C(s).
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(training_accuracy, annot=True, fmt='.3f', 
                     xticklabels=[], yticklabels=C_list)
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$C$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.title(label + 'accuracy w.r.t $C$')
    plt.show()

In [88]:
def log_reg():
    # Logistic Regression binary classification
    C_list = [1e-4, 1e-3, 1e-2, 1e-1,1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'C': C_list}
    classifier = LogisticRegression(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_C_acc = 0
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_C_acc = clf.cv_results_['param_C'][i]
        
    best_f1 = 0
    best_C_f1 = 0
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_C_f1 = clf.cv_results_['param_C'][j]
        
    best_auc = 0
    best_C_auc = 0
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_C_auc = clf.cv_results_['param_C'][k]
    print("yes")  
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = LogisticRegression(C=best_C_acc, n_jobs = -1, max_iter = 5000)
    best_model_f1 = LogisticRegression(C=best_C_f1, n_jobs = -1, max_iter = 5000)
    best_model_auc = LogisticRegression(C=best_C_auc, n_jobs = -1, max_iter = 5000)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    
    
    
    
    
    #Extract training and validation accuracies and plot as heat maps to visualize best C parameter
    #train_acc = clf.cv_results_['mean_train_score'].reshape(-1, 1)
    #draw_heatmap(train_acc, C_list, 'Training ')
    
    #val_acc = clf.cv_results_['mean_test_score'].reshape(-1, 1)
    #draw_heatmap(val_acc, C_list, 'Validation ')
    
    # Use best parameters to define best model and fit to all training data
    #best_model = LogisticRegression(C=clf.best_params_['C'])
    #best_model.fit(X_train, y_train)
    
    # Find test accuracy and generalization error
    #for i,j in enumerate(C_list):
        #if j == clf.best_params_['C']:
            #best_train_acc = train_acc[i]
            
    #test_acc = best_model.score(X_test, y_test)
    #return test_acc, best_train_acc, clf.best_params_['C']
    
    return metrics_acc, metrics_f1, metrics_auc

In [89]:
# Run Logistic Regression + collection of data for 5 trials and store in tables for LR, D1
LRD1 = np.zeros((5, 3))
for trial in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
    metrics_acc, metrics_f1, metrics_auc = log_reg()
    LRD1[trial][0] = metrics_acc
    LRD1[trial][1] = metrics_f1
    LRD1[trial][2] = metrics_auc
LRD1 = pd.DataFrame(LRD1, columns=['accuracy', 'f1', 'roc_auc'])

yes
yes
yes
yes
yes


In [90]:
LRD1

Unnamed: 0,accuracy,f1,roc_auc
0,0.791551,0.384895,0.614438
1,0.798108,0.492507,0.662807
2,0.790557,0.394392,0.617791
3,0.790001,0.385867,0.61489
4,0.789921,0.37562,0.611049


## Algorithm 2: KNN

In [11]:
def knn():
    # K-Neighbors binary classification
    k_list = [i for i in range(1,106,4)]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'n_neighbors': k_list}
    classifier = KNeighborsClassifier(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_k_acc = 0
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_k_acc = clf.cv_results_['param_n_neighbors'][i]
        
    best_f1 = 0
    best_k_f1 = 0
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_k_f1 = clf.cv_results_['param_n_neighbors'][j]
        
    best_auc = 0
    best_k_auc = 0
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_k_auc = clf.cv_results_['param_n_neighbors'][k]
    
    print("yes") 
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = KNeighborsClassifier(n_neighbors=best_k_acc, n_jobs = -1)
    best_model_f1 = KNeighborsClassifier(n_neighbors=best_k_f1, n_jobs = -1)
    best_model_auc = KNeighborsClassifier(n_neighbors=best_k_auc, n_jobs = -1)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    return metrics_acc, metrics_f1, metrics_auc

In [12]:
# Run KNN + collection of data for 5 trials and store in tables for LR, D1
KNND1 = np.zeros((5, 3))
for trial in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
    metrics_acc, metrics_f1, metrics_auc = knn()
    KNND1[trial][0] = metrics_acc
    KNND1[trial][1] = metrics_f1
    KNND1[trial][2] = metrics_auc
KNND1 = pd.DataFrame(KNND1, columns=['accuracy', 'f1', 'roc_auc'])

yes
yes
yes
yes
yes


In [13]:
KNND1

Unnamed: 0,accuracy,f1,roc_auc
0,0.754551,0.040696,0.510336
1,0.756418,0.037683,0.509552
2,0.756697,0.034689,0.508825
3,0.756856,0.043764,0.511112
4,0.757332,0.040842,0.510374
