In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neural_network import MLPClassifier

In [2]:
# This is where we'll put constants
TRAIN_DATA_PATH     = "./data/adultTrain.data"
DATA_HEADERS        = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
    "occupation", "relationship", "race", "sec", "capital-gain", "capital-loss", 
    "hours-per-week", "native-country", "y"
]
CLEAN_EDUCATION_MAP = {
    "HS-grad":      "High-school",
    "Some-college": "Higher-education",
    "Bachelors":    "Undergraduate",
    "Masters":      "Graduate",
    "Assoc-voc":    "Higher-education",
    "11th":         "Grade-school",
    "Assoc-acdm":   "Higher-education",
    "10th":         "Grade-school",
    "7th-8th":      "Grade-school",
    "Prof-school":  "Higher-education",
    "9th":          "Grade-school",              
    "12th":         "Grade-school",
    "Doctorate":    "Graduate",
    "5th-6th":      "Grade-school",
    "1st-4th":      "Grade-school",
    "Preschool":    "Grade-school"
}
CLEAN_WORKCLASS_MAP = {
    "Private":          "Private",
    "Self-emp-not-inc": "Self-employed",
    "Self-emp-inc":     "Self-employed",
    "Local-gov":        "Government",
    "State-gov":        "Government",
    "Federal-gov":      "Government",
    "Without-pay":      "Without-pay"
}

In [3]:
def read_data(path):
    dataset = pd.read_csv(path)
    dataset.columns = DATA_HEADERS
    return dataset

In [4]:
def clean_marital_status(value):
    lowerValue = value.lower()
    
    if lowerValue == "never-married":
        return "Never-married"
    elif "married" in lowerValue:
        return "Married"
    else:
        return "Previously-married"
    
def clean_native_country(value):
    lowerValue = value.lower()
    
    if lowerValue == "united-states":
        return "United-States"
    else:
        return "Other"

def clean_data(data):
    # Strip whitespaces from all string values
    # and replace "?" with None,
    # and drop all na rows
    data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x) \
               .replace(['?'], [None]) \
               .dropna()
    # Clean 'marital-status'
    data['marital-status'] = data['marital-status'].map(clean_marital_status)
    # Clean 'native-country'
    data = data[data['native-country'] != "?"]
    data['native-country'] = data['native-country'].map(clean_native_country)
    # Clean 'education'
    data['education'] = data['education'].map(CLEAN_EDUCATION_MAP)
    # Clean 'workclass'
    data['workclass'] = data['workclass'].map(CLEAN_WORKCLASS_MAP)
    # Drop unecessary columns
    # - education-num - this looks like an identifier for the original education value (not needed!)
    data.drop(['fnlwgt', 'capital-gain', 'capital-loss', 'education-num'], axis=1, inplace=True)
    return data

In [None]:
def train_model (model, train_X, train_y):
    model.fit (train_X, train_y)    
    return model
    
def test_model (model, test_X, test_y):
    predicted= model.predict(test_X)
    cnf_matrix = confusion_matrix(test_y, predicted)
    print(cnf_matrix)
    score = model.score(test_X, test_y)
    print(score)

In [5]:
dataset = read_data(TRAIN_DATA_PATH)
dataset = clean_data(dataset)
dataset.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sec,hours-per-week,native-country,y
0,50,Self-employed,Undergraduate,Married,Exec-managerial,Husband,White,Male,13,United-States,<=50K
1,38,Private,High-school,Previously-married,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
2,53,Private,Grade-school,Married,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
3,28,Private,Undergraduate,Married,Prof-specialty,Wife,Black,Female,40,Other,<=50K
4,37,Private,Graduate,Married,Exec-managerial,Wife,White,Female,40,United-States,<=50K


In [6]:
dataset['relationship'].value_counts()

Husband           12463
Not-in-family      7725
Own-child          4466
Unmarried          3212
Wife               1406
Other-relative      889
Name: relationship, dtype: int64

In [7]:
dataset2 = pd.get_dummies(dataset, columns =['workclass', 'education', 'marital-status', 'occupation', 'relationship', 
                                             'race', 'sec','native-country'])

In [8]:
dataset2.columns

Index(['age', 'hours-per-week', 'y', 'workclass_Government',
       'workclass_Private', 'workclass_Self-employed', 'workclass_Without-pay',
       'education_Grade-school', 'education_Graduate', 'education_High-school',
       'education_Higher-education', 'education_Undergraduate',
       'marital-status_Married', 'marital-status_Never-married',
       'marital-status_Previously-married', 'occupation_Adm-clerical',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Prof-specialty', 'occupation_Protective-serv',
       'occupation_Sales', 'occupation_Tech-support',
       'occupation_Transport-moving', 'relationship_Husband',
       'relationship_Not-in-family', 'relationship_Other-relative',
       'relationship_Own-child', 'relationship_Unmarried', 'relat

In [9]:
X = dataset2.loc[:, dataset2.columns != "y"]
y = dataset2.loc[:, "y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=0)

In [10]:
X_train.shape

(22620, 43)

In [11]:
y_train.shape

(22620, 1)

In [None]:
# SVM Model
svc_linear = svm.SVC(kernel='linear', C=10)
svc_linear.fit(X_train, y_train)

In [None]:
predicted= svc_linear.predict(X_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)
svc_linear.score(X_test,y_test

In [None]:
# NN 
mlp = MLPClassifier(hidden_layer_sizes=(23,23,23))

In [None]:
mlp.fit(X_train,y_train)

In [None]:
predictions = mlp.predict(X_test)

In [None]:
print(confusion_matrix(y_test,predictions))
mlp.score(X_test,y_test)

In [None]:
#Using KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5)

In [None]:
KNN.fit(X_train,y_train)

In [None]:
predictions = KNN.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
KNN.score(X_test,y_test)

In [None]:
# Using SVM

In [None]:
from sklearn import svm

svc_linear = svm.SVC(kernel='linear', C=10)
svc_linear.fit(X_train, y_train)

In [None]:
predicted= svc_linear.predict(X_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)
svc_linear.score(X_test,y_test)