In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
# This is where we'll put constants
TRAIN_DATA_PATH     = "./data/adultTrain.data"
DATA_HEADERS        = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
    "occupation", "relationship", "race", "sec", "capital-gain", "capital-loss", 
    "hours-per-week", "native-country", "y"
]
CLEAN_EDUCATION_MAP = {
    "HS-grad":      "High-school",
    "Some-college": "Higher-education",
    "Bachelors":    "Undergraduate",
    "Masters":      "Graduate",
    "Assoc-voc":    "Higher-education",
    "11th":         "Grade-school",
    "Assoc-acdm":   "Higher-education",
    "10th":         "Grade-school",
    "7th-8th":      "Grade-school",
    "Prof-school":  "Higher-education",
    "9th":          "Grade-school",              
    "12th":         "Grade-school",
    "Doctorate":    "Graduate",
    "5th-6th":      "Grade-school",
    "1st-4th":      "Grade-school",
    "Preschool":    "Grade-school"
}
CLEAN_WORKCLASS_MAP = {
    "Private":          "Private",
    "Self-emp-not-inc": "Self-employed",
    "Self-emp-inc":     "Self-employed",
    "Local-gov":        "Government",
    "State-gov":        "Government",
    "Federal-gov":      "Government",
    "Without-pay":      "Without-pay"
}

In [3]:
def read_data(path):
    dataset = pd.read_csv(path)
    dataset.columns = DATA_HEADERS
    return dataset

In [18]:
def clean_marital_status(value):
    lowerValue = value.lower()
    
    if lowerValue == "never-married":
        return "Never-married"
    elif "married" in lowerValue:
        return "Married"
    else:
        return "Previously-married"
    
def clean_native_country(value):
    lowerValue = value.lower()
    
    if lowerValue == "united-states":
        return "United-States"
    else:
        return "Other"

def clean_data(data):
    # Strip whitespaces from all string values
    # and replace "?" with None,
    # and drop all na rows
    data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x) \
               .replace(['?'], [None]) \
               .dropna()
    # Clean 'marital-status'
    data['marital-status'] = data['marital-status'].map(clean_marital_status)
    # Clean 'native-country'
    data = data[data['native-country'] != "?"]
    data['native-country'] = data['native-country'].map(clean_native_country)
    # Clean 'education'
    data['education'] = data['education'].map(CLEAN_EDUCATION_MAP)
    # Clean 'workclass'
    data['workclass'] = data['workclass'].map(CLEAN_WORKCLASS_MAP)
    # Drop unecessary columns
    # - education-num - this looks like an identifier for the original education value (not needed!)
    data.drop(['fnlwgt', 'capital-gain', 'capital-loss', 'education-num'], axis=1, inplace=True)
    return data

In [19]:
dataset = read_data(TRAIN_DATA_PATH)
dataset = clean_data(dataset)
dataset.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sec,hours-per-week,native-country,y
0,50,Self-employed,Undergraduate,Married,Exec-managerial,Husband,White,Male,13,United-States,<=50K
1,38,Private,High-school,Previously-married,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
2,53,Private,Grade-school,Married,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
3,28,Private,Undergraduate,Married,Prof-specialty,Wife,Black,Female,40,Other,<=50K
4,37,Private,Graduate,Married,Exec-managerial,Wife,White,Female,40,United-States,<=50K


In [17]:
dataset['education-num'].value_counts()

9     9840
10    6678
13    5043
14    1627
11    1307
7     1048
12    1008
6      820
4      557
15     542
5      455
8      377
16     375
3      288
2      151
1       45
Name: education-num, dtype: int64

In [None]:
dataset2 = pd.get_dummies(dataset, columns =['age', 'workclass', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sec',
      'hours-per-week', 'native-country'])

In [None]:
dataset2.columns

In [None]:
X = dataset2.iloc[:,1:]
y = dataset2.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=0)

In [None]:
# Again we scale the data ... although this is probably not needed (data is binary)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train.shape

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(23,23,23))

In [None]:
mlp.fit(X_train,y_train)

In [None]:
predictions = mlp.predict(X_test)

In [None]:
print(confusion_matrix(y_test,predictions))
mlp.score(X_test,y_test)

In [None]:
#Using KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5)

In [None]:
KNN.fit(X_train,y_train)

In [None]:
predictions = KNN.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
KNN.score(X_test,y_test)

In [None]:
# Using SVM

In [None]:
from sklearn import svm

svc_linear = svm.SVC(kernel='linear', C=10)
svc_linear.fit(X_train, y_train)

In [None]:
predicted= svc_linear.predict(X_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)
svc_linear.score(X_test,y_test)