In [None]:
""" TODO
Clean data ✓
Figure out best classification model ✓
Figure out most important attributes
Figure out best visualization to display attributes that are important
Clean up notebook to be presentable
"""

In [139]:
import pandas as pd
import numpy as np
# Load the train and test splits. Both files are read without a header row
# (header=None), so the column names are supplied explicitly.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'martial-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'label',
]

train_df = pd.read_csv('adult.data', header=None, names=column_names, index_col=False)
# The first line of the test file is skipped (skiprows=1).
test_df = pd.read_csv('adult.test', skiprows=1, header=None, names=column_names, index_col=False)

In [140]:
# Data cleaning:
# 'education-num' is removed from both splits: 'education' is one-hot encoded
# later on, so this numeric encoding of the same attribute is not wanted.
train_df = train_df.drop(columns='education-num')
test_df = test_df.drop(columns='education-num')

# drop all rows with unknown values
def clear_unknown_rows(df, unknown_values=(' ?', ' Holand-Netherlands')):
    """Return a copy of ``df`` without the rows that contain unknown values.

    Every cell that is exactly equal to one of ``unknown_values`` is replaced
    with NaN, and then every row holding at least one NaN is dropped. Rows
    that already contained NaN are therefore dropped as well. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    unknown_values : iterable of str, optional
        Exact cell values to treat as missing. The defaults are the ' ?'
        placeholder and ' Holand-Netherlands'; the latter was added by the
        original author because it occurs in only one of the two splits
        (a workaround -- aligning the encoded columns would be cleaner).

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows without unknown values.
    """
    # np.nan is used instead of np.NaN: the NaN alias was removed in NumPy 2.0.
    return df.replace(list(unknown_values), np.nan).dropna()

# Run the unknown-value filter over each split.
train_df = train_df.pipe(clear_unknown_rows)
test_df = test_df.pipe(clear_unknown_rows)

In [141]:
# one hot encoding
def encode_categories(df):
    """One-hot encode the categorical columns and recode the label to 0/1.

    Each categorical column is replaced by one indicator column per distinct
    value; the indicator columns are named after the raw values (no prefix)
    and are appended to the right of the remaining columns, in the order the
    categorical columns are listed below. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the categorical columns listed below and,
        optionally, a 'label' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the categorical columns expanded and, when present,
        'label' recoded to 0 (' <=50K') / 1 (' >50K').

    Raises
    ------
    KeyError
        If one of the categorical columns is missing.
    ValueError
        If two categorical columns share a value, because the resulting
        indicator column names would collide in the join.
    """
    categorical_columns = [
        'workclass', 'education', 'martial-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country',
    ]
    # Four keys are needed because the label appears both with and without a
    # trailing period.
    label_codes = {' <=50K.': 0, ' >50K.': 1, ' <=50K': 0, ' >50K': 1}

    encoded = df
    for column in categorical_columns:
        dummies = pd.get_dummies(encoded[column])
        # drop() returns a new frame, so the caller's DataFrame keeps its
        # columns (the previous pop()-based version removed one in place).
        encoded = encoded.drop(columns=column).join(dummies)

    if 'label' in encoded.columns:
        # Recode only the label column instead of scanning the whole frame.
        # infer_objects() gives a numeric dtype on pandas versions where
        # replace() no longer downcasts an object column by itself.
        recoded = encoded['label'].replace(label_codes).infer_objects()
        encoded = encoded.assign(label=recoded)
    return encoded

train_df = encode_categories(train_df)
test_df = encode_categories(test_df)
# The two splits are encoded independently, so a category that occurs in only
# one of them would leave the frames with different columns. Align the test
# frame to the training columns (missing indicators are filled with 0, extra
# ones are dropped) so both frames expose the same columns in the same order.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [142]:
# Separate the target column from the feature columns in both splits.
train_labels_df = train_df['label']
test_labels_df = test_df['label']
train_df = train_df.drop(columns='label')
test_df = test_df.drop(columns='label')


In [143]:
#Classification 
#https://www.datacamp.com/community/tutorials/random-forests-classifier-python
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (and therefore the predictions and
# the feature importances derived from it) reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training features/labels, then predict a label for each test row.
clf.fit(train_df, train_labels_df)

y_pred = clf.predict(test_df)

In [144]:
from sklearn import metrics

# Share of test rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=test_labels_df, y_pred=y_pred))

Accuracy: 0.8468127490039841


In [145]:
# First rough look at which features the forest relies on most, sorted from
# most to least important. One-hot encoding spreads every categorical
# attribute over several columns, so per-attribute importance is harder to
# read off here -- that can be the subject of the next meeting.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=train_df.columns)
    .sort_values(ascending=False)
)
feature_imp

fnlwgt                         0.166609
age                            0.152310
capital-gain                   0.098370
hours-per-week                 0.084681
 Married-civ-spouse            0.069032
                                 ...   
 Laos                          0.000064
 Thailand                      0.000058
 Outlying-US(Guam-USVI-etc)    0.000038
 Armed-Forces                  0.000018
 Honduras                      0.000004
Length: 102, dtype: float64