# Income prediction

#### Goal: Classify income as greater or lesser than 50K using random forest. 

In [1]:
%matplotlib inline

# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Load and preprocess data

In [2]:
raw_data = pd.read_csv('adult.csv')
print raw_data.shape
raw_data.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
# Remove observations with missing values
filtered_data = raw_data[~raw_data.eq('?').any(1)]
filtered_data = filtered_data.drop(['fnlwgt','education.num'],1)
print filtered_data.shape
print filtered_data.income.value_counts()
filtered_data.head()

(30162, 13)
<=50K    22654
>50K      7508
Name: income, dtype: int64


Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [4]:
# Convert categorical data to numbers using one-hot encoding
filtered_data['income'] = filtered_data['income'].replace({'<=50K': 0, '>50K': 1})
encoded_data = pd.get_dummies(filtered_data)
encoded_data.head()

Unnamed: 0,age,capital.gain,capital.loss,hours.per.week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
1,82,0,4356,18,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,54,0,3900,40,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,41,0,3900,40,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5,34,0,3770,45,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6,38,0,3770,40,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Training and testing data
X = encoded_data.loc[:, encoded_data.columns != 'income']
Y = encoded_data['income']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)

### Decision Tree

In [6]:
# Decision tree
clf = DecisionTreeClassifier()
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)
print classification_report(Y_test,Y_pred)
print accuracy_score(Y_test,Y_pred)

             precision    recall  f1-score   support

          0       0.88      0.88      0.88      5643
          1       0.65      0.63      0.64      1898

avg / total       0.82      0.82      0.82      7541

0.8207134332316669


### Random Forest

In [7]:
clf = RandomForestClassifier()
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)
print classification_report(Y_test,Y_pred)
print accuracy_score(Y_test,Y_pred)

             precision    recall  f1-score   support

          0       0.87      0.92      0.90      5643
          1       0.72      0.60      0.65      1898

avg / total       0.83      0.84      0.83      7541

0.8395438270786368


In [8]:
# Tuning random forest model
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 600, num = 3)]
max_depth = [int(x) for x in np.linspace(5, 25, num = 3)]
max_depth.append(None)

param_grid = { 
    'n_estimators': n_estimators,
    'max_depth' : max_depth,
    'max_features': ['auto', 'sqrt', 'log2'],
}

grid_search = GridSearchCV(clf, param_grid=param_grid, verbose = 2, n_jobs= -1)
grid_search.fit(X_train,Y_train)
grid_search.best_params_

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  5.1min finished


{'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 350}

In [9]:
# Retrain using tuned paramters
clf = RandomForestClassifier(criterion='gini',max_depth=15,max_features='sqrt',n_estimators=350)
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)
print classification_report(Y_test,Y_pred)
print accuracy_score(Y_test,Y_pred)

             precision    recall  f1-score   support

          0       0.87      0.95      0.91      5643
          1       0.79      0.58      0.67      1898

avg / total       0.85      0.86      0.85      7541

0.8559872695928922


Tuned random forest (85.6% accuracy) outperforms un-tuned random forest(83.7%), which outperforms un-tuned decision tree (81.9%)

### Feature rankings

In [11]:
idx = np.argsort(clf.feature_importances_)[::-1]

print('Feature importance rankings:')
for f in range(X_train.shape[1]):
    print('%d. %s (%f)' % (f + 1, X_train.columns[indices[f]], clf.feature_importances_[indices[f]]))

Feature importance rankings:
1. capital.gain (0.156884)
2. marital.status_Married-civ-spouse (0.122876)
3. age (0.088665)
4. relationship_Husband (0.083567)
5. hours.per.week (0.060559)
6. capital.loss (0.044951)
7. marital.status_Never-married (0.043286)
8. education_Bachelors (0.033386)
9. occupation_Exec-managerial (0.033326)
10. occupation_Prof-specialty (0.031614)
11. relationship_Wife (0.019713)
12. education_Masters (0.019597)
13. relationship_Not-in-family (0.018735)
14. relationship_Own-child (0.017551)
15. sex_Male (0.016820)
16. sex_Female (0.015859)
17. education_HS-grad (0.015112)
18. education_Prof-school (0.013293)
19. occupation_Other-service (0.011671)
20. marital.status_Divorced (0.009831)
21. education_Doctorate (0.008107)
22. relationship_Unmarried (0.008071)
23. workclass_Self-emp-not-inc (0.008009)
24. workclass_Private (0.007692)
25. workclass_Self-emp-inc (0.006335)
26. education_Some-college (0.005698)
27. occupation_Farming-fishing (0.005528)
28. workclass_Fed