- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [3]:
golden = pd.read_csv('../data/adult.test', index_col=False)

In [4]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
from sklearn import preprocessing

In [8]:
enc = preprocessing.OrdinalEncoder()

In [9]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [10]:
pd.get_dummies(df[transform_columns]).head()

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [11]:
x = df.copy()

x = pd.concat([x.drop(non_num_columns, axis=1), 
               pd.get_dummies(df[transform_columns])], axis=1,)

x["salary"] = enc.fit_transform(df[["salary"]])

In [12]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,sex_ Female,sex_ Male
0,39,77516,13,2174,0,40,0.0,0,1
1,50,83311,13,0,0,13,0.0,0,1
2,38,215646,9,0,0,40,0.0,0,1
3,53,234721,7,0,0,40,0.0,0,1
4,28,338409,13,0,0,40,0.0,1,0


In [13]:
xt = golden.copy()

xt = pd.concat([xt.drop(non_num_columns, axis=1), 
               pd.get_dummies(golden[transform_columns])], axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [14]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [15]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [17]:
model = RandomForestClassifier(criterion='entropy')

In [18]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [19]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
model.tree_.node_count

8331

In [21]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3228253894694752),
 ('education-num', 0.16098142933278192),
 ('capital-gain', 0.22737275251771055),
 ('capital-loss', 0.07826321855438988),
 ('hours-per-week', 0.15480842658955965),
 ('sex_ Female', 0.03415121571249182),
 ('sex_ Male', 0.021597567823591006)]

In [22]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3228253894694752),
 ('education-num', 0.16098142933278192),
 ('capital-gain', 0.22737275251771055),
 ('capital-loss', 0.07826321855438988),
 ('hours-per-week', 0.15480842658955965),
 ('sex_ Female', 0.03415121571249182),
 ('sex_ Male', 0.021597567823591006)]

In [23]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male
0,39,13,2174,0,40,0,1
1,50,13,0,0,13,0,1
2,38,9,0,0,40,0,1
3,53,7,0,0,40,0,1
4,28,13,0,0,40,1,0


In [24]:
set(x.columns) - set(xt.columns)

set()

In [25]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'sex_ Female',
 'sex_ Male']

In [26]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [27]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [28]:
accuracy_score(xt.salary, predictions)

0.8207112585222038

In [29]:
accuracy_score(xt.salary, predictions)

0.8207112585222038

In [30]:
confusion_matrix(xt.salary, predictions)

array([[11457,   978],
       [ 1941,  1905]], dtype=int64)

In [31]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [32]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [33]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [34]:
confusion_matrix(x.salary, predictionsx)

array([[24097,   623],
       [ 2777,  5064]], dtype=int64)

In [35]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



In [36]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



# For the following use the above `adult` dataset. Start with only numerical features/columns.  

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and `precision`, `recall`, `f1` on golden-test set.

In [127]:
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 
                     'native-country']

df_num = df.drop(non_num_columns, axis=1)
df_num.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,Male,2174,0,40,<=50K
1,50,83311,13,Male,0,0,13,<=50K
2,38,215646,9,Male,0,0,40,<=50K
3,53,234721,7,Male,0,0,40,<=50K
4,28,338409,13,Female,0,0,40,<=50K


In [128]:
df_num.dtypes

age                int64
fnlwgt             int64
education-num      int64
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
salary            object
dtype: object

In [129]:
df_num["salary"] = enc.fit_transform(df_num[["salary"]])
df_num["sex"] = enc.fit_transform(df_num[["sex"]])

df_num.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,1.0,2174,0,40,0.0
1,50,83311,13,1.0,0,0,13,0.0
2,38,215646,9,1.0,0,0,40,0.0
3,53,234721,7,1.0,0,0,40,0.0
4,28,338409,13,0.0,0,0,40,0.0


In [130]:
df_num.dtypes

age                 int64
fnlwgt              int64
education-num       int64
sex               float64
capital-gain        int64
capital-loss        int64
hours-per-week      int64
salary            float64
dtype: object

In [131]:
df_test = golden.drop(non_num_columns, axis=1)
df_test["salary"] = enc.fit_transform(golden[["salary"]])
df_test["sex"] = enc.fit_transform(golden[["sex"]])

df_test.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary
0,25,226802,7,1.0,0,0,40,0.0
1,38,89814,9,1.0,0,0,50,0.0
2,28,336951,12,1.0,0,0,40,1.0
3,44,160323,10,1.0,7688,0,40,1.0
4,18,103497,10,0.0,0,0,30,0.0


In [42]:
model_rf = RandomForestClassifier(criterion='entropy')
model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)

pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

accuracy_score(df_test.salary, pred_rf)

0.8289417111971009

In [43]:
print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.86      0.93      0.89     12435
         1.0       0.68      0.51      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



In [44]:
model_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4)
model_dt.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)

pred_dt = model_dt.predict(df_test.drop(['fnlwgt','salary'], axis=1))

accuracy_score(df_test.salary, pred_dt)

0.8192985688839752

In [45]:
print(classification_report(df_test.salary, pred_dt))

              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281



# 2. For RandomForest or DecisionTree and using the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Show [`precision`, `recall`, `f1`] for each additional feature added.

In [46]:
list(zip(df.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3228253894694752),
 ('workclass', 0.16098142933278192),
 ('education', 0.22737275251771055),
 ('education-num', 0.07826321855438988),
 ('marital-status', 0.15480842658955965),
 ('occupation', 0.03415121571249182),
 ('relationship', 0.021597567823591006)]

In [47]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [48]:
# add workclass
non_num_columns = ['education', 'marital-status', 'occupation', 
                   'relationship', 'race', 'native-country']
transform = ['salary', 'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.67      0.52      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



In [49]:
# add race
non_num_columns = ['education', 'marital-status', 'occupation', 
                   'relationship', 'native-country']
transform = ['race', 'salary', 'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.68      0.53      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



In [50]:
# add morital-status
non_num_columns = ['education', 'occupation', 
                   'relationship', 'native-country']
transform = ['marital-status', 'race', 'salary', 'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     12435
         1.0       0.70      0.60      0.65      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.76      0.77     16281
weighted avg       0.84      0.84      0.84     16281



In [51]:
# add occupation
non_num_columns = ['education', 'relationship', 'native-country']
transform = ['occupation', 'marital-status', 'race', 'salary', 
             'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     12435
         1.0       0.70      0.61      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.79      0.77      0.78     16281
weighted avg       0.84      0.85      0.84     16281



In [52]:
# add relationship
non_num_columns = ['education', 'native-country']
transform = ['relationship', 'occupation', 'marital-status', 
             'race', 'salary', 'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90     12435
         1.0       0.71      0.62      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.77      0.78     16281
weighted avg       0.85      0.85      0.85     16281



In [53]:
# add education
non_num_columns = ['native-country']
transform = ['education', 'relationship', 'occupation', 'marital-status', 
             'race', 'salary', 'sex', 'workclass']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = golden.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90     12435
         1.0       0.71      0.61      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.77      0.78     16281
weighted avg       0.84      0.85      0.85     16281



In [103]:
# add native-country
non_num_columns = []
transform = ['education', 'relationship', 'occupation', 'marital-status', 
             'race', 'salary', 'sex', 'workclass', 'native-country']

df_num = df.drop(non_num_columns, axis=1)

for feat in transform:
    df_num[feat] = enc.fit_transform(df_num[[feat]])

df_test = df.drop(non_num_columns, axis=1)
for feat in transform:
    df_test[feat] = enc.fit_transform(df_test[[feat]])

model_rf.fit(df_num.drop(['fnlwgt','salary'], axis=1), df_num.salary)
pred_rf = model_rf.predict(df_test.drop(['fnlwgt','salary'], axis=1))

print(classification_report(df_test.salary, pred_rf))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98     24720
         1.0       0.96      0.94      0.95      7841

    accuracy                           0.98     32561
   macro avg       0.97      0.96      0.97     32561
weighted avg       0.98      0.98      0.98     32561



# 3. Optional: Using gridSearch find the most optimal parameters for your model
Warning: this can be computationally intensive and may take some time.
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://scikit-learn.org/stable/modules/grid_search.html

In [84]:
from sklearn.model_selection import GridSearchCV

In [56]:
model_rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [57]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

In [132]:
df_target = df_test['salary']
df_test = df_test.drop(['salary'], axis = 1)

In [133]:
df_train = df_num.drop(['salary'], axis = 1)
df_train_target = df_num['salary']

In [122]:
parameters = param_grid
rf = model_rf
clf = GridSearchCV(rf, parameters)
clf.fit(df_test, df_target)
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_bootstrap',
 'param_max_depth',
 'param_max_features',
 'param_min_samples_leaf',
 'param_min_samples_split',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [126]:
print(grid)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,