- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [2]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [3]:
golden = pd.read_csv('../data/adult.test', index_col=False)

In [4]:
golden.salary.unique()

array([' <=50K.', ' >50K.'], dtype=object)

In [5]:
golden.salary.replace(' <=50K.', ' <=50K').replace(' >50K.', ' >50K').unique()

array([' <=50K', ' >50K'], dtype=object)

In [6]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [9]:
from sklearn import preprocessing

In [10]:
# Encode categorical features as an integer array.
enc = preprocessing.OrdinalEncoder()

In [11]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [12]:
pd.get_dummies(df[transform_columns]).head()

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


- Copy dummy values back to df
- delete non numerical features

In [13]:
x = df.copy()

x = x.drop(non_num_columns, axis=1)

x["salary"] = enc.fit_transform(df[["salary"]])

In [14]:
x.salary.value_counts()

0.0    24720
1.0     7841
Name: salary, dtype: int64

In [15]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,2174,0,40,0.0
1,50,83311,13,0,0,13,0.0
2,38,215646,9,0,0,40,0.0
3,53,234721,7,0,0,40,0.0
4,28,338409,13,0,0,40,0.0


In [16]:
xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])

In [17]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [18]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [20]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [21]:
# dropping features from training dataset
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

DecisionTreeClassifier(criterion='entropy')

In [22]:
model.tree_.node_count

7469

In [23]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.327135310262162),
 ('education-num', 0.16584618417122646),
 ('capital-gain', 0.24817183124316666),
 ('capital-loss', 0.0977206590738914),
 ('hours-per-week', 0.16112601524955353)]

In [24]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.327135310262162),
 ('education-num', 0.16584618417122646),
 ('capital-gain', 0.24817183124316666),
 ('capital-loss', 0.0977206590738914),
 ('hours-per-week', 0.16112601524955353)]

In [25]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [26]:
set(x.columns) - set(xt.columns)

set()

In [27]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [28]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [29]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [30]:
accuracy_score(xt.salary, predictions)

0.8165960321847552

In [31]:
accuracy_score(xt.salary, predictions)

0.8165960321847552

In [32]:
confusion_matrix(xt.salary, predictions)

array([[11517,   918],
       [ 2068,  1778]], dtype=int64)

In [33]:
confusion_matrix(xt.salary, predictions)

array([[11517,   918],
       [ 2068,  1778]], dtype=int64)

In [34]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281



In [35]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281



In [36]:
accuracy_score(x.salary, predictionsx)

0.8841558920180584

In [37]:
confusion_matrix(x.salary, predictionsx)

array([[24136,   584],
       [ 3188,  4653]], dtype=int64)

In [38]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     24720
         1.0       0.89      0.59      0.71      7841

    accuracy                           0.88     32561
   macro avg       0.89      0.78      0.82     32561
weighted avg       0.88      0.88      0.88     32561



In [39]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     24720
         1.0       0.89      0.59      0.71      7841

    accuracy                           0.88     32561
   macro avg       0.89      0.78      0.82     32561
weighted avg       0.88      0.88      0.88     32561



# For the following use the above `adult` dataset. Start with only numerical features/columns.  

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and `precision`, `recall`, `f1` on golden-test set.

In [40]:
# see 12:40 of lecture
model = RandomForestClassifier(criterion='entropy')

In [41]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [42]:
# Will only work for one tree.  this will not work for RandomForest because there are many trees (n_estimator=100).
# model.tree_.node_count

In [43]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3452224111624206),
 ('education-num', 0.17477583047390988),
 ('capital-gain', 0.2076936675965432),
 ('capital-loss', 0.08337997448932363),
 ('hours-per-week', 0.18892811627780273)]

In [44]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [45]:
accuracy_score(xt.salary, predictions)

0.8228610036238561

In [46]:
confusion_matrix(xt.salary, predictions)

array([[11592,   843],
       [ 2041,  1805]], dtype=int64)

In [47]:
Rpt_RndFrst_00 = classification_report(xt.salary, predictions)
print(Rpt_RndFrst_00)

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.68      0.47      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281



# 2. For RandomForest or DecisionTree and using the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Show [`precision`, `recall`, `f1`] for each additional feature added.

### Add 'workclass' column

In [48]:
non_num_columns = ['education', 'marital-status', 'sex', 'occupation', 'relationship', 'race', 'native-country']

In [49]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])

In [50]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [51]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [52]:
Rpt_RndFrst_01 = (classification_report(xt.salary, predictions))

### Add 'marital-status' column

In [53]:
non_num_columns = ['education', 'sex', 'occupation', 'relationship', 'race', 'native-country']

In [54]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])

In [55]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [56]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [57]:
Rpt_RndFrst_02 = classification_report(xt.salary, predictions)

### Add 'sex' column

In [58]:
non_num_columns = ['education', 'occupation', 'relationship', 'race', 'native-country']

In [59]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])
x["sex"] = enc.fit_transform(df[["sex"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])
xt["sex"] = enc.fit_transform(golden[["sex"]])

In [60]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [61]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [62]:
Rpt_RndFrst_03 = classification_report(xt.salary, predictions)

### Add 'occupation' column

In [63]:
non_num_columns = ['education', 'relationship', 'race', 'native-country']

In [64]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])
x["sex"] = enc.fit_transform(df[["sex"]])
x["occupation"] = enc.fit_transform(df[["occupation"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])
xt["sex"] = enc.fit_transform(golden[["sex"]])
xt["occupation"] = enc.fit_transform(golden[["occupation"]])

In [65]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [66]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [67]:
Rpt_RndFrst_04 = classification_report(xt.salary, predictions)

### Add 'relationship' column

In [68]:
non_num_columns = ['education', 'race', 'native-country']

In [69]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])
x["sex"] = enc.fit_transform(df[["sex"]])
x["occupation"] = enc.fit_transform(df[["occupation"]])
x["relationship"] = enc.fit_transform(df[["relationship"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])
xt["sex"] = enc.fit_transform(golden[["sex"]])
xt["occupation"] = enc.fit_transform(golden[["occupation"]])
xt["relationship"] = enc.fit_transform(golden[["relationship"]])

In [70]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [71]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [72]:
Rpt_RndFrst_05 = classification_report(xt.salary, predictions)

### Add 'race' column

In [73]:
non_num_columns = ['education', 'native-country']

In [74]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])
x["sex"] = enc.fit_transform(df[["sex"]])
x["occupation"] = enc.fit_transform(df[["occupation"]])
x["relationship"] = enc.fit_transform(df[["relationship"]])
x["race"] = enc.fit_transform(df[["race"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])
xt["sex"] = enc.fit_transform(golden[["sex"]])
xt["occupation"] = enc.fit_transform(golden[["occupation"]])
xt["relationship"] = enc.fit_transform(golden[["relationship"]])
xt["race"] = enc.fit_transform(golden[["race"]])

In [75]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [76]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [77]:
Rpt_RndFrst_06 = classification_report(xt.salary, predictions)

In [78]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90     12435
         1.0       0.71      0.62      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.77      0.78     16281
weighted avg       0.85      0.85      0.85     16281



### Add 'native-country' column

In [79]:
non_num_columns = ['education'] # Note that education already encoded under column 'education-num'

In [80]:
x = df.copy()
x = x.drop(non_num_columns, axis=1)
x["salary"] = enc.fit_transform(df[["salary"]])
x["workclass"] = enc.fit_transform(df[["workclass"]])
x["marital-status"] = enc.fit_transform(df[["marital-status"]])
x["sex"] = enc.fit_transform(df[["sex"]])
x["occupation"] = enc.fit_transform(df[["occupation"]])
x["relationship"] = enc.fit_transform(df[["relationship"]])
x["race"] = enc.fit_transform(df[["race"]])
x["native-country"] = enc.fit_transform(df[["native-country"]])

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)
xt["salary"] = enc.fit_transform(golden[["salary"]])
xt["workclass"] = enc.fit_transform(golden[["workclass"]])
xt["marital-status"] = enc.fit_transform(golden[["marital-status"]])
xt["sex"] = enc.fit_transform(golden[["sex"]])
xt["occupation"] = enc.fit_transform(golden[["occupation"]])
xt["relationship"] = enc.fit_transform(golden[["relationship"]])
xt["race"] = enc.fit_transform(golden[["race"]])
xt["native-country"] = enc.fit_transform(golden[["native-country"]])

In [81]:
model = RandomForestClassifier(criterion='entropy')
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [82]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [83]:
Rpt_RndFrst_07 = classification_report(xt.salary, predictions)

### Print all classifcation reports

In [84]:
print(Rpt_RndFrst_00)
print(Rpt_RndFrst_01)
print(Rpt_RndFrst_02)
print(Rpt_RndFrst_03)
print(Rpt_RndFrst_04)
print(Rpt_RndFrst_05)
print(Rpt_RndFrst_06)
print(Rpt_RndFrst_07)

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.68      0.47      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.67      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     12435
         1.0       0.70      0.60      0.65      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.76      0.77     16281
weighted avg       0.84      0.84      0.84     16281

              preci

# 3. Optional: Using gridSearch find the most optimal parameters for your model
Warning: this can be computationally intensive and may take some time.
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://scikit-learn.org/stable/modules/grid_search.html

In [85]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [86]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [87]:
clf = GridSearchCV(model, parameters)

In [89]:
clf.fit(preprocessing.scale(x.drop('salary', axis=1)), x.salary)
sorted(clf.cv_results_.keys())

ValueError: Invalid parameter C for estimator RandomForestClassifier(criterion='entropy'). Check the list of available parameters with `estimator.get_params().keys()`.