# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [138]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd
from sklearn.metrics import classification_report,confusion_matrix

filename_train = "/Users/sanmirkem/Desktop/mlnn-personal/data/adult.data"
filename_test = "/Users/sanmirkem/Desktop/mlnn-personal/data/adult.test"

In [212]:
df = pd.read_csv(filename_train, index_col=False)

In [6]:
golden = pd.read_csv(filename_test, index_col=False)

In [7]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [17]:
from sklearn import preprocessing

In [18]:
# Columns we want to transform
transform_columns = ['sex']

#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

## First let's try using `pandas.get_dummies()` to transform columns

In [43]:
dummies = pd.get_dummies(df[transform_columns])
dummies

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
32556,1,0
32557,0,1
32558,1,0
32559,0,1


In [22]:
dummies.shape

(32561, 2)

## sklearn has a similar process for OneHot Encoding features

In [44]:
onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False)
onehot.fit(df[transform_columns])



In [39]:
onehot.categories_

[array([' Female', ' Male'], dtype=object)]

In [45]:
sex = onehot.transform(df[transform_columns])
sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [18]:
sex.shape

(32561, 2)

## In addition to OneHot encoding there is Ordinal Encoding 

In [46]:
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["salary"]])
salary = enc.transform(df[["salary"]])
salary

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [47]:
enc.categories_[0]

array([' <=50K', ' >50K'], dtype=object)

In [62]:
x = df.copy()

# transformed = pd.get_dummies(df[transform_columns])


onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False).fit(df[transform_columns])

enc = preprocessing.OrdinalEncoder()

enc.fit(df[["salary"]])


transformed = onehot.transform(df[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)


x = pd.concat(
    [
        x.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)


x["salary"] = enc.transform(df[["salary"]])



In [40]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Female,Male
0,39,77516,13,2174,0,40,0.0,0.0,1.0
1,50,83311,13,0,0,13,0.0,0.0,1.0
2,38,215646,9,0,0,40,0.0,0.0,1.0
3,53,234721,7,0,0,40,0.0,0.0,1.0
4,28,338409,13,0,0,40,0.0,1.0,0.0


In [226]:
xt = golden.copy()

transformed = onehot.transform(xt[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)

xt = pd.concat(
    [
        xt.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [227]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [67]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [68]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [58]:
model = RandomForestClassifier(criterion='entropy')

In [59]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [69]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

In [70]:
model.tree_.node_count

8333

In [72]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.32221185199137453),
 ('education-num', 0.16137894960739976),
 ('capital-gain', 0.22816674068862702),
 ('capital-loss', 0.07876175207314069),
 ('hours-per-week', 0.15408960757356877),
 (' Female', 0.03406118075909206),
 (' Male', 0.021329917306797244)]

In [73]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.32221185199137453),
 ('education-num', 0.16137894960739976),
 ('capital-gain', 0.22816674068862702),
 ('capital-loss', 0.07876175207314069),
 ('hours-per-week', 0.15408960757356877),
 (' Female', 0.03406118075909206),
 (' Male', 0.021329917306797244)]

In [74]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,Female,Male
0,39,13,2174,0,40,0.0,1.0
1,50,13,0,0,13,0.0,1.0
2,38,9,0,0,40,0.0,1.0
3,53,7,0,0,40,0.0,1.0
4,28,13,0,0,40,1.0,0.0


In [75]:
set(x.columns) - set(xt.columns)

set()

In [24]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'sex_ Female',
 'sex_ Male']

In [55]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [76]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [45]:
accuracy_score(xt.salary, predictions)

0.8213868926970088

In [57]:
accuracy_score(xt.salary, predictions)

0.82808181315644

In [46]:
confusion_matrix(xt.salary, predictions)

array([[11472,   963],
       [ 1945,  1901]])

In [47]:
#print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.49      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [56]:
#print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.68      0.52      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



In [48]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [49]:
confusion_matrix(x.salary, predictionsx)

array([[24097,   623],
       [ 2777,  5064]])

In [50]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



In [58]:
#print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93     24720
         1.0       0.85      0.68      0.76      7841

    accuracy                           0.90     32561
   macro avg       0.88      0.82      0.85     32561
weighted avg       0.89      0.90      0.89     32561



# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

In [422]:
# Aside from reassigning the file path at the beginning, I am piggy backing off your dataset import #
non_numeric_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country', 'fnlwgt']
adult_train = df.copy().drop(non_numeric_columns, axis=1)
adult_train.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,salary
0,39,13,2174,0,40,<=50K
1,50,13,0,0,13,<=50K
2,38,9,0,0,40,<=50K
3,53,7,0,0,40,<=50K
4,28,13,0,0,40,<=50K


In [418]:
min(adult_train['age']),max(adult_train['age'])

(17, 90)

In [419]:
adult_test = pd.read_csv(filename_test, index_col=False)
min(adult_test['age']),max(adult_test['age'])

(17, 90)

In [423]:
x = adult_train.copy().drop('salary', axis=1 )
y = adult_train['age']

In [424]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x.drop(['age'], axis=1),x.age, test_size=.20)

In [363]:
# making a decision tree
# testing at various max depths to figure out optimal setting
clf = DecisionTreeClassifier(max_depth=5) 
clf.fit(x_train, y_train)

# making predictions
clf_pred = clf.predict(x_test) # Test the prediction accurracy of the model
accuracy_score(y_test, clf_pred)

0.046829418086903114

In [425]:
# testing at various max depths to figure out optimal setting
clf = DecisionTreeClassifier(max_depth=15) 
clf.fit(x_train, y_train)

# making predictions
clf_pred = clf.predict(x_test) # Test the prediction accurracy of the model
accuracy_score(y_test, clf_pred)

0.04836480884385076

In [426]:
# testing at various max depths to figure out optimal setting
clf = DecisionTreeClassifier(max_depth=10) 
clf.fit(x_train, y_train)

# making predictions
clf_pred = clf.predict(x_test) # Test the prediction accurracy of the model
accuracy_score(y_test, clf_pred)

0.05082143405496699

In [162]:
clf.tree_.node_count

47429

In [377]:
accuracy_score(y_test, clf_pred)
# keeping max depth at 5 because it produces relatively highest acc. score

0.04867188699524029

In [378]:
#confusion matrix
cf_matrix = confusion_matrix(y_test, clf_pred) # generate confusion matrix
cf_matrix

array([[50,  8,  0, ...,  0,  0,  0],
       [33, 18, 15, ...,  0,  0,  0],
       [ 9, 15,  7, ...,  0,  0,  0],
       ...,
       [ 0,  0,  1, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  1, ...,  0,  0,  0]])

In [379]:
#Looking at Random Tree 
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth=10)
rf_clf.fit(x_train, y_train)
y_pred_rf = rf_clf.predict(x_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)

Accuracy: 0.04913250422232458


In [427]:
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country','salary']
xt = adult_test.copy().drop(non_num_columns, axis=1)
yt = adult_test['age']
xt.head

<bound method NDFrame.head of        age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week
0       25  226802              7             0             0              40
1       38   89814              9             0             0              50
2       28  336951             12             0             0              40
3       44  160323             10          7688             0              40
4       18  103497             10             0             0              30
...    ...     ...            ...           ...           ...             ...
16276   39  215419             13             0             0              36
16277   64  321403              9             0             0              40
16278   38  374983             13             0             0              50
16279   44   83891             13          5455             0              40
16280   35  182148             13             0             0              60

[16281 rows x 6 columns]>

In [428]:
xt_train, xt_test, yt_train, yt_test = train_test_split(xt.drop(['age'], axis=1),xt.age, test_size=.20)

In [429]:
clf_2 = DecisionTreeClassifier(max_depth = 10) 
clf_2.fit(xt_test, yt_test)

# making predictions
clf_pred_2 = clf_2.predict(xt_test) # Test the prediction accurracy of the model
# print(classification_report(yt_test,clf_pred_2))

# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

In [373]:
# This was me trying to create a function... it was not successful

# ## the non-numeric columns = non_num_columns  ['workclass', 'education', 'marital-status', 
#                      #'occupation', 'relationship', 'race', 'sex', 
#                      #'native-country']

# transform_cols = ['sex','education','workclass','education','marital-status','occupation','relationship','race','sex','native-country']
# for i in range(1,len(transform_cols)+1):
#     current_cols = transform_cols[:i]
    
#     xt = adult_test[current_cols].copy()
#     yt = adult_test['age']
#     onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False)
#     onehot.fit(xt[current_cols])
    
    
#     some_collector = pd.DataFrame() # or matrix??
#     for col in current_cols:
#         # sex = onehot.transform(xt[transform_cols])
        
#         something = onehot.transform(xt[col])
        
#         # add 'something' to 'some_collector' aka append 'something' to the df / matrix / whatever
    
    
#     print(sex)
#      # xt_train, xt_test, yt_train, yt_test = train_test_split(xt.drop(['age'], axis=1),xt.age, test_size=.20)
    
# #     onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False)
# #     onehot.fit(df[current_cols])
# #     sex = onehot.transform(df[transform_cols])
# #     print(classification_report(yt_test,clf_pred_2))

In [403]:
# Adding columns one by one, starting with sex 
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race',
                     'native-country','salary']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_1 = pd.get_dummies(xt, columns=['sex'], drop_first=True)
yt = adult_test['age']


# print(xt_enc_1)

In [308]:
xt_enc_1_train, xt_enc_1_test, yt_enc_1_train, yt_enc_1_test = train_test_split(xt_enc_1.drop(['age'], axis=1),xt_enc_1.age, test_size=.20)

In [406]:
clf_enc = DecisionTreeClassifier(max_depth=5) 
clf_enc.fit(xt_enc_1_test, yt_enc_1_test)

# making predictions
clf_enc_pred = clf_enc.predict(xt_enc_1_test) # Test the prediction accurracy of the model
print(classification_report(yt_enc_1_test,clf_enc_pred))

# I am only printing the classification report featuring the first encoded variable to save pages. 
# All print lines following this block of code are commented out

              precision    recall  f1-score   support

          17       0.46      0.68      0.55        44
          18       0.16      0.42      0.23        62
          19       0.20      0.43      0.27        75
          20       0.06      0.29      0.11        86
          21       0.00      0.00      0.00        71
          22       0.12      0.11      0.12        75
          23       0.31      0.04      0.07        97
          24       0.00      0.00      0.00        75
          25       0.08      0.23      0.12        74
          26       0.50      0.03      0.05        76
          27       0.00      0.00      0.00        73
          28       0.07      0.04      0.05        82
          29       0.04      0.31      0.07       105
          30       0.08      0.23      0.12        79
          31       0.00      0.00      0.00        86
          32       0.00      0.00      0.00        74
          33       0.09      0.02      0.04        91
          34       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [383]:
confusion_matrix(yt_enc_1_test,clf_enc_pred)

array([[30,  6,  0, ...,  0,  0,  0],
       [ 6, 26, 12, ...,  0,  0,  0],
       [ 0, 10, 32, ...,  0,  0,  0],
       ...,
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [433]:
# Adding race
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship',
                     'native-country','salary']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_2 = pd.get_dummies(xt, columns=['sex','race'], drop_first=True)
yt = adult_test['age']
xt_enc_2_train, xt_enc_2_test, yt_enc_2_train, yt_enc_2_test = train_test_split(xt_enc_2.drop(['age'], axis=1),xt_enc_2.age, test_size=.20)
clf_enc_2 = DecisionTreeClassifier(max_depth=5) 
clf_enc_2.fit(xt_enc_2_test, yt_enc_2_test)

# making predictions
clf_enc_pred_2 = clf_enc_2.predict(xt_enc_2_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_2_test,clf_enc_pred_2))

In [385]:
confusion_matrix(yt_enc_2_test,clf_enc_pred_2)

array([[37,  0,  1, ...,  0,  0,  0],
       [17, 19, 16, ...,  0,  0,  0],
       [ 5,  8, 21, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [404]:
# Adding workclass
non_num_columns = ['education', 'marital-status', 
                     'occupation', 'relationship',
                     'native-country','salary']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_3 = pd.get_dummies(xt, columns=['sex','race','workclass'], drop_first=True)
yt = adult_test['age']
xt_enc_3_train, xt_enc_3_test, yt_enc_3_train, yt_enc_3_test = train_test_split(xt_enc_3.drop(['age'], axis=1),xt_enc_3.age, test_size=.20)
clf_enc_3 = DecisionTreeClassifier(max_depth=5) 
clf_enc_3.fit(xt_enc_3_test, yt_enc_3_test)

# making predictions
clf_enc_pred_3 = clf_enc_3.predict(xt_enc_3_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_3_test,clf_enc_pred_3))

In [387]:
confusion_matrix(yt_enc_3_test,clf_enc_pred_3)

array([[34,  3,  0, ...,  0,  0,  0],
       [16, 16, 10, ...,  0,  0,  0],
       [ 4,  7, 29, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  1, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [405]:
#Adding native-country
non_num_columns = ['education', 
                     'occupation', 'relationship',
                     'salary','marital-status']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_4 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country'], drop_first=True)
yt = adult_test['age']
xt_enc_4_train,xt_enc_4_test,yt_enc_4_train,yt_enc_4_test = train_test_split(xt_enc_4.drop(['age'], axis=1),xt_enc_4.age, test_size=.20)
clf_enc_4 = DecisionTreeClassifier(max_depth=5) 
clf_enc_4.fit(xt_enc_4_test, yt_enc_4_test)

# making predictions
clf_enc_pred_4 = clf_enc_4.predict(xt_enc_4_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_4_test,clf_enc_pred_4))

In [389]:
confusion_matrix(yt_enc_4_test,clf_enc_pred_4)

array([[21, 16,  0, ...,  0,  0,  0],
       [ 3, 31,  9, ...,  0,  0,  0],
       [ 1, 11, 31, ...,  0,  0,  0],
       ...,
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  1, ...,  0,  0,  0]])

In [432]:
#Adding education
non_num_columns = [ 'occupation', 'relationship',
                     'salary','marital-status']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_5 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country','education'], drop_first=True)
yt = adult_test['age']
xt_enc_5_train,xt_enc_5_test,yt_enc_5_train,yt_enc_5_test = train_test_split(xt_enc_5.drop(['age'], axis=1),xt_enc_5.age, test_size=.20)
clf_enc_5 = DecisionTreeClassifier(max_depth=5) 
clf_enc_5.fit(xt_enc_5_test, yt_enc_5_test)

# making predictions
clf_enc_pred_5 = clf_enc_5.predict(xt_enc_5_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_5_test,clf_enc_pred_5))

In [391]:
confusion_matrix(yt_enc_5_test,clf_enc_pred_5)

array([[28,  1,  1, ...,  0,  0,  0],
       [19, 12,  7, ...,  0,  0,  0],
       [ 1,  0, 14, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [431]:
#Adding occupation
non_num_columns = [ 'relationship',
                     'salary','marital-status']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_6 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country','education','occupation'], drop_first=True)
yt = adult_test['age']
xt_enc_6_train,xt_enc_6_test,yt_enc_6_train,yt_enc_6_test = train_test_split(xt_enc_6.drop(['age'], axis=1),xt_enc_6.age, test_size=.20)
clf_enc_6 = DecisionTreeClassifier(max_depth=5) 
clf_enc_6.fit(xt_enc_6_test, yt_enc_6_test)

# making predictions
clf_enc_pred_6 = clf_enc_6.predict(xt_enc_6_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_6_test,clf_enc_pred_6))

In [393]:
confusion_matrix(yt_enc_6_test,clf_enc_pred_6)

array([[39,  0,  0, ...,  0,  0,  0],
       [19,  0,  0, ...,  0,  0,  0],
       [ 3,  0,  2, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [430]:
#Adding salary
non_num_columns = [ 'relationship',
                     'marital-status']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_7 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country','education','occupation','salary'], drop_first=True)
yt = adult_test['age']
xt_enc_7_train,xt_enc_7_test,yt_enc_7_train,yt_enc_7_test = train_test_split(xt_enc_7.drop(['age'], axis=1),xt_enc_7.age, test_size=.20)
clf_enc_7 = DecisionTreeClassifier(max_depth=5) 
clf_enc_7.fit(xt_enc_7_test, yt_enc_7_test)

# making predictions
clf_enc_pred_7 = clf_enc_7.predict(xt_enc_7_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_7_test,clf_enc_pred_7))

In [395]:
confusion_matrix(yt_enc_7_test,clf_enc_pred_7)

array([[27,  3,  0, ...,  0,  0,  0],
       [ 7, 18, 13, ...,  0,  0,  0],
       [ 1,  4, 33, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [401]:
#Adding relationship
non_num_columns = ['marital-status']
xt = adult_test.copy().drop(non_num_columns, axis=1)
xt_enc_8 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country','education','occupation','salary','relationship'], drop_first=True)
yt = adult_test['age']
xt_enc_8_train,xt_enc_8_test,yt_enc_8_train,yt_enc_8_test = train_test_split(xt_enc_8.drop(['age'], axis=1),xt_enc_8.age, test_size=.20)
clf_enc_8 = DecisionTreeClassifier(max_depth=5) 
clf_enc_8.fit(xt_enc_8_test, yt_enc_8_test)

# making predictions
clf_enc_pred_8 = clf_enc_8.predict(xt_enc_8_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_8_test,clf_enc_pred_8))

In [397]:
confusion_matrix(yt_enc_8_test,clf_enc_pred_8)

array([[19, 20,  0, ...,  0,  0,  0],
       [ 3, 37,  0, ...,  0,  0,  0],
       [ 1,  4, 12, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [400]:
#Adding marital-status
xt = adult_test.copy()
xt_enc_9 = pd.get_dummies(xt, columns=['sex','race','workclass','native-country','education','occupation','salary','relationship','marital-status'], drop_first=True)
yt = adult_test['age']
xt_enc_9_train,xt_enc_9_test,yt_enc_9_train,yt_enc_9_test = train_test_split(xt_enc_9.drop(['age'], axis=1),xt_enc_9.age, test_size=.20)
clf_enc_9 = DecisionTreeClassifier(max_depth=5) 
clf_enc_9.fit(xt_enc_9_test, yt_enc_9_test)

# making predictions
clf_enc_pred_9 = clf_enc_9.predict(xt_enc_9_test) # Test the prediction accurracy of the model
# print(classification_report(yt_enc_9_test,clf_enc_pred_9))

In [399]:
confusion_matrix(yt_enc_9_test,clf_enc_pred_9)

array([[30,  2,  0, ...,  0,  0,  0],
       [13, 14,  0, ...,  0,  0,  0],
       [ 5,  6,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])