# For the following use the above `adult` dataset. Start with only numerical features/columns.  

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and `precision`, `recall`, `f1` on golden-test set.

# 2. For RandomForest or DecisionTree and using the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Show [`precision`, `recall`, `f1`] for each additional feature added.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [None]:
from sklearn import preprocessing

In [None]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [None]:
golden = pd.read_csv('../data/adult.test', index_col=False)

In [None]:
df.head()

In [None]:
golden.head()

In [None]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [None]:
enc = preprocessing.OrdinalEncoder()

In [None]:
x = df.copy()

x = pd.concat([x.drop(non_num_columns, axis=1), 
               pd.get_dummies(df[transform_columns])], axis=1,)

x["salary"] = enc.fit_transform(df[["salary"]])

In [None]:
xt = golden.copy()

xt = pd.concat([xt.drop(non_num_columns, axis=1), # dropping non-numerical columns, variable "non_num_columns" was defined in a command line above
               pd.get_dummies(golden[transform_columns])], axis=1,) # converting "transform_columns" (defined above) in a dummy variable

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [None]:
x.salary.value_counts() # Adult dataset

In [None]:
xt.salary.value_counts() # Golden dataset

In [None]:
x.head() # Glimpse dataset 'adult'

In [None]:
xt.head() # Glimpse dataset 'golden'

# RandomForest Model

In [102]:
model = RandomForestClassifier(criterion='entropy')

In [103]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [None]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

In [104]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

### Testing Model

In [105]:
accuracy_score(xt.salary, predictions)

0.8285117621767705

In [106]:
confusion_matrix(xt.salary, predictions)

array([[11509,   926],
       [ 1866,  1980]])

In [107]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.93      0.89     12435
         1.0       0.68      0.51      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



### Training model

In [108]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [109]:
confusion_matrix(x.salary, predictionsx)

array([[23868,   852],
       [ 2548,  5293]])

In [110]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.86      0.68      0.76      7841

    accuracy                           0.90     32561
   macro avg       0.88      0.82      0.85     32561
weighted avg       0.89      0.90      0.89     32561



# DecisionTree Model

In [89]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [90]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

DecisionTreeClassifier(criterion='entropy')

In [None]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

In [91]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

### Testing Model

In [92]:
accuracy_score(xt.salary, predictions)

0.8213254714083902

In [93]:
confusion_matrix(xt.salary, predictions)

array([[11473,   962],
       [ 1947,  1899]])

In [94]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



### Training model

In [96]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [98]:
confusion_matrix(x.salary, predictionsx)

array([[24097,   623],
       [ 2777,  5064]])

In [101]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



# Question 2

In [127]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [129]:
xnew = df.copy()

xnew = pd.concat([xnew.drop(non_num_columns, axis=1), 
               pd.get_dummies(df[transform_columns])], axis=1,)

xnew["salary"] = enc.fit_transform(df[["salary"]])

In [132]:
xnew.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,sex_ Female,sex_ Male,workclass
0,39,77516,13,2174,0,40,0.0,0,1,7
1,50,83311,13,0,0,13,0.0,0,1,6
2,38,215646,9,0,0,40,0.0,0,1,4
3,53,234721,7,0,0,40,0.0,0,1,4
4,28,338409,13,0,0,40,0.0,1,0,4


In [117]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [118]:
one_hot = OneHotEncoder()

In [119]:
label_encoder = LabelEncoder()

#### Inserting 'workclass' to the model

In [131]:
xnew['workclass'] = label_encoder.fit_transform(df.workclass)

In [133]:
model = RandomForestClassifier(criterion='entropy')

In [134]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [136]:

predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [137]:
accuracy_score(xnew.salary, predictionsx)

0.9197813334971284

In [138]:
confusion_matrix(xnew.salary, predictionsx)

array([[24035,   685],
       [ 1927,  5914]])

In [139]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95     24720
         1.0       0.90      0.75      0.82      7841

    accuracy                           0.92     32561
   macro avg       0.91      0.86      0.88     32561
weighted avg       0.92      0.92      0.92     32561



#### Inserting 'education' to the model

In [None]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [148]:
xnew['education'] = label_encoder.fit_transform(df.education) 

In [149]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [150]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [151]:
accuracy_score(xnew.salary, predictionsx)

0.9197199103221646

In [152]:
confusion_matrix(xnew.salary, predictionsx)

array([[24035,   685],
       [ 1929,  5912]])

In [153]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95     24720
         1.0       0.90      0.75      0.82      7841

    accuracy                           0.92     32561
   macro avg       0.91      0.86      0.88     32561
weighted avg       0.92      0.92      0.92     32561



#### Inserting 'marital-status' to the model

In [155]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [157]:
xnew['marital-status'] = label_encoder.fit_transform(df['marital-status']) 

In [158]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [160]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [161]:
accuracy_score(xnew.salary, predictionsx)

0.9403580971100396

In [162]:
confusion_matrix(xnew.salary, predictionsx)

array([[24039,   681],
       [ 1261,  6580]])

In [163]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96     24720
         1.0       0.91      0.84      0.87      7841

    accuracy                           0.94     32561
   macro avg       0.93      0.91      0.92     32561
weighted avg       0.94      0.94      0.94     32561



#### Inserting 'occupation' to the model

In [None]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [164]:
xnew['occupation'] = label_encoder.fit_transform(df['occupation']) # Inserting 'education' to the model

In [165]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [167]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [168]:
accuracy_score(xnew.salary, predictionsx)

0.9694419704554529

In [169]:
confusion_matrix(xnew.salary, predictionsx)

array([[24351,   369],
       [  626,  7215]])

In [170]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     24720
         1.0       0.95      0.92      0.94      7841

    accuracy                           0.97     32561
   macro avg       0.96      0.95      0.96     32561
weighted avg       0.97      0.97      0.97     32561



#### Inserting 'relationship' to the model

In [None]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [171]:
xnew['relationship'] = label_encoder.fit_transform(df['relationship']) # Inserting 'relationship' to the model

In [172]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [173]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [174]:
accuracy_score(xnew.salary, predictionsx)

0.9704554528423575

In [175]:
confusion_matrix(xnew.salary, predictionsx)

array([[24354,   366],
       [  596,  7245]])

In [176]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98     24720
         1.0       0.95      0.92      0.94      7841

    accuracy                           0.97     32561
   macro avg       0.96      0.95      0.96     32561
weighted avg       0.97      0.97      0.97     32561



#### Inserting 'race' to the model

In [None]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [177]:
xnew['race'] = label_encoder.fit_transform(df['race']) # Inserting 'race' to the model

In [178]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)


RandomForestClassifier(criterion='entropy')

In [179]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))


In [180]:
accuracy_score(xnew.salary, predictionsx)


0.9744479592150118

In [181]:
confusion_matrix(xnew.salary, predictionsx)


array([[24403,   317],
       [  515,  7326]])

In [182]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98     24720
         1.0       0.96      0.93      0.95      7841

    accuracy                           0.97     32561
   macro avg       0.97      0.96      0.96     32561
weighted avg       0.97      0.97      0.97     32561



#### Inserting 'native-country' to the model

In [184]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [185]:
xnew['race'] = label_encoder.fit_transform(df['native-country'])


In [186]:
model.fit(xnew.drop(['fnlwgt','salary'], axis=1), xnew.salary)

RandomForestClassifier(criterion='entropy')

In [187]:
predictionsx = model.predict(xnew.drop(['fnlwgt','salary'], axis=1))

In [188]:
accuracy_score(xnew.salary, predictionsx)

0.973588034765517

In [189]:
confusion_matrix(xnew.salary, predictionsx)

array([[24408,   312],
       [  548,  7293]])

In [190]:
print(classification_report(xnew.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98     24720
         1.0       0.96      0.93      0.94      7841

    accuracy                           0.97     32561
   macro avg       0.97      0.96      0.96     32561
weighted avg       0.97      0.97      0.97     32561

