# 3. 🎋 Decision Trees/Random Forest:
Exported from Filament on Thu, 17 Mar 2022 19:29:26 GMT

---

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from time import time

from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [None]:
# Read in the cleaned mushroom data (expects 'cleaned_mushrooms.csv' in the
# working directory).
df = pd.read_csv('cleaned_mushrooms.csv')

### ✅  Functions:

In [None]:
# Our Accuracy, precision, recall and f1 scores custom function

def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for predictions.

    Note the argument order: predictions first, true labels second. Each
    sklearn metric is called as ``metric(y_real, y_pred)``.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )

    # Compute every metric before printing anything.
    results = [(label, scorer(y_real, y_pred)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}:{value}")

    return tuple(value for _, value in results)


# Confusion matrix function

def produce_confusion(positive_label, negative_label, cut_off, df, y_pred_name, y_real_name):
    """Plot a confusion-matrix heatmap and return the accuracy.

    Parameters
    ----------
    positive_label, negative_label : str
        Tick labels shown for class 1 and class 0 respectively.
    cut_off : float or 'binary'
        Threshold applied to ``df[y_pred_name]``: values strictly greater
        than ``cut_off`` become 1, everything else 0. Pass the string
        ``'binary'`` when the column already holds hard predictions; it is
        then used unchanged.
    df : pandas.DataFrame
        Frame holding predictions and true labels. NOTE: a ``'pred_binary'``
        column is added to (or overwritten in) this frame in place.
    y_pred_name, y_real_name : str
        Names of the prediction column and the true-label column.

    Returns
    -------
    The accuracy of ``pred_binary`` against ``df[y_real_name]``.
    It is also printed with the prefix 'Test accuracy = ', whichever data
    split the frame actually holds.
    """
    # Set pred to 0 or 1 depending on whether it is above the cut-off.
    # This is used when the prediction column holds probabilities.
    if cut_off != 'binary':
        df['pred_binary'] = np.where(df[y_pred_name] > cut_off, 1, 0)
    else:
        df['pred_binary'] = df[y_pred_name]

    y_real = df[y_real_name]
    y_pred = df['pred_binary']

    # Pin the label order to [0, 1] so the matrix is always 2x2 and lines up
    # with the [negative, positive] tick labels below, even when one class
    # is absent from the data.
    cm = confusion_matrix(y_real, y_pred, labels=[0, 1])

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    # Labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('Real labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels([negative_label, positive_label])
    ax.yaxis.set_ticklabels([negative_label, positive_label])

    # Compute the accuracy once and reuse it for both the print and the return.
    accuracy = accuracy_score(y_real, y_pred)
    print('Test accuracy = ', accuracy)

    return accuracy

## ✅  Label Encoding:

In [None]:
# Columns with exactly two distinct values are label encoded in place:
# cast to a categorical dtype, then keep only the integer category codes.

for col in df.columns:
    column = df[col]
    if column.nunique() != 2:
        continue
    df[col] = column.astype('category').cat.codes

df.dtypes # the label encoded columns now show an integer dtype

In [None]:
# Print each column name alongside the distinct values it contains.

for col, values in df.items():
    print(col, " : ", values.unique())


In [None]:
# we know edible = 4208, and poisonous = 3916
# double checking that 0 = edible, 1 = poisonous by comparing the counts

df['class'].value_counts()

## ✅ OHE:

In [None]:
# One hot encode every column except the target and the binary columns
# listed here (the ones that were label encoded earlier).

one_hot = list(
    df.columns.drop(
        [
            'class',
            'bruises',
            'gill_attachment',
            'gill_spacing',
            'gill_size',
            'stalk_shape',
        ]
    )
)
df = pd.get_dummies(df, columns=one_hot, prefix=one_hot)

In [None]:
# Convenience variables: per-class subsets (class 1 / class 0) and the list
# of feature column names (every column except the target).

feature_cols = list(df.columns.drop('class'))
Edible = df[df['class'] == 0]
Poisonous = df[df['class'] == 1]

In [None]:
# Save the one hot / label encoded frame so it can be reused to run more
# refined decision tree models later. The row index is not written out.

df.to_csv('decision_mushrooms.csv', index = False)

## ✅ Train/Test Split:

In [None]:
# 80% train / 20% test, stratified on the target, random state = 124

y = df['class']
X = df.drop('class', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=124,
    stratify=y,
)

# 🎋 Decision Trees:

In [None]:
# Baseline decision tree with an arbitrarily chosen max depth of 3.
# Fit on the training split only: fitting on the full X, y would leak the
# test rows into the model and inflate the test score reported later.

treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X_train, y_train)

In [None]:
# Plot the baseline decision tree - the results are ok, but there is
# certainly room for improvement!
# class_names must be given in ascending class order: 0 = Edible, 1 = Poisonous.

plt.figure(figsize=(30,20))
tree.plot_tree(treeclf, feature_names=feature_cols,
               class_names=['Edible', 'Poisonous'], filled=True)

In [None]:
# Score the baseline tree on the train and the test split.
# Pretty good results, especially considering the parameters were arbitrary.

train_score = treeclf.score(X_train, y_train)
test_score = treeclf.score(X_test, y_test)

print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

In [None]:
# GRIDSEARCH:
# Tune the tree's hyperparameters by 5-fold cross-validated accuracy;
# refit=True retrains the best combination on the data passed to fit.

tree_param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [2, 3, 4, 5, 6, 7],
}

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=tree_param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    verbose=1,
)

In [None]:
# Run the grid search on the training split and report how long it took.

now = time()
grid.fit(X_train, y_train)
elapsed = time() - now

print(f' Time in seconds: {elapsed}')

In [None]:
# The parameter combination that achieved the best cross-validated score
# in our grid search.

grid.best_params_

In [None]:
# The mean score across the cross-validation folds for the single best
# combination of the parameters we supplied.

grid.best_score_

In [None]:
# Let's rerun the decision tree with the parameters below, saved as a new
# variable (dt).
# NOTE(review): these values are hard-coded, presumably copied from an
# earlier grid.best_params_ output - confirm they still match after a re-run.

best_tree_params = {
    'max_depth': 7,
    'min_samples_leaf': 2,
    'min_samples_split': 5,
}
dt = DecisionTreeClassifier(**best_tree_params)

In [None]:
# Fit the tuned tree on the training split only.
dt.fit(X_train, y_train)

In [None]:
# Nice - the results seem to be better, but let's check the scores to confirm.
# class_names must be given in ascending class order: 0 = Edible, 1 = Poisonous.

plt.figure(figsize=(30,20))
tree.plot_tree(dt, feature_names=feature_cols,
               class_names=['Edible', 'Poisonous'], filled=True)

In [None]:
# Score the tuned tree on both splits.
# Wow, the scores have improved - almost too good... overfitting?

train_score = dt.score(X_train, y_train)
test_score = dt.score(X_test, y_test)

print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

In [None]:
# Build a copy of the train features with three extra columns:
#   y_pred - the tree's predicted class
#   y_real - the actual class
#   y_prob - the second column of predict_proba (with 0/1 classes, the
#            predicted probability of class 1)

train_results = X_train.assign(
    y_pred=dt.predict(X_train),
    y_real=y_train,
    y_prob=dt.predict_proba(X_train)[:, 1],
)

train_results

In [None]:
# Same again for the test data: a copy of the test features plus the
# predicted class, the actual class and the second predict_proba column.

test_results = X_test.assign(
    y_pred=dt.predict(X_test),
    y_real=y_test,
    y_prob=dt.predict_proba(X_test)[:, 1],
)

test_results

## ✅ Feature Importance:

In [None]:
# Looking at the raw feature importances of our tuned decision tree
# (one value per feature column).

dt.feature_importances_

In [None]:
# Pair each feature name with its importance so the array is easy to read.
# A higher score means the feature contributed more to the tree's splits.

importance = [
    (name, score)
    for name, score in zip(feature_cols, dt.feature_importances_)
]
importance

These are all the features with importance above zero:

gill_size, cap_surface_grooves, odor_almond, odor_anise, odor_none, stalk_root_bulbous, stalk_root_club, stalk_surface_below_ring_scaly, spore_print_color_green, population_clustered, habitat_woods

We can use these features in a refined decision tree model and see what results we get (see the 'Dec Trees- Final' workbook).

But let's carry on below for now...

In [None]:
# Confusion matrix for the train data.
# 'y_pred' already holds hard class predictions from dt.predict, so we pass
# 'binary' instead of thresholding it as if it were a probability.

produce_confusion('Poisonous', 'Edible', 'binary', train_results, 'y_pred', 'y_real')

In [None]:
# Confusion matrix for the test data.
# 'y_pred' already holds hard class predictions from dt.predict, so we pass
# 'binary' instead of thresholding it as if it were a probability.

produce_confusion('Poisonous', 'Edible', 'binary', test_results, 'y_pred', 'y_real')

# 🎋🎋 Random Forest:

Random forests are another great way to model. Let's try one below to see what results we get, although the decision tree will be pretty hard to beat with results like that!

In [None]:
# Now let's try random forest modelling; once again we start with an
# arbitrarily chosen parameter (50 trees).

rf = RandomForestClassifier(n_estimators=50)

In [None]:
# 5-fold cross-validation of the random forest on the training split.
# This yields one score per fold from the estimator's default scorer - not
# separate accuracy / precision / recall / F1 figures.

rf_score = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)
print(f'Random scored {rf_score}')

In [None]:
# Although ensembling and random forests typically improve on single
# decision trees, they still need the usual hyperparameter tuning to
# improve the model further. GridSearchCV is used for that here.

rf_params = {
    'n_estimators': list(range(1, 100, 2)),  # odd tree counts 1, 3, ..., 99
    'max_depth': [None, 1, 2, 3, 4, 5],
}

gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)

best_cv_score = gs.best_score_
print(f'Best score: {best_cv_score}')

gs.best_params_

In [None]:
# Score of the tuned random forest on the training data - the same data the
# grid search was fitted on, so this says little about generalisation.
# Once again we seem to have a perfect score!

gs.score(X_train, y_train)

In [None]:
## Let's see our metric scores, comparing the random forest's predictions on
## the train data to the actual results.
## What a surprise - we have perfect scores across the board!

predictions_rf_train = pd.DataFrame(
    {'Pred': gs.predict(X_train), 'Actual': y_train},
    index=X_train.index,
)

apr(predictions_rf_train['Pred'], predictions_rf_train['Actual'])

In [None]:

## Check the scores for the test data now - and once again we have perfect
## scores!

predictions_rf_test = pd.DataFrame(
    {'Pred': gs.predict(X_test), 'Actual': y_test},
    index=X_test.index,
)

apr(predictions_rf_test['Pred'], predictions_rf_test['Actual'])

In [None]:
# Here we see the most important features in our random forest, shown as a
# colour-graded table.
# The importances are read from the tuned forest (gs.best_estimator_), not
# from the decision tree `dt`, so they can be compared against the tree's
# importances listed earlier.

best_rf = gs.best_estimator_
feat_imp = pd.DataFrame(best_rf.feature_importances_, index=X_train.columns, columns=["feat_imp"])
feat_imp = feat_imp.sort_values("feat_imp", ascending=False)
feat_imp.style.background_gradient("Blues")
