# 4. 🎄 Dec Trees- Final
Exported from Filament on Thu, 17 Mar 2022 19:29:57 GMT

---

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from time import time

from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
# Load the mushroom dataset from a CSV file in the current working directory.
df = pd.read_csv('decision_mushrooms.csv') # reading in the cleaned and encoded mushroom data

### ✅ Functions:

In [None]:
# Our custom accuracy, precision, recall and F1 function
def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predictions come first and true labels second,
    the reverse of the scikit-learn metric functions this wrapper calls.

    Args:
        y_pred: Predicted labels.
        y_real: True labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    # Display name paired with the scikit-learn scorer that produces it.
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )

    # Compute every metric before printing anything, so a failing scorer
    # never leaves a half-printed report.
    scores = tuple(scorer(y_real, y_pred) for _, scorer in scorers)

    for (label, _), value in zip(scorers, scores):
        print(f"{label}:{value}")
    return scores


#Confusion matrix function
def produce_confusion(positive_label, negative_label, cut_off, df, y_pred_name, y_real_name):
    """Plot a confusion-matrix heatmap for binary predictions and return the accuracy.

    Args:
        positive_label: Tick label for the second row/column of the matrix.
        negative_label: Tick label for the first row/column of the matrix.
        cut_off: Threshold for turning ``df[y_pred_name]`` into 0/1 labels
            (values strictly greater than ``cut_off`` become 1). Pass the
            string ``'binary'`` when the column already holds hard labels
            and should be used unchanged.
        df: DataFrame containing the prediction and true-label columns.
            A ``'pred_binary'`` column is added to (or overwritten in) it.
        y_pred_name: Name of the column with predictions or probabilities.
        y_real_name: Name of the column with the true labels.

    Returns:
        The accuracy of ``df['pred_binary']`` against ``df[y_real_name]``.
    """
    # Derive the hard predictions that the matrix and accuracy are built from.
    if cut_off != 'binary':
        df['pred_binary'] = np.where(df[y_pred_name] > cut_off, 1, 0)
    else:
        df['pred_binary'] = df[y_pred_name]

    y_real = df[y_real_name]
    y_hat = df['pred_binary']

    # Build the confusion matrix (rows: real labels, columns: predictions).
    cm = confusion_matrix(y_real, y_hat)

    # Draw on the current figure's axes.
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('Real labels')
    ax.set_title('Confusion Matrix')

    # NOTE(review): the tick order assumes the negative class sorts before the
    # positive class in the label encoding (e.g. 0 = negative, 1 = positive)
    # - confirm against how the target column was encoded.
    tick_names = [negative_label, positive_label]
    ax.xaxis.set_ticklabels(tick_names)
    ax.yaxis.set_ticklabels(tick_names)

    # Compute the accuracy once and reuse it. The message is split-neutral
    # because this helper is called for both the train and the test frames.
    accuracy = accuracy_score(y_real, y_hat)
    print('Accuracy = ', accuracy)

    return accuracy

## ✅ Train/Test Split:

So here we have the refined feature columns to use for our final decision tree. Let's see how the results look, using just these 11 features!

In [None]:
# The 11 refined feature columns used for the final decision tree.
feature_cols = [
    'gill_size',
    'cap_surface_grooves',
    'odor_almond',
    'odor_anise',
    'odor_none',
    'stalk_root_bulbous',
    'stalk_root_club',
    'stalk_surface_below_ring_scaly',
    'spore_print_color_green',
    'population_clustered',
    'habitat_woods',
]

# Feature matrix and target vector.
X = df[feature_cols]
y = df['class']

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=124,
    stratify=y,
)

In [None]:
# Baseline tree with an arbitrary (untuned) set of hyperparameters.
# Fit on the training split only: fitting on the full X / y would let the model
# see the held-out rows, so the test score reported for it would be inflated.

treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X_train, y_train)

In [None]:
# Draw the baseline tree on a large canvas so the node text stays legible.
# NOTE(review): class_names lists 'Poisonous' first here, while the
# produce_confusion calls at the end of the notebook label the first class
# 'Edible' - confirm which order matches the encoding of df['class'].
plt.figure(figsize=(30, 20))
tree.plot_tree(
    treeclf,
    feature_names=feature_cols,
    class_names=['Poisonous', 'Edible'],
    filled=True,
)

In [None]:
# Accuracy of the baseline (untuned) tree on each split.
# Next step: see whether tuning the hyperparameters improves on these.

baseline_train_score = treeclf.score(X_train, y_train)
print(f'Score on training set: {baseline_train_score}')

baseline_test_score = treeclf.score(X_test, y_test)
print(f'Score on testing set: {baseline_test_score}')

In [None]:
# Grid search to fine-tune the tree's hyperparameters:
# 4 depths x 4 split sizes x 6 leaf sizes = 96 candidates, each scored by
# 5-fold cross-validated accuracy. refit=True retrains the best candidate on
# the full data passed to fit().
# The estimator is seeded so that repeated runs of the search select the same
# parameters (an unseeded tree can break split ties differently on each run).

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=1),
    param_grid={
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [5, 10, 15, 20],
        'min_samples_leaf': [2, 3, 4, 5, 6, 7],
    },
    cv=5,
    refit=True,
    verbose=1,
    scoring='accuracy',
)

In [None]:
# Run the grid search on the training split and report the wall-clock time.
# (Author's observation from the original run: about 10 seconds faster with
# this refined feature list.)

now = time()
grid.fit(X_train, y_train)
elapsed = time() - now

print(f' Time in seconds: {elapsed}')

In [None]:
# Hyperparameter combination that achieved the best mean cross-validated
# accuracy in the grid search fitted above.

grid.best_params_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
# Author's observation: the score still holds up with the refined features.
grid.best_score_

# ✅ Final Decision Tree:

In [None]:
# Final tree: built from the hyperparameters the grid search actually selected,
# so it cannot drift out of sync with the search results.
# (The values hand-copied in the original run were max_depth=5,
# min_samples_leaf=2, min_samples_split=5.)
# random_state is fixed so the fitted tree is reproducible between runs.

dt = DecisionTreeClassifier(**grid.best_params_, random_state=1)

In [None]:
# Fit the final tree on the training split only; the test split stays held out
# for scoring.

dt.fit(X_train, y_train)

In [None]:
# Visualise the final tree; the actual scores are checked in the next cell.
# The return value is bound to a name so the notebook does not echo the list
# of annotation objects.
# NOTE(review): class_names lists 'Poisonous' first here, while the
# produce_confusion calls at the end of the notebook label the first class
# 'Edible' - confirm which order matches the encoding of df['class'].

fig = plt.figure(figsize=(30, 20))
thing = tree.plot_tree(
    dt,
    feature_names=feature_cols,
    class_names=['Poisonous', 'Edible'],
    filled=True,
)

In [None]:
# Accuracy of the final tree, trained on the 11 refined features, on each split.

final_train_score = dt.score(X_train, y_train)
print(f'Score on training set: {final_train_score}')

final_test_score = dt.score(X_test, y_test)
print(f'Score on testing set: {final_test_score}')

In [None]:
# Training features alongside the model's outputs for each row:
#   y_pred - predicted class label
#   y_real - true class label
#   y_prob - column 1 of predict_proba, i.e. the predicted probability of the
#            second class (presumably label 1 - confirm against dt.classes_);
#            note this is NOT the probability that the prediction is correct.

train_results = X_train.assign(
    y_pred=dt.predict(X_train),
    y_real=y_train,
    y_prob=dt.predict_proba(X_train)[:, 1],
)

train_results

In [None]:
# Test features alongside the model's outputs for each row:
#   y_pred - predicted class label
#   y_real - true class label
#   y_prob - column 1 of predict_proba, i.e. the predicted probability of the
#            second class (presumably label 1 - confirm against dt.classes_);
#            note this is NOT the probability that the prediction is correct.

test_results = X_test.assign(
    y_pred=dt.predict(X_test),
    y_real=y_test,
    y_prob=dt.predict_proba(X_test)[:, 1],
)

test_results

## ✅ Confusion Matrix:

In [None]:
# Confusion Matrix for Train Data
# Author's observation from the original run: the results are pretty good, but
# beware - there are still 2 false negatives.
# 'y_pred' already holds hard class labels, so (assuming they are encoded 0/1)
# the 0.5 cut-off simply reproduces them.
# NOTE(review): this call labels the first class 'Edible', whereas the
# plot_tree calls list 'Poisonous' first - confirm which matches the encoding.

produce_confusion(
    positive_label='Poisonous',
    negative_label='Edible',
    cut_off=0.5,
    df=train_results,
    y_pred_name='y_pred',
    y_real_name='y_real',
)

In [None]:
# Confusion Matrix for Test Data
# Author's observation from the original run: the test data also shows two
# false negatives.
# 'y_pred' already holds hard class labels, so (assuming they are encoded 0/1)
# the 0.5 cut-off simply reproduces them.
# NOTE(review): this call labels the first class 'Edible', whereas the
# plot_tree calls list 'Poisonous' first - confirm which matches the encoding.

produce_confusion(
    positive_label='Poisonous',
    negative_label='Edible',
    cut_off=0.5,
    df=test_results,
    y_pred_name='y_pred',
    y_real_name='y_real',
)