In [0]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


import pandas as pd
import seaborn as sns
sns.set()

# Data pre-processing

In [0]:
# Load Fish.csv into a DataFrame and preview its first five rows
fsh_df = pd.read_csv(filepath_or_buffer="Fish.csv")
fsh_df.head(n=5)

In [0]:
# Fix the seed so the split -- and therefore every accuracy reported further
# down -- is identical on each Restart & Run All.
RANDOM_SEED = 42

# Hold out 25% of the rows for testing. Every column except 'Species' is a
# feature; 'Species' itself is the label to predict.
train_features, test_features, train_labels, test_labels = train_test_split(
    fsh_df.drop(columns='Species'),
    fsh_df['Species'],
    test_size=0.25,
    random_state=RANDOM_SEED,
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Let's plant a tree

In [0]:
# Peek at the first five rows of the training features
train_features.head(n=5)

In [0]:
# Train a decision tree with default settings on the training split
# (fit() hands back the fitted estimator, so it can be assigned directly)
decision_tree = tree.DecisionTreeClassifier().fit(train_features, train_labels)

def find_test_accuracy(model,test_features,test_labels):
    """Print the accuracy of ``model`` on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    test_features
        Feature data handed to ``model.predict``.
    test_labels
        True labels that the predictions are compared against.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    # Label every held-out row with the model ...
    predicted_labels = model.predict(test_features)
    # ... and measure the fraction of rows it labelled correctly
    accuracy = metrics.accuracy_score(test_labels, predicted_labels)
    print("Test Accuracy:", accuracy)
    
def draw_me_a_tree(my_tree,feature_names=None,class_names=None):
    """Plot a fitted decision tree on a large matplotlib figure.

    Parameters
    ----------
    my_tree
        Fitted decision tree to draw.
    feature_names : list, optional
        Labels for the split features. When omitted, the tree's own
        ``feature_names_in_`` is used if it has one; otherwise ``None`` is
        forwarded to ``tree.plot_tree``.
    class_names : list, optional
        Labels for the predicted classes. When omitted, the tree's
        ``classes_`` are used, converted to strings.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    if feature_names is None:
        # A tree fitted on a plain array has no ``feature_names_in_``; fall
        # back to None instead of raising AttributeError.
        names_in = getattr(my_tree, 'feature_names_in_', None)
        feature_names = None if names_in is None else list(names_in)
    if class_names is None:
        # Class labels may be numeric; convert them so they are always strings.
        class_names = [str(label) for label in my_tree.classes_]
    # Use a large canvas so the node text stays legible
    plt.figure(figsize=(20, 12), dpi=100)
    tree.plot_tree(my_tree, feature_names=feature_names, class_names=class_names)
    plt.show()

# Score the full tree on the held-out split
find_test_accuracy(
    model=decision_tree,
    test_features=test_features,
    test_labels=test_labels,
)
# Then visualise it
draw_me_a_tree(my_tree=decision_tree)

## Model interpretation
- What are the final splits found by the tree?
- What is the depth of this tree?
- What are the decision paths learned by this tree?
- Why did we get this tree?
- Is this a "good" tree?

In [0]:
# How about a smaller tree? We'll enforce a maximum depth
our_max_depth = 3

short_tree = tree.DecisionTreeClassifier(max_depth=our_max_depth)
short_tree.fit(train_features, train_labels)

# How well did we do? The helper takes the held-out data explicitly;
# omitting it raises a TypeError.
find_test_accuracy(short_tree, test_features, test_labels)
# Let's draw our tree
draw_me_a_tree(short_tree)

## How do these two trees compare?

# Let's plant a whole forest

In [0]:
number_of_trees = 10

# Instantiate a forest of `number_of_trees` depth-limited decision trees,
# each trained on a bootstrap sample of the training data
rf = RandomForestClassifier(n_estimators = number_of_trees, max_depth = our_max_depth,bootstrap = True)
# Train the model on training data
rf.fit(train_features, train_labels)

# How well did we do? The helper takes the held-out data explicitly;
# omitting it raises a TypeError.
find_test_accuracy(rf, test_features, test_labels)

## Let's take a look at one of our trees

In [0]:
# Pull the first fitted tree out of the forest
random_tree = rf.estimators_[0]

# Label the plot with the forest's own feature and class names
draw_me_a_tree(
    random_tree,
    feature_names=list(rf.feature_names_in_),
    class_names=list(rf.classes_),
)

# It's Boostin' Time

In [0]:
# Boost `number_of_trees` depth-limited decision trees with AdaBoost
booster = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=our_max_depth), n_estimators=number_of_trees)
booster.fit(train_features,train_labels)

# The helper takes the held-out data explicitly; omitting it raises a TypeError.
find_test_accuracy(booster, test_features, test_labels)

Let's look at the first and last trees in the boosted ensemble:

In [0]:
# Compare the first and the last tree that the booster fitted
sub_tree_first = booster.estimators_[0]
sub_tree_last = booster.estimators_[-1]

# Label both plots with the booster's own feature and class names
booster_feature_names = list(booster.feature_names_in_)
booster_class_names = list(booster.classes_)

# NOTE(review): the boosted sub-trees appear to be fitted on plain arrays
# (hence the explicit feature names above), so score them on a NumPy array to
# avoid a feature-name mismatch warning -- confirm for your scikit-learn version.
test_features_array = test_features.to_numpy()

for heading, sub_tree in (('First subtree:', sub_tree_first), ('Last subtree:', sub_tree_last)):
    print(heading)
    # The helper takes the held-out data explicitly; omitting it raises a TypeError.
    find_test_accuracy(sub_tree, test_features_array, test_labels)
    draw_me_a_tree(sub_tree, feature_names=booster_feature_names, class_names=booster_class_names)