# Lab 7 - Trees, Forests, and Classification
- **Author:** Emily Aiken ([emilyaiken@berkeley.edu](mailto:emilyaiken@berkeley.edu))
- **Date:** March 9, 2022
- **Course:** INFO 251: Applied machine learning

## Topics:
1. Decision Trees
2. Random Forests
3. Classification performance metrics

## Learning Goals:
At the end of this lab, you will...
- Understand the recursive algorithm to grow a decision tree
- Know the standard splitting criteria used for decision trees
- Understand how feature importances are calculated for decision trees
- Know the main hyperparameters for decision trees and random forests, and how to tune them to prevent overfitting
- Know the main performance measures for classification: accuracy, TPR, FPR, precision, and recall
- Understand ROC curves, precision and recall plots, and methods for determining the "optimal" classification threshold

## Resources:
- [Feature importances in random forests](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, r2_score

### I. Decision Tree

In [None]:
# Data: load the breast-cancer dataset and add Gaussian noise to every feature
np.random.seed(11)
data = datasets.load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# The noise scale is four times each feature's own standard deviation;
# the label column is left untouched.
feature_cols = [name for name in df.columns if name != 'target']
for name in feature_cols:
    noise = np.random.normal(0, 4*df[name].std(), len(df))
    df[name] = df[name] + noise
df.head()

In [None]:
# Report how many rows carry each class label
n_class_0 = (df['target'] == 0).sum()
n_class_1 = (df['target'] == 1).sum()
print('Observations in class 0: %i' % n_class_0)
print('Observations in class 1: %i' % n_class_1)

In [None]:
# Hold out a shuffled 25% of the rows as the test set (fixed seed)
train, test = train_test_split(df, test_size=0.25, shuffle=True, random_state=0)

# Separate the label column from the feature columns in each partition
x_train = train.drop(columns='target')
y_train = train['target']
x_test = test.drop(columns='target')
y_test = test['target']

In [None]:
# Fit model without tuning hyperparameters (depth fixed at 2)
model = DecisionTreeClassifier(max_depth=2)
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

print('Accuracy (train): %.2f' % accuracy_score(y_train, yhat_train))
print('Accuracy (test): %.2f' % accuracy_score(y_test, yhat_test))

# Precision = TP / (TP + FP). The labels and metric functions were swapped
# before, so each printed value was reported under the other metric's name.
print('Precision (train): %.2f' % precision_score(y_train, yhat_train))
print('Precision (test): %.2f' % precision_score(y_test, yhat_test))

# Recall = TP / (TP + FN)
print('Recall (train): %.2f' % recall_score(y_train, yhat_train))
print('Recall (test): %.2f' % recall_score(y_test, yhat_test))

#### A. Hyperparameter Tuning

In [None]:
# Tune hyperparameter: max_depth, via grid search with shuffled 3-fold CV
params = {'max_depth': list(range(2, 15, 2))}
cv = KFold(n_splits=3, shuffle=True, random_state=1)
cv_model = GridSearchCV(
    model,
    param_grid=params,
    cv=cv,
    scoring='accuracy',
    return_train_score=True,  # needed for the train curve plotted later
    refit=True,
)
cv_model.fit(x_train, y_train)

# One row per candidate depth, with per-fold and mean scores
cv_results = pd.DataFrame(cv_model.cv_results_)
cv_results.head()

In [None]:
# Plot mean CV accuracy (train and held-out folds) against maximum depth
sns.set(font_scale=1.5)
fig, ax = plt.subplots(1, figsize=(10, 5))

depths = cv_results['param_max_depth']
for score_column, curve_label in [('mean_train_score', 'Train'),
                                  ('mean_test_score', 'Test')]:
    ax.plot(depths, cv_results[score_column], label=curve_label)

ax.set(
    xlabel='Maximum Depth of Decision Tree',
    ylabel='Average CV Accuracy',
    title='CV Accuracy vs. Tree Depth',
)
ax.legend(loc='best')
plt.show()

In [None]:
# Get predictions on test set using best model
model = cv_model.best_estimator_
# Fit the selected estimator on the full training split before predicting
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

print('Best maximum depth: %i' % cv_model.best_params_['max_depth'])

print('Accuracy (train): %.2f' % accuracy_score(y_train, yhat_train))
print('Accuracy (test): %.2f' % accuracy_score(y_test, yhat_test))

# Precision = TP / (TP + FP). The labels and metric functions were swapped
# before, so each printed value was reported under the other metric's name.
print('Precision (train): %.2f' % precision_score(y_train, yhat_train))
print('Precision (test): %.2f' % precision_score(y_test, yhat_test))

# Recall = TP / (TP + FN)
print('Recall (train): %.2f' % recall_score(y_train, yhat_train))
print('Recall (test): %.2f' % recall_score(y_test, yhat_test))

#### B. Feature Importances

In [None]:
# Pair every feature name with its importance in the fitted tree
importance_table = pd.DataFrame([x_train.columns, model.feature_importances_]).T
importance_table.columns = ['Feature', 'Importance']

# Keep only the ten most important features, largest first
importances = importance_table.sort_values('Importance', ascending=False).head(10)

In [None]:
# Horizontal bar chart of the selected feature importances
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.barh(importances['Feature'], importances['Importance'])
ax.set(xlabel='Gini Importance', title='Feature Importances in Decision Tree')
plt.show()

### II. Random Forest

In [None]:
# Fit model without tuning hyperparameters (100 trees, fixed seed)
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

print('Accuracy (train): %.2f' % accuracy_score(y_train, yhat_train))
print('Accuracy (test): %.2f' % accuracy_score(y_test, yhat_test))

# Precision = TP / (TP + FP). The labels and metric functions were swapped
# before, so each printed value was reported under the other metric's name.
print('Precision (train): %.2f' % precision_score(y_train, yhat_train))
print('Precision (test): %.2f' % precision_score(y_test, yhat_test))

# Recall = TP / (TP + FN)
print('Recall (train): %.2f' % recall_score(y_train, yhat_train))
print('Recall (test): %.2f' % recall_score(y_test, yhat_test))

#### A. Hyperparameter Tuning

In [None]:
# Tune hyperparameters: max_depth, n_estimators
model = RandomForestClassifier(random_state=1)
cv = KFold(n_splits=3, shuffle=True, random_state=1)
params = {'max_depth':[2, 4, 6, 8], 'n_estimators':[25, 50, 100]}
cv_model = GridSearchCV(model, param_grid=params, scoring='accuracy', refit=True, return_train_score=True, cv=cv)
cv_model.fit(x_train, y_train)

# Evaluate the best configuration on the training and test splits
model = cv_model.best_estimator_
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

print('Best maximum depth: %i' % cv_model.best_params_['max_depth'])
print('Best number of estimators: %i' % cv_model.best_params_['n_estimators'])

print('Accuracy (train): %.2f' % accuracy_score(y_train, yhat_train))
print('Accuracy (test): %.2f' % accuracy_score(y_test, yhat_test))

# Precision = TP / (TP + FP). The labels and metric functions were swapped
# before, so each printed value was reported under the other metric's name.
print('Precision (train): %.2f' % precision_score(y_train, yhat_train))
print('Precision (test): %.2f' % precision_score(y_test, yhat_test))

# Recall = TP / (TP + FN)
print('Recall (train): %.2f' % recall_score(y_train, yhat_train))
print('Recall (test): %.2f' % recall_score(y_test, yhat_test))

#### B. ROC Curves

In [None]:
# ROC curve on the test set, built from the predicted probability of class 1
class_1_column = 1
yhat_test_proba = model.predict_proba(x_test)[:, class_1_column]
fprs, tprs, thresholds = roc_curve(y_test, yhat_test_proba)

fig, ax = plt.subplots(1, figsize=(10, 5))
# Mark each threshold's operating point, then connect the points
ax.scatter(fprs, tprs)
ax.plot(fprs, tprs)
# Dashed grey diagonal from (0, 0) to (1, 1) for reference
ax.plot([0, 1], [0, 1], color='grey', dashes=[2, 2])
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate (Recall)',
    title='ROC Curve (Test Data)',
)
plt.show()

In [None]:
# Area under the ROC curve for the test-set probabilities
auc_value = roc_auc_score(y_test, yhat_test_proba)
print('AUC score: %.2f' % auc_value)

In [None]:
# Get "optimal" threshold: the one closest to the top-left corner of the ROC graph.
# The top-left corner is (FPR=0, TPR=1), so the Euclidean distance of each
# operating point is sqrt(FPR^2 + (1 - TPR)^2). The earlier formula measured the
# distance to the bottom-right corner (FPR=1, TPR=0) instead.
distances_from_top_left = np.hypot(np.asarray(fprs), 1 - np.asarray(tprs))
best_cutoff = np.argmin(distances_from_top_left)
print('Threshold closest to top-left corner of graph: %.2f (%.2f TPR, %.2f FPR)' % 
      (thresholds[best_cutoff], tprs[best_cutoff], fprs[best_cutoff]))

#### C. Precision and Recall Curves

In [None]:
# Precision and recall as functions of the classification threshold
thresholds = np.arange(0, 0.80, .01)

# Binarize the predicted probabilities once per candidate threshold
predicted_labels = [yhat_test_proba > cutoff for cutoff in thresholds]
precisions = [precision_score(y_test, labels) for labels in predicted_labels]
recalls = [recall_score(y_test, labels) for labels in predicted_labels]

fig, ax = plt.subplots(1, figsize=(10, 5))
for curve, curve_label in [(precisions, 'Precision'), (recalls, 'Recall')]:
    ax.plot(thresholds, curve, label=curve_label)
ax.set(
    xlabel='Classification Threshold',
    ylabel='Precision and Recall',
    title='Precision and Recall vs. Classification Threshold',
)
ax.legend(loc='best')
plt.show()

In [None]:
# Get "optimal threshold": the one where precision and recall are balanced,
# i.e. where the absolute gap between the two curves is smallest.
# Note that best_threshold is an index into thresholds, not the threshold value.
precision_recall_gap = np.abs(np.asarray(precisions) - np.asarray(recalls))
best_threshold = precision_recall_gap.argmin()
print('Best threshold: %.2f (%.2f precision, %.2f recall)' % 
      (thresholds[best_threshold], precisions[best_threshold], recalls[best_threshold]))

#### D. Feature Importances

In [None]:
# Get feature importances: the forest-level value for each feature plus the
# standard deviation of that feature's importance across the individual trees
per_tree_importances = np.array([tree.feature_importances_ for tree in model.estimators_])
importances = pd.DataFrame({
    'Feature': x_train.columns,
    'Importance': model.feature_importances_,
    'Std': per_tree_importances.std(axis=0),
})

# Sort ascending. Keeping the spread as a column means it is reordered together
# with the features; previously `std` stayed in the original column order and no
# longer lined up with the sorted rows.
importances = importances.sort_values('Importance', ascending=True, kind='stable')
std = importances['Std'].to_numpy()

In [None]:
# Bar chart of feature importances with their spread across trees
fig, ax = plt.subplots(1, figsize=(10, 10))
# Bars are horizontal, so the spread of the importance values runs along the
# x-axis and must be passed as xerr (yerr would draw vertical whiskers).
ax.barh(importances['Feature'], importances['Importance'], xerr=std)
ax.set_xlabel('Mean Decrease in Impurity')
ax.set_title('Feature Importances in Random Forest')
plt.show()

### III. Over to you!
New dataset: Boston housing prices
- Train a decision tree regressor, tune the *maximum depth*, and report the best r2
- Train a random forest regressor, tune the *maximum depth* and *n_estimators*, and report the best r2
- Calculate the feature importances for the best random forest, and show the mean and standard deviation of the impurity decrease for each feature

You should be able to achieve an r2 score above 0.70 on the test set with a well-tuned random forest.

In [None]:
# Load the Boston housing data into a DataFrame with a 'target' column.
# `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# where accessing it raises ImportError. In that case, rebuild the same arrays
# from the original source, following the recipe in scikit-learn's removal
# notice (this path needs network access).
try:
    data = datasets.load_boston()
    boston_features, boston_target = data.data, data.target
    boston_feature_names = data.feature_names
except (ImportError, AttributeError):
    raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                         sep=r'\s+', skiprows=22, header=None)
    # Each record is spread over two consecutive lines of the raw file
    boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    boston_target = raw_df.values[1::2, 2]
    boston_feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

df = pd.DataFrame(boston_features, columns=boston_feature_names)
df['target'] = boston_target

In [None]:
# Hold out a shuffled 25% of the housing rows as the test set (fixed seed)
train, test = train_test_split(df, test_size=0.25, shuffle=True, random_state=0)

# Separate the regression target from the feature columns in each partition
x_train = train.drop(columns='target')
y_train = train['target']
x_test = test.drop(columns='target')
y_test = test['target']