### Decision Tree Methods

In [None]:
# Notebook display configuration (repeated in every demonstration notebook).
# By default a cell echoes only the value of its last expression.
# Setting ast_node_interactivity to "all" makes every bare expression in a cell echo its value.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import accuracy_score, confusion_matrix

2. Import Carseats dataset

In [None]:
# Load the Carseats data; the relative path is resolved against the working directory.
Carseats = pd.read_csv(filepath_or_buffer='Carseats.csv')

In [None]:
# Column dtypes and non-null counts, then a preview of the first rows.
Carseats.info()
Carseats.head()

3. We intend to classify the sales of carseats into high or low categories.<br>
First, we need to transform the numeric 'Sales' into high/low categories.

In [None]:
# Binarise the numeric 'Sales' column into the classification target (our y):
# values at or below the sample mean become 'low', values above it become 'high'.
# The outer bin edges are -inf/+inf so that every non-missing value receives a
# label; a fixed lower edge such as -1 would leave any smaller value as NaN.
Carseats['Sales_c'] = pd.cut(
    Carseats['Sales'],
    bins=[float('-inf'), Carseats['Sales'].mean(), float('inf')],
    labels=['low', 'high'],
)

In [None]:
# Class balance of the new low/high target.
Carseats['Sales_c'].value_counts()

4. Prepare X and y

In [None]:
# Predictors: every column except the numeric response ('Sales'), its
# low/high version ('Sales_c'), and 'CompPrice'.
# NOTE(review): 'CompPrice' is not derived from 'Sales' in this notebook --
# confirm that excluding it from the predictors is intentional.
X = Carseats.drop(columns=['Sales', 'Sales_c', 'CompPrice'])
# Response: the low/high sales label.
y = Carseats.loc[:, 'Sales_c']

In [None]:
# Preview the predictor table.
X.head()

In [None]:
# One-hot encode the categorical predictors, dropping the first level of each.
# Only columns that are still present are encoded, so re-running this cell
# after X has already been encoded is a no-op instead of raising a KeyError.
_categorical_cols = [col for col in ['ShelveLoc', 'Urban', 'US'] if col in X.columns]
if _categorical_cols:
    X = pd.get_dummies(X, columns=_categorical_cols, drop_first=True)

5. Split training and test datasets

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [None]:
# Class balance within the training split.
y_train.value_counts()

6. Fit a decision tree.<br>
We set the maximum depth to 3 to stop the tree from growing any further.

In [None]:
# Configure (but do not yet fit) a shallow classification tree.
# Parameters that control how the tree grows include:
#   criterion         - split quality measure ('entropy' or 'gini')
#   max_depth         - maximum number of levels in the tree
#   min_samples_split - minimum number of observations a node must hold
#                       before it is allowed to be split
dt_1 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)

In [None]:
# Fit the depth-3 tree on the training split.
dt_1.fit(X_train, y_train)

7. Plot the tree

In [None]:
# Feature names in column order, kept as a plain Python list: some
# scikit-learn releases reject or mishandle a pandas Index passed as
# `feature_names` to plot_tree / export_text.
car_names = list(X.columns)

In [None]:
# Draw the fitted depth-3 tree.
# Class labels come from the fitted model so they always follow its own class
# order; a hand-written ['low', 'high'] list would not necessarily match it.
# feature_names is passed as a plain list for compatibility across
# scikit-learn releases.
plt.figure(figsize=(12, 8))
plot_tree(
    dt_1,
    filled=True,
    feature_names=list(car_names),
    class_names=[str(label) for label in dt_1.classes_],
)

In [None]:
# Text rendering of the same tree, with class weights shown on the leaves.
# feature_names is passed as a plain list for compatibility across
# scikit-learn releases.
print(export_text(dt_1, feature_names=list(car_names), show_weights=True))

In [None]:
# Predicted class labels for the held-out test rows.
dt_1_pred = dt_1.predict(X=X_test)

In [None]:
# Test-set accuracy, then the confusion matrix (rows = true class, columns = predicted class).
accuracy_score(y_test, dt_1_pred)
confusion_matrix(y_test, dt_1_pred)

In [None]:
# Labelled table of true vs. predicted classes (pass margins=True to add row/column totals).
pd.crosstab(y_test, dt_1_pred, rownames=['True'], colnames=['Predicted'])

8. Next we fit a tree without specifying stopping criteria.<br>
By default, the tree will grow until there is no more information gain.

In [None]:
# Entropy-based tree with no depth limit, so growth is not stopped early.
dt_full = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)

In [None]:
# Fit the unrestricted tree on the training split.
dt_full.fit(X_train, y_train)

In [None]:
# Draw the unrestricted tree; it is large, so expect a crowded figure.
# feature_names is passed as a plain list for compatibility across
# scikit-learn releases.
plt.figure(figsize=(12, 8))
plot_tree(dt_full, filled=True, feature_names=list(car_names))

In [None]:
# Evaluate the unrestricted tree on the test split.
dt_full_pred = dt_full.predict(X_test)
accuracy_score(y_test, dt_full_pred)
confusion_matrix(y_test, dt_full_pred)
# Compare with the depth-3 tree: if the test error rate does not decrease with
# the bigger tree, the extra depth is overfitting the training data.

9. Bagging, random forest and boosting

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

9.1 Bagging

In [None]:
# Bagging is the special case of a random forest in which every split may
# consider all predictors, which is what max_features=None requests.
# RandomForestClassifier's own default for max_features is 'sqrt', so None
# has to be passed explicitly to get bagging.
# n_estimators (the number of trees) defaults to 100; it is raised to 500 here.
dt_bag = RandomForestClassifier(
    n_estimators=500,
    max_features=None,
    random_state=0,
)

In [None]:
# Fit the bagged ensemble, predict on the test split, then report accuracy and the confusion matrix.
dt_bag.fit(X_train, y_train)

dt_bag_pred = dt_bag.predict(X_test)

accuracy_score(y_test, dt_bag_pred)
confusion_matrix(y_test, dt_bag_pred)
# 500 trees are used here; try other n_estimators values to see how the result changes.

9.2 Random forest

In [None]:
# Random forest: each split considers a random subset of 4 predictors.
# NOTE(review): 4 is intended to approximate sqrt(p); verify against X.shape[1].
dt_rf = RandomForestClassifier(
    n_estimators=500,
    max_features=4,
    random_state=0,
)

In [None]:
# Fit the random forest, predict on the test split, then report accuracy and the confusion matrix.
dt_rf.fit(X_train, y_train)

dt_rf_pred = dt_rf.predict(X_test)

accuracy_score(y_test, dt_rf_pred)
confusion_matrix(y_test, dt_rf_pred)

9.4 We cannot get an interpretable tree from ensemble methods, but we can figure out the importance of each variable in classification.

In [None]:
# Importance score of each predictor in the fitted random forest (a NumPy array,
# one entry per column of X).
feature_importances = dt_rf.feature_importances_

# argsort() returns the index positions that would sort the array in ASCENDING
# order. barh draws position 0 at the bottom of the axis, so ascending order
# places the most important feature at the top of the chart.
sort_index = np.argsort(feature_importances)

# Plot horizontal feature importances, reordering both the bar lengths and
# the tick labels with the same index so they stay aligned.
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.barh(range(X.shape[1]), feature_importances[sort_index], align="center")
plt.yticks(range(X.shape[1]), np.array(X.columns)[sort_index])
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()

9.3 Boosting

In [None]:
# Gradient boosting with B = 100 trees, shrinkage lambda = 0.001, depth d = 3.
# dt_gb is reassigned by later cells, so fit right after this cell to
# evaluate this particular configuration.
dt_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.001,
    max_depth=3,
    random_state=0,
)

In [None]:
# Gradient boosting with B = 1000 trees, shrinkage lambda = 0.001, depth d = 3.
# NOTE(review): the previous comment said d = 1 but the code uses max_depth=3;
# confirm which depth is intended.
dt_gb = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.001,
    max_depth=3,
    random_state=0,
)

In [None]:
# Gradient boosting with B = 5000 trees, shrinkage lambda = 0.001, depth d = 3.
# NOTE(review): the previous comment said d = 1 but the code uses max_depth=3;
# confirm which depth is intended.
# Too large a B may lead to overfitting.
dt_gb = GradientBoostingClassifier(
    n_estimators=5000,
    learning_rate=0.001,
    max_depth=3,
    random_state=0,
)

In [None]:
# Fit whichever dt_gb configuration was assigned most recently, then report
# test-set accuracy and the confusion matrix.
dt_gb.fit(X_train, y_train)
dt_gb_pred = dt_gb.predict(X_test)
accuracy_score(y_test, dt_gb_pred)
confusion_matrix(y_test, dt_gb_pred)