Exercise:
Train and fine-tune a Decision Tree for the moons dataset


In [2]:
# Generate a moons dataset using make_moons(n_samples=10000, noise=0.4)
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html

# Import make_moons explicitly: `from sklearn import *` never binds the name
# `sklearn`, so `sklearn.datasets.make_moons(...)` raises NameError on a fresh kernel.
from sklearn.datasets import make_moons

# The result is an (X, y) tuple: the 2-D points and their 0/1 class labels.
# NOTE(review): no random_state is passed, so every run draws a different dataset;
# consider seeding it if the numbers further down must be reproducible.
moon_data = make_moons(n_samples=10000, noise=0.4)
moon_data

(array([[ 1.32126048, -0.78908301],
        [ 0.34148978,  1.62513477],
        [-0.82098625,  0.58287333],
        ...,
        [ 1.37668036, -0.75882691],
        [ 0.33656132,  0.07747864],
        [ 0.28068985,  0.0409081 ]]),
 array([1, 0, 0, ..., 1, 1, 0]))

In [3]:
# Hold out 20% of the moons data as a test set using train_test_split()
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split

X, y = moon_data  # features and class labels, in that order

# Fixed seed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
# Use grid search with cross-validation to find good hyperparameter values for a DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearchcv#sklearn.model_selection.GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate max_leaf_nodes values: 2 through 99 (range() excludes the upper bound of 100).
params = {'max_leaf_nodes': list(range(2, 100))}

# 3-fold cross-validation over every candidate, parallelised with n_jobs=-1.
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 98 candidates, totalling 294 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 294 out of 294 | elapsed:    1.9s finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...]},
             verbose=1)

In [5]:
# Display the best estimator found by the grid search; its repr shows the selected max_leaf_nodes
grid_search_cv.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=20, random_state=42)

In [6]:
# Train on the full training set using the best max_leaf_nodes; the target is a test accuracy above 85%
from sklearn.metrics import accuracy_score

# Build the tree from the grid-search result instead of a hard-coded 20, and pass the
# same random_state=42 that was used during the search so the fit is reproducible.
best_moon_clf = DecisionTreeClassifier(random_state=42, **grid_search_cv.best_params_)
best_moon_clf.fit(X_train, y_train)

# Accuracy on the held-out test set.
y_pred = best_moon_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8665

In [7]:
# By default, grid_search_cv predicts with the best estimator it found
# (GridSearchCV's refit option, on by default, retrains it on the whole training set)
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test, y_pred)

0.8665

In [8]:
# Try to improve the accuracy by also searching over min_samples_split (2 through 9)
# in addition to max_leaf_nodes (2 through 99)
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': list(range(2, 10)),
}
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_cv.fit(X_train, y_train)

# Score the best model from the wider search on the held-out test set.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test, y_pred)


Fitting 3 folds for each of 784 candidates, totalling 2352 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 2320 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 2352 out of 2352 | elapsed:    7.4s finished


0.8665

With this setting, adding min_samples_split to the search does not improve the test accuracy: it stays at 0.8665.

Exercise 8 - Grow a forest
a. Based on the moons dataset, generate 1000 subsets of the training set, each containing 100 instances selected randomly.

In [10]:
# Number of training instances available to draw the subsets from
len(X_train)

8000

In [27]:
from sklearn.model_selection import ShuffleSplit
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
# Signature: ShuffleSplit(n_splits=10, *, test_size=None, train_size=None, random_state=None)

n_subsets = 1000
n_instances = 100

# Sizing the "test" side as everything except n_instances rows leaves exactly
# n_instances rows on the "train" side of each of the n_subsets splits.
rs = ShuffleSplit(
    n_splits=n_subsets,
    test_size=len(X_train) - n_instances,
    random_state=42,
)

# One (X, y) pair per split, taken from the training arrays at the split's train
# indices; the test indices of each split are not needed here.
mini_sets = [
    (X_train[mini_train_index], y_train[mini_train_index])
    for mini_train_index, _ in rs.split(X_train)
]

In [28]:
# Sanity check: number of subsets, items in one subset tuple (X and y), labels in one subset
for size in (len(mini_sets), len(mini_sets[1]), len(mini_sets[1][1])):
    print(size)

1000
2
100


b. Train one decision tree on each subset, using the best hyperparameter values found above (in this case, max_leaf_nodes=20, random_state=42). Evaluate these 1000 decision trees on the test set.

In [31]:
import numpy as np

# A single tree that is re-fitted on each mini training set in turn.
# random_state=42 is passed so the hyperparameters match the ones stated for this
# step (max_leaf_nodes=20, random_state=42) and the scores are repeatable.
best_moon_clf = DecisionTreeClassifier(max_leaf_nodes=20, random_state=42)

accuracy_scores = []

for X_mini_train, y_mini_train in mini_sets:
    best_moon_clf.fit(X_mini_train, y_mini_train)
    # Every tree is scored on the same held-out test set.
    y_pred = best_moon_clf.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Summary of the per-tree test accuracies.
print('AVG: ' ,np.mean(accuracy_scores))
print('MAX: ' ,np.max(accuracy_scores))
print('MIN: ' ,np.min(accuracy_scores))


AVG:  0.791482
MAX:  0.855
MIN:  0.6795


c. For each test set instance, generate the predictions of the 1000 decision trees and keep only the most frequent prediction (hint: use SciPy's mode() function). This gives majority-vote predictions over the test set.

In [32]:
# The previous cell hard-coded max_leaf_nodes=20. Here every tree is instead a copy of
# grid_search_cv.best_estimator_, so it carries whatever parameters the search selected.
from sklearn.base import clone

# One separate tree per mini training set (n_subsets trees in total).
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_subsets)]

# Fit each tree on its own subset (fit() returns the fitted tree) and record its
# accuracy on the shared test set.
accuracy_scores = [
    accuracy_score(y_test, tree.fit(X_mini_train, y_mini_train).predict(X_test))
    for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets)
]

print('AVG: ' ,np.mean(accuracy_scores))
print('MAX: ' ,np.max(accuracy_scores))
print('MIN: ' ,np.min(accuracy_scores))

AVG:  0.791453
MAX:  0.8525
MIN:  0.6795


In [33]:
# Number of test instances, i.e. how many predictions each tree produces
len(X_test)

2000

In [34]:
from scipy.stats import mode

# Row i holds tree i's predictions for every test instance,
# giving an array of shape (n_subsets, len(X_test)).
Y_pred = np.array([tree.predict(X_test) for tree in forest], dtype=np.uint8)

# Column-wise mode: the most frequent prediction for each test instance,
# together with the number of trees that voted for it.
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [35]:
# Majority-vote class for each test instance, as returned by mode()
y_pred_majority_votes

array([[1, 0, 1, ..., 0, 1, 0]], dtype=uint8)

In [36]:
# Number of trees that voted for the winning class of each test instance
n_votes

array([[868, 845, 848, ..., 797, 873, 595]])

In [37]:
# Flatten the majority votes to 1-D so they line up with y_test, then score the ensemble
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.87