# Decision Tree

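__Number 7__: tune a decision tree on the moons dataset with a grid search over `max_leaf_nodes`, then evaluate it on the test set.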

In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate a noisy two-class "moons" dataset, then hold out 20% of it for testing.
# Both steps are seeded so the notebook is reproducible.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Candidate tree sizes: cap the number of leaves at every value from 2 to 99.
param_grid = {'max_leaf_nodes': [n_leaves for n_leaves in range(2, 100)]}

# 3-fold cross-validated search over the candidates with a seeded base tree.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 294 out of 294 | elapsed:    3.2s finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...]},
             verbose=1)

In [3]:
# Display the tree configuration that scored best in the cross-validated search.
grid_search.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)

In [4]:
from sklearn.metrics import accuracy_score
# Predict the held-out test set with the grid search's best model and
# report the fraction of correctly classified instances.
y_pred = grid_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

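__Number 8__: train a forest of trees, each on a random 100-instance subset of the training set, and combine their test-set predictions by majority vote.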

In [7]:
from sklearn.model_selection import ShuffleSplit
# Draw random mini training sets of 100 instances each: everything except
# 100 samples is assigned to the (unused) "test" side of every split.
# NOTE(review): 1000 subsets are drawn here, but the training loop further down
# zips them with 100 trees, so only the first 100 are consumed -- confirm which
# count is intended.
rs = ShuffleSplit(n_splits=1000, test_size=len(X_train) - 100, random_state=42)
sets = [
    (X_train[train_index], y_train[train_index])
    for train_index, _ in rs.split(X_train)
]

In [8]:
from sklearn.base import clone
# Build 100 unfitted copies of the tuned tree (same hyperparameters as the
# best estimator found by the grid search).
forest = [clone(grid_search.best_estimator_) for _ in range(100)]

# Fit each tree on its own mini training set and score it on the shared test set.
# zip() stops at the shorter sequence, so only the first len(forest) entries of
# `sets` are used.
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, sets):
    tree.fit(X_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    # Ground truth goes first (y_true, y_pred), matching the other
    # accuracy_score calls in this notebook.
    accuracy_scores.append(accuracy_score(y_test, y_pred))

In [9]:
import numpy as np
# Average test accuracy of the individual trees trained on the mini training sets.
np.mean(accuracy_scores)

0.8042050000000001

In [12]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded (it only worked if another library had imported it).
import scipy.stats

# One row of test-set predictions per tree. The size follows the forest instead
# of a hard-coded tree count, and the class labels are stored as small integers
# rather than floats (make_moons labels are 0/1).
predictions = np.empty([len(forest), len(X_test)], dtype=np.uint8)

for index, tree in enumerate(forest):
    predictions[index] = tree.predict(X_test)

# Majority vote across trees (axis 0) for every test instance; num_votes holds
# how many trees voted for the winning class.
maj_votes, num_votes = scipy.stats.mode(predictions, axis=0)

In [16]:
# Accuracy of the majority-vote predictions; reshape([-1]) flattens maj_votes
# to a 1-D vector so it lines up with y_test.
accuracy_score(y_test, maj_votes.reshape([-1]))

0.874