# Building a Random Forest from Decision Trees

## Get Datasets

In [3]:
from sklearn.datasets import make_moons
from sklearn.model_selection import ShuffleSplit

# Container for the (features, labels) training subsets used to fit the
# individual trees; it is populated by a later cell.
mini_batch = []

# Generate a noisy two-moons binary classification problem.
# The seed is fixed (like every other random_state in this notebook) so the
# dataset -- and therefore every score below -- is identical on each
# Restart & Run All.
data = make_moons(n_samples=1000, noise=0.4, random_state=42)
X, y = data  # make_moons returns the pair (samples, labels)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as a stratified test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 1000 random splits of the training set; only the "train" side of each split
# (80% of the training rows) is kept as the subset for one tree.
shuffle_split = ShuffleSplit(n_splits=1000, random_state=42, test_size=0.2, train_size=None)

# Rebuild the list from scratch rather than appending to a list created in
# another cell: re-running this cell then always yields exactly one subset
# per split instead of silently accumulating duplicates.
mini_batch = [
    (X_train[i_train], y_train[i_train])
    for i_train, _ in shuffle_split.split(X_train)
]

## Grid Search for the best Decision Tree Model

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: every leaf budget from 2 to 99 combined with three
# minimum split sizes.
params = dict(
    max_leaf_nodes=list(range(2, 100)),
    min_samples_split=[2, 3, 4],
)

# Exhaustive 3-fold cross-validated search over the grid, run sequentially.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=1,
    verbose=1,
    cv=3,
)

# Left as the final expression so the fitted search object is displayed.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 882 out of 882 | elapsed:    1.2s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=1,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
   

## Forest Model

In [11]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
import numpy as np

# Start every tree as an unfitted copy of the best estimator found by the
# grid search.
best_tree = grid_search.best_estimator_
forest = [clone(best_tree) for _ in range(1000)]

# Fit each tree on its own training subset and record how well that single
# tree does on the held-out test set.
accuracy_scores = []
for tree, (X_train_batch, y_train_batch) in zip(forest, mini_batch):
    y_pred = tree.fit(X_train_batch, y_train_batch).predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Average test accuracy of the individual trees (displayed as the cell output).
np.mean(accuracy_scores)

0.8293149999999999

## Predictions

In [13]:
# One row of test-set predictions per tree. The matrix is sized from the
# forest itself so that it can never contain uninitialised rows (np.empty
# does not zero its memory) or overflow if the number of trees changes.
Y_pred = np.empty((len(forest), len(X_test)), dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [14]:
from scipy.stats import mode

# Column-wise mode over the trees: for every test sample, the most frequently
# predicted class and the number of trees that voted for it.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes = vote_result.mode
n_votes = vote_result.count

In [16]:
# Test accuracy of the majority-vote ensemble; the vote array is flattened to
# 1-D so it lines up with y_test.
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.845