In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate a moons dataset

In [2]:
X,y=make_moons(n_samples=1000,noise=0.4)
X.shape,y.shape

((1000, 2), (1000,))

# Split it into a training set and a test set

In [3]:
X_train,X_test,y_train,y_test=train_test_split(X,y)

# Run Gridsearch on DecitionTreeClassifier

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dtc=DecisionTreeClassifier()
parameters={
    'max_depth':np.arange(10,51,10),
    'min_samples_split':np.arange(2,6,1),
    'max_leaf_nodes':np.arange(2,11,1)
    }

In [5]:
model=GridSearchCV(dtc,param_grid=parameters)

In [6]:
model.fit(X_train,y_train)
print(model.best_estimator_)
y_pred=model.predict(X_test)
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=8,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
0.844


# Grow a forest

In [7]:
from sklearn.model_selection import ShuffleSplit

shuffle_data=ShuffleSplit(n_splits=1000,test_size=len(X)-100)


In [8]:
accuracy,forrest=[],[]

# Train one Decision Tree on each subset, using the best hyperparameter values found above.

In [9]:
for train_index,test_index in shuffle_data.split(X):
    best_model=GridSearchCV(dtc,param_grid=parameters)
    best_model.fit(X[train_index],y[train_index])
    forrest.append(best_model.best_estimator_)
    y_pred=best_model.predict(X[test_index])
    accuracy.append(accuracy_score(y[test_index],y_pred))
    
print(np.mean(accuracy))

0.8158655555555556


In [10]:
Y_pred = np.empty((len(forrest),len(X_test)), dtype=np.uint8)
print(Y_pred.shape)
for index,tree in enumerate(forrest):
    Y_pred[index]=tree.predict(X_test)

(1000, 250)


#  Majority-vote predictions over the test set.

In [11]:
from scipy.stats import mode
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [12]:
accuracy_score(y_test,y_pred_majority_votes.reshape([-1]))

0.856