# Decision tree - moons dataset

## Import the dataset & explore

In [1]:
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=10000, shuffle=True, noise=0.4, random_state=42)
X.shape

(10000, 2)

In [2]:
import matplotlib.pyplot as plt

plt.scatter(X[:,0], X[:,1], c=y, s=2, alpha=0.3)

<matplotlib.collections.PathCollection at 0x7f7e38267710>

## Create the train and test datasets

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train, y_test = y_train.astype(np.int8), y_test.astype(np.int8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8000, 2)
(8000,)
(2000, 2)
(2000,)


## Train a decision tree classifier
### Use grid search to find the best hyper-parameters

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtc = DecisionTreeClassifier(criterion='gini', 
                           splitter='best', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features=None, 
                           random_state=None, 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           class_weight=None, 
                           presort=False)

param_grid = {'max_leaf_nodes': [10, 20, 30, 40]}

grid_search = GridSearchCV(estimator=dtc, 
                         param_grid=param_grid,
                         cv=3,
                         verbose=1,  
                         return_train_score=True)

In [5]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_leaf_nodes': [10, 20, 30, 40]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,

In [6]:
best_model = grid_search.best_estimator_.fit(X_train, y_train)
grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=20,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [7]:
score_full_training_set = best_model.score(X_test, y_test)
print('Accuracy score from single decision tree on full training dataset:\t', score_full_training_set)

Accuracy score from single decision tree on full training dataset:	 0.87


## Grow a forest
### Create an array of decision tree classifiers each trained on a different subsample of training data

In [8]:
from sklearn.model_selection import ShuffleSplit

shuffle_split = ShuffleSplit(n_splits=1000, 
                             train_size=0.0125,  
                             random_state=42)

flag = True

models = []
for X_small in shuffle_split.split(X_train):
    index = X_small[0]
    dtc = DecisionTreeClassifier(criterion='gini', 
                           splitter='best', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features=None, 
                           random_state=None, 
                           max_leaf_nodes=20, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           class_weight=None, 
                           presort=False)
    models.append(dtc.fit(X_train[index,:], y_train[index]))

### Find the average prediction from the decision trees

In [9]:
from scipy.stats import mode

pred_sum = np.zeros((X_test.shape[0]))
scores = []

i = 0
for model in models:
    pred_sum = pred_sum + model.predict(X_test)
    scores.append(model.score(X_test, y_test))
    i=i+1

pred_normalised = pred_sum/float(i) >= 0.5
print('Average accuracy from small sample models:\t', np.average(scores))

Average accuracy from small sample models:	 0.8011240000000001


### Compare predictions on training set with ground truth

In [10]:
from sklearn.metrics import accuracy_score

average_predictions = accuracy_score(pred_normalised, y_test)
print('Accuracy from average prediction:\t\t', average_predictions)

Accuracy from average prediction:		 0.8735
