In [41]:
from sklearn.datasets import make_moons

In [42]:
X, y = make_moons(n_samples=10000, noise=0.4)

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV

tree_para = {
    'criterion': ['gini','entropy'],
    'max_depth': [5,7,8,10,15,20,30,40,50,70,90,120,150],
    'random_state': [42],
    'max_leaf_nodes': [1, 2, 5, 10, 15, 30, 50, 75, 100]
}
dtc = DecisionTreeClassifier()
GSCV = GridSearchCV(dtc, tree_para, cv=3)

In [45]:
import warnings
warnings.filterwarnings('ignore')
GSCV.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 7, 8, 10, 15, 20, 30, 40, 50, 70, 90,
                                       120, 150],
                         'max_leaf_nodes': [1, 2, 5, 10, 15, 30, 50, 75, 100],
                         'random_state': [42]})

In [46]:
GSCV.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=8, max_leaf_nodes=30,
                       random_state=42)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
model = GSCV.best_estimator_
model.fit(X_train, y_train)
y_pred1 = model.predict(X_train)
accuracy_score(y_train, y_pred1)

0.86825

In [49]:
y_pred2 = model.predict(X_test)
accuracy_score(y_test, y_pred2)

0.865

# -------------------------------------------------------------------------------------------

In [34]:
from sklearn.model_selection import ShuffleSplit

In [54]:
n_trees = 1000
n_instances = 100

rs = ShuffleSplit(n_splits=n_trees, train_size=0.0125, random_state=42)

In [68]:
scores = []
y_pred = []
for fold, (train_index, test_index) in enumerate(rs.split(X_train, y_train)):
    X_train_mini = X_train[train_index]
    y_train_mini = y_train[train_index]
    model.fit(X_train_mini, y_train_mini)
    y_pred.append(model.predict(X_test))
    scores.append(accuracy_score(y_test, y_pred[fold]))
    if fold % 100 == 0:
        print(f'fold number: {fold} done, accuracy={accuracy_score(y_test, y_pred[fold])}')

fold number: 0 done, accuracy=0.8085
fold number: 100 done, accuracy=0.794
fold number: 200 done, accuracy=0.792
fold number: 300 done, accuracy=0.803
fold number: 400 done, accuracy=0.7685
fold number: 500 done, accuracy=0.714
fold number: 600 done, accuracy=0.831
fold number: 700 done, accuracy=0.8035
fold number: 800 done, accuracy=0.7735
fold number: 900 done, accuracy=0.8255


In [69]:
scores_avg = sum(scores)/rs.n_splits
scores_avg

0.7946829999999999

In [82]:
from scipy.stats import mode

In [83]:
mode, count = mode(y_pred)

In [91]:
y_test.shape

(2000,)

In [92]:
mode.shape

(1, 2000)

In [87]:
accuracy_score(y_test, mode.reshape([-1]))

0.865