In [153]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit
from sklearn.metrics import accuracy_score
from scipy.stats import mode

Exercise 7

In [154]:
X,y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [156]:
tree_clf = DecisionTreeClassifier()

In [157]:
param_grid = {'max_leaf_nodes':[10,15,20,30,40,45,50,55,60,65,70],
              'max_depth':[2,6,7,8,10,11,12],
              'splitter':['best','random'],
              'min_samples_split':[2,3,4,5,6,7]}

grid = GridSearchCV(tree_clf, param_grid, cv=6, scoring='accuracy')

In [158]:
grid.fit(X_train, y_train)

In [159]:
grid.best_params_

{'max_depth': 12,
 'max_leaf_nodes': 45,
 'min_samples_split': 2,
 'splitter': 'random'}

In [160]:
tree_best_clf = grid.best_estimator_

In [161]:
scores = cross_val_score(tree_best_clf, X_train, y_train, cv=10)
print(scores.mean())

0.843375


In [162]:
y_pred = tree_best_clf.predict(X_test)

In [163]:
acc_tree = accuracy_score(y_test, y_pred)
print(acc_tree)

0.864


Exercise 9

In [164]:
n_trees = 1000
n_instances = 100

shu = ShuffleSplit(n_splits=n_trees, train_size=n_instances)

In [165]:
X_train_sm, y_train_sm = [],[]

for i, (itr, ite) in enumerate(shu.split(X_train)):
    X_train_sm.append(X_train[itr])
    y_train_sm.append(y_train[itr])

In [166]:
print(len(X_train_sm))
X_train_sm[0].shape

1000


(100, 2)

In [167]:
acc_sm = []

for i in range(len(X_train_sm)):
    tree_best_clf.fit(X_train_sm[i], y_train_sm[i])
    y_pred = tree_best_clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    acc_sm.append(acc)

In [168]:
print(np.mean(acc_sm))

0.7884749999999999


In [169]:
y_test.shape

(2000,)

In [170]:
y_pred_sm = []

for i in range(len(X_train_sm)):
    tree_best_clf.fit(X_train_sm[i], y_train_sm[i])
    y_pred = tree_best_clf.predict(X_test)
    y_pred_sm.append(y_pred)

In [171]:
y_pred_sm = np.array(y_pred_sm)

In [172]:
y_pred_sm.shape

(1000, 2000)

In [173]:
y_pred_rf = mode(y_pred_sm, keepdims='False')

In [174]:
acc_rf = accuracy_score(y_test, y_pred_rf[0][0])
print(acc_rf)

0.874


In [186]:
print(r'%.2f %%'%(100*(acc_rf - acc_tree)))

1.00 %
