In [48]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

Exercise 8

In [5]:
mnist = fetch_openml('mnist_784', version=1, parser='auto')

In [6]:
list(mnist.keys())

['data',
 'target',
 'frame',
 'categories',
 'feature_names',
 'target_names',
 'DESCR',
 'details',
 'url']

In [21]:
X = mnist['data'].to_numpy()
y = mnist['target'].to_numpy().astype(int)

In [22]:
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [24]:
X_train_init, X_test, y_train_init, y_test = train_test_split(X, y, train_size=60000, random_state=42)

In [26]:
X_train_init.shape

(60000, 784)

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X_train_init, y_train_init, train_size=50000, random_state=42)

In [29]:
print(X_train.shape)
print(X_val.shape)

(50000, 784)
(10000, 784)


In [32]:
forest_clf = RandomForestClassifier()
extra_clf = ExtraTreeClassifier()
svm_clf = SVC()
xg_clf = XGBClassifier()
tree_clf = DecisionTreeClassifier()

In [35]:
forest_clf.fit(X_train, y_train)

In [36]:
extra_clf.fit(X_train, y_train)

In [37]:
svm_clf.fit(X_train, y_train)

In [40]:
xg_clf.fit(X_train, y_train)

In [43]:
tree_clf.fit(X_train, y_train)

In [45]:
forest_sc = forest_clf.score(X_val, y_val)

In [50]:
extra_sc = extra_clf.score(X_val, y_val)

In [51]:
y_pred = svm_clf.predict(X_val)
svm_sc = accuracy_score(y_val, y_pred)

In [53]:
xg_sc = xg_clf.score(X_val, y_val)

In [55]:
tree_sc = tree_clf.score(X_val, y_val)

In [56]:
print('RF:',forest_sc)
print('ET:',extra_sc)
print('SVM:',svm_sc)
print('XGB:',xg_sc)
print('Tree:',tree_sc)

RF: 0.9699
ET: 0.8236
SVM: 0.9788
XGB: 0.9777
Tree: 0.8661


In [57]:
estimators = [('RF',forest_clf), ('ET', extra_clf), ('SVM', svm_clf), ('XGB', xg_clf), ('Tree', tree_clf)]

voting_hard_clf = VotingClassifier(estimators, voting='hard')

In [58]:
voting_hard_clf.fit(X_train, y_train)

In [59]:
voting_sc = voting_hard_clf.score(X_val, y_val)

In [60]:
print('Voting:', voting_sc)

Voting: 0.9772


In [61]:
svm_clf.score(X_test, y_test)

0.976

In [62]:
voting_sc_test = voting_hard_clf.score(X_test, y_test)
print('Voting in the test set:', voting_sc_test)

Voting in the test set: 0.9722


Exercise 9

In [63]:
y_pred_forest = forest_clf.predict(X_val)
y_pred_extra = extra_clf.predict(X_val)
y_pred_svm = svm_clf.predict(X_val)
y_pred_xg = xg_clf.predict(X_val)
y_pred_tree = tree_clf.predict(X_val)

In [85]:
X_new = np.c_[y_pred_forest, y_pred_extra, y_pred_svm, y_pred_xg, y_pred_tree]

In [92]:
y_new = y_val

In [93]:
print(X_new, X_new.shape)
print(y_new, y_new.shape)

[[5 5 5 5 5]
 [8 8 8 8 8]
 [2 2 2 2 2]
 ...
 [7 7 7 7 8]
 [6 6 6 6 6]
 [7 7 7 7 7]] (10000, 5)
[5 8 2 ... 7 6 7] (10000,)


In [102]:
forest_blend_clf = RandomForestClassifier()

In [94]:
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

In [99]:
forest_blend_clf.fit(X_train_b, y_train_b)

In [100]:
forest_blend_clf.score(X_val_b, y_val_b)

0.9675

In [103]:
forest_blend_clf.fit(X_new, y_new)

In [104]:
y_pred_forest_test = forest_clf.predict(X_test)
y_pred_extra_test = extra_clf.predict(X_test)
y_pred_svm_test = svm_clf.predict(X_test)
y_pred_xg_test = xg_clf.predict(X_test)
y_pred_tree_test = tree_clf.predict(X_test)

In [105]:
X_test_new = np.c_[y_pred_forest_test, y_pred_extra_test, y_pred_svm_test, y_pred_xg_test, y_pred_tree_test]

In [106]:
y_test_new = y_test

In [107]:
forest_blend_clf.score(X_test_new, y_test_new)

0.9724