In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide display setting: print NumPy floats with 3 digits of precision.
np.set_printoptions(precision=3)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Voting Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import VotingClassifier


from sklearn.datasets import make_moons
# Two-moons toy data set; the seed was carefully picked for illustration.
X, y = make_moons(n_samples=100, noise=0.2, random_state=18)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Ensemble of a logistic regression and a depth-3 decision tree, combined with soft voting.
base_estimators = [
    ('logreg', LogisticRegression(C=100)),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=0)),
]
voting = VotingClassifier(base_estimators, voting='soft')
voting.fit(X_train, y_train)

In [None]:
# Test accuracy of the fitted logistic-regression member on its own.
voting.named_estimators_['logreg'].score(X_test, y_test)

In [None]:
# Test accuracy of the fitted decision-tree member on its own.
voting.named_estimators_['tree'].score(X_test, y_test)

In [None]:
# Test accuracy of the combined voting ensemble.
voting.score(X=X_test, y=y_test)

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
# Load the digits data and make a stratified train/test split with a fixed seed.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    stratify=digits.target,
    random_state=0,
)

In [None]:
# Sweep max_features and collect one accuracy per setting for each of the three curves.
feature_range = range(1, 64, 5)
train_scores, test_scores, oob_scores = [], [], []

for max_features in feature_range:
    rf = RandomForestClassifier(
        n_estimators=200,
        max_features=max_features,
        oob_score=True,
        random_state=0,
    ).fit(X_train, y_train)
    # Training-split accuracy, test-split accuracy, and the forest's oob_score_.
    train_scores.append(rf.score(X_train, y_train))
    test_scores.append(rf.score(X_test, y_test))
    oob_scores.append(rf.oob_score_)

In [None]:
# Plot test, out-of-bag and training accuracy against max_features.
plt.plot(feature_range, test_scores, label="test scores")
# Legend entry named like the other two curves (no variable name in the label).
plt.plot(feature_range, oob_scores, label="oob scores")
plt.plot(feature_range, train_scores, label="train scores")
plt.legend()
plt.ylabel("accuracy")
# Trailing semicolon keeps the Text repr out of the cell output.
plt.xlabel("max_features");

In [None]:
# Forest with default hyper-parameters; it is also the estimator passed to GridSearchCV further down.
# random_state is pinned (0, like the other seeded calls in this notebook) so that the
# score and the grid search built on this estimator are reproducible across runs.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

In [None]:
# Accuracy of the fitted forest on the test split.
rf.score(X=X_test, y=y_test)

In [None]:
# Switch NumPy printing to 6 digits without scientific notation.
np.set_printoptions(suppress=True, precision=6)

# Candidate values for the two forest hyper-parameters to be searched.
param_grid = dict(
    max_features=[4, 8, 12, 16, 20],
    max_depth=[6, 8, 10, 12, 14],
)
param_grid

In [None]:
from sklearn.model_selection import GridSearchCV

# 10-fold grid search over param_grid, keeping the training scores as well.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

In [None]:
import pandas as pd
# Tabulate the raw cross-validation results of the grid search.
results = pd.DataFrame(data=grid.cv_results_)
results

In [None]:
# Mean cross-validated test score arranged as a max_depth (rows) x max_features (columns) table.
cv_results = pd.DataFrame(grid.cv_results_)
res = cv_results.pivot_table(
    values='mean_test_score',
    index='param_max_depth',
    columns='param_max_features',
)
pd.set_option("display.precision", 4)

# NOTE(review): depending on the installed scikit-learn / pandas versions the param_*
# columns (and therefore this index) can come through with object dtype, and
# ndarray.round() raises on object arrays. Coercing to a numeric array first keeps the
# result unchanged for an already-numeric index and makes the rounding work otherwise.
depth_labels = np.asarray(pd.to_numeric(res.index)).round(4)
res = res.set_index(depth_labels)

In [None]:
# Display the pivoted mean test scores (rows: max_depth, columns: max_features).
res

In [None]:
import seaborn as sns
# Annotated heat map of the score table; the colour scale starts at 0.6.
sns.heatmap(data=res, vmin=0.6, annot=True, fmt=".3g")

In [None]:
# Show the best_params_ attribute of the fitted grid search.
grid.best_params_

In [None]:
# Report the estimator stored as best_estimator_ on the fitted search.
best_estimator_template = "Best estimator:\n{}"
print(best_estimator_template.format(grid.best_estimator_))

In [None]:
# Score the fitted search object on the test split and report it with the object's class name.
accuracy = grid.score(X_test, y_test)
searcher_name = type(grid).__name__
print('Accuracy score of the {} is {:.3f}'.format(searcher_name, accuracy))

# Grid-searching which model to use

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline; both steps are swapped out by the parameter grid defined next.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', SVC()),
])

In [None]:
# Two sub-grids over the same pipeline: one tunes an SVC (with or without scaling),
# the other tunes a random forest with the 'preprocessing' slot set to None.
param_grid = [
    {
        'classifier': [SVC()],
        'preprocessing': [StandardScaler(), None],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    {
        # random_state is pinned (0, like the other seeded calls in this notebook) so
        # repeated searches score the forest candidates identically.
        'classifier': [RandomForestClassifier(n_estimators=100, random_state=0)],
        'preprocessing': [None],
        'classifier__max_features': [4, 8, 12, 16, 20],
        'classifier__max_depth': [6, 8, 10, 12, 20],
    },
]

In [None]:
# 5-fold search over both sub-grids of the pipeline.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Summarise the model search: chosen parameters, CV score, and test-split score.
summary_lines = [
    "Best params:\n{}\n".format(grid.best_params_),
    "Best cross-validation score: {:.3f}".format(grid.best_score_),
    "Test-set score: {:.3f}".format(grid.score(X_test, y_test)),
]
for summary_line in summary_lines:
    print(summary_line)

# Feature Importance

In [None]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of 'mnist_784' from OpenML as arrays (as_frame=False) and peek at the first ten targets.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)
mnist.target[:10]

In [None]:
# Shape of the MNIST data array.
mnist["data"].shape

In [None]:
# First row of the MNIST data array.
mnist["data"][0]

In [None]:
# Render the first sample as a 28x28 grayscale image and print its target.
first_image = mnist.data[0].reshape(28, 28)
plt.imshow(first_image, cmap='gray')
plt.axis('off')
print('The digit in the image is {}'.format(mnist.target[0]))

In [None]:
# Fit a seeded 100-tree forest on the full MNIST data; its feature importances are plotted below.
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rnd_clf.fit(X=mnist["data"], y=mnist["target"])

In [None]:
import matplotlib as mpl

In [None]:
def plot_digit(data, shape=(28, 28)):
    """Show a flat array as a 'hot' colour-mapped image with the axes hidden.

    Parameters
    ----------
    data : array exposing ``reshape``
        Flat values to display; must hold exactly ``shape[0] * shape[1]``
        elements, otherwise ``reshape`` raises.
    shape : tuple of int, default (28, 28)
        (rows, columns) the values are reshaped to before plotting. The
        default reproduces the original fixed 28x28 layout; pass another
        shape to reuse the helper for images of a different size.

    Returns
    -------
    None
        The image is drawn through ``plt.imshow`` on the current figure.
    """
    image = data.reshape(shape)
    # "nearest" interpolation keeps each value as one crisp pixel.
    plt.imshow(image, cmap=mpl.cm.hot, interpolation="nearest")
    plt.axis("off")

In [None]:
# Draw the forest's per-pixel feature importances as an image.
importances = rnd_clf.feature_importances_
plot_digit(importances)

# Colour bar with ticks only at the smallest and largest importance, relabelled in words.
cbar = plt.colorbar(ticks=[importances.min(), importances.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])

plt.show()