In [2]:
# Getting the MNIST data (This takes forever, so do it just once)
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# Getting training, validation, and testing data
import numpy as np
mnist_data = mnist['data']
mnist_test = mnist['target']

# Uncomment after loading the numpy pdframe into a numpy array
mnist_data = mnist_data.to_numpy()

# np.uint8 astype will turn all the ELEMENTS into unsigned integers
mnist_test = mnist_test.to_numpy().astype(np.uint8)
mnist_test

array([5, 0, 4, ..., 4, 5, 6], dtype=uint8)

In [4]:
# Splitting up the data
from sklearn.model_selection import train_test_split
train_data = mnist_data[:60000]
train_labels = mnist_test[:60000]

# Do not use the data for the testing
test_data = mnist_data[60000:]
test_labels = mnist_test[60000:]

x_train, x_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=1/6)

In [5]:
# Training all the classifiers (RFC, SVC, ETC)
# This takes SUPER LONG so only do it once!
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier()
svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))
etc = ExtraTreesClassifier()

myClassifiers = [rfc, svc, etc]

# Train all three
for classifiers in myClassifiers:
    classifiers.fit(x_train, y_train)
    
accuracies = [estimator.score(x_val, y_val) for estimator in myClassifiers]
print(accuracies)

# Creating a voting ensemble classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rfc), ('svc', svc), ('et', etc)],
    voting='hard')

# Training the voting classifier
voting_clf.fit(x_train, y_train)


[0.9715, 0.9649, 0.9738]


VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('svc',
                              Pipeline(steps=[('standardscaler',
                                               StandardScaler()),
                                              ('svc', SVC(gamma='auto'))])),
                             ('et', ExtraTreesClassifier())],
                 voting='soft')

In [6]:
# Showing the ensemble of the voting classifier
# Messed up on the voting classifier, so I am changing the voting method
# Creating a voting ensemble classifier
# voting_clf = VotingClassifier(
#   estimators=[('rf', rfc), ('svc', svc), ('et', etc)],
#   voting='hard')
# voting_clf.fit(x_train, y_train)
voting_clf.score(x_val, y_val)

0.9748

In [7]:
# The Voter did just about .5 percent better
print((sum(accuracies)/len(accuracies)) - voting_clf.score(x_val, y_val))

-0.004733333333333478


In [8]:
# Now let's try the test set
test_scores =[classifier.score(test_data, test_labels) for classifier in myClassifiers]
print(test_scores)

print(voting_clf.score(test_data, test_labels))
# Just about the same

[0.9688, 0.9648, 0.9702]
0.9721


In [9]:
# Exercise 9
# Getting the training set for the blender model (XGBoost)
x_val_predictions = np.empty((len(x_val), len(myClassifiers)), dtype=np.float32)
print(x_val_predictions)
for index, classifier in enumerate(myClassifiers):
    x_val_predictions[:, index] = classifier.predict(x_val)
print(x_val_predictions)
print(x_val_predictions.shape)

[[9.5402979e-20 4.5703349e-41 9.5402979e-20]
 [4.5703349e-41 1.7467022e-37 0.0000000e+00]
 [1.7467022e-37 0.0000000e+00 5.6051939e-45]
 ...
 [0.0000000e+00 9.8090893e-45 0.0000000e+00]
 [2.8025969e-45 0.0000000e+00 4.2038954e-45]
 [0.0000000e+00 2.8025969e-45 0.0000000e+00]]
[[9. 9. 9.]
 [9. 9. 1.]
 [9. 9. 9.]
 ...
 [4. 3. 3.]
 [1. 1. 1.]
 [5. 5. 5.]]
(10000, 3)


In [10]:
# You have to train the classifiers with the samples reshaped like this:
# svc.predict(x_train[].reshape(1,-1))

In [14]:
# Now to train the xgb blender model on the predictions
import xgboost as xgb
xgbc = xgb.XGBClassifier()
xgbc.fit(x_val_predictions, y_val)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# Now to blend and see the scores on the test set and see the resulting score
x_test_predictions = np.empty((len(test_data), len(myClassifiers)), dtype=np.float32)
for index, classifier in enumerate(myClassifiers):
    x_test_predictions[:, index] = classifier.predict(test_data)
xgbc.score(x_test_predictions, test_labels)
# It did just as good at the voting classifier earlier. Maybe a super small margin worse

0.9712