Fetches the MNIST dataset

In [None]:
from sklearn.datasets import fetch_openml

# as_frame=False asks for NumPy arrays. Later cells index rows positionally
# (X[0]) and call .reshape() on a row, which does not work on the pandas
# DataFrame that recent scikit-learn versions return by default.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

Look at the arrays

In [None]:
X, y = mnist["data"], mnist["target"]
# Only the last expression of a cell is displayed, so show both shapes together.
X.shape, y.shape

Grab an instance’s feature vector, reshape it to a 28 × 28 array, and display it

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# View the first instance's feature vector as a 28x28 image.
some_digit = X[0]
some_digit_image = some_digit.reshape((28, 28))

# Render it with the "binary" colormap and hide the axes.
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

Cast the labels in y to
integers

In [None]:
# Convert the labels to unsigned 8-bit integers.
y = y.astype(np.uint8)

Split into a training set and a test set

In [None]:
# The first 60,000 rows form the training set; everything after is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

Create the target vectors for
the classification task

In [None]:
# Boolean targets for the "is it a 5?" task: True for all 5s, False otherwise.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

Create an SGDClassifier and train it on the whole training set

In [None]:
from sklearn.linear_model import SGDClassifier

# random_state is pinned to 42; the classifier is fit on the binary 5 / not-5 targets.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X=X_train, y=y_train_5)

Detect images of the number 5

In [None]:
# predict() takes a batch, so the single digit is wrapped in a list.
sgd_clf.predict([some_digit])

Evaluate SGDClassifier model using K-fold cross-validation with three folds

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the SGD classifier on each of the three cross-validation folds.
cross_val_score(sgd_clf, X_train, y_train_5, scoring="accuracy", cv=3)

Using a worse classifier that just classifies every single image in the “not-5” class

In [None]:
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline estimator that answers "not 5" (False) for every instance."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def predict(self, X):
        """Return an all-False boolean column with one row per instance in X."""
        n_instances = len(X)
        return np.full((n_instances, 1), False, dtype=bool)

Evaluate the model

In [None]:
# Score the always-"not 5" baseline with the same 3-fold accuracy setup.
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, scoring="accuracy", cv=3)

Compute the confusion matrix:
*   Use the `cross_val_predict()` function
*   Pass the target classes (y_train_5) and the predicted classes
(y_train_pred)



In [None]:
from sklearn.model_selection import cross_val_predict

# One prediction per training instance, produced by 3-fold cross-validation.
y_train_pred = cross_val_predict(
    estimator=sgd_clf, X=X_train, y=y_train_5, cv=3
)

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the true 5 / not-5 labels with the cross-validated predictions.
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

Compute the confusion matrix for a hypothetical perfect classifier (predictions identical to the true labels)

In [None]:
# Pretend we reached perfection by using the true labels as the predictions.
y_train_perfect_predictions = y_train_5
confusion_matrix(y_true=y_train_5, y_pred=y_train_perfect_predictions)

Functions to compute classifier metrics (Precision and Recall)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Only the last expression of a cell is displayed, so return both metrics
# together as a (precision, recall) pair.
(
    precision_score(y_train_5, y_train_pred),  # == 4096 / (4096 + 1522)
    recall_score(y_train_5, y_train_pred),  # == 4096 / (4096 + 1325)
)

Compute the F1 Score

In [None]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated binary predictions.
f1_score(y_true=y_train_5, y_pred=y_train_pred)

Call the `decision_function()` method to get a
score for each instance, then apply any threshold to those scores
to make predictions

In [None]:
# Ask for the decision score of the single example digit instead of its class.
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
# Predict positive when the decision score exceeds the threshold.
threshold = 0
y_some_digit_pred = (y_scores > threshold)
# Display the prediction so it can be compared with the higher-threshold cell.
y_some_digit_pred

Raising the threshold

In [None]:
# Same comparison as before, now against a threshold of 8000.
threshold = 8000
y_some_digit_pred = y_scores > threshold
y_some_digit_pred

Get the scores of all instances in the training set with specification to return decision scores instead of predictions

In [None]:
# Rebinds y_scores: it now holds cross-validated decision scores for the
# whole training set rather than the score of the single example digit.
y_scores = cross_val_predict(
    sgd_clf, X_train, y_train_5, cv=3, method="decision_function"
)

Compute precision
and recall for all possible thresholds

In [None]:
from sklearn.metrics import precision_recall_curve

# Unpack the three arrays returned for the cross-validated decision scores.
pr_curve = precision_recall_curve(y_train_5, y_scores)
precisions, recalls, thresholds = pr_curve
del pr_curve  # keep the notebook namespace identical to a direct unpack

Plot precision and recall as functions of the threshold value

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    precisions, recalls: arrays with one more element than `thresholds`;
        their last element is dropped so the lengths match.
    thresholds: decision thresholds used for the x-axis.

    Draws on the current pyplot axes and does not call plt.show().
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    # Make the figure self-explanatory: legend, axis label, and grid.
    plt.legend(loc="center right")
    plt.xlabel("Threshold")
    plt.ylim(0, 1)
    plt.grid(True)

# Draw both curves for the values returned by precision_recall_curve.
plot_precision_recall_vs_threshold(
    precisions=precisions, recalls=recalls, thresholds=thresholds
)
plt.show()

Search for the
lowest threshold that gives at least 90% precision and make predictions

In [None]:
# argmax on a boolean array gives the index of its first True, i.e. the first
# position where precision reaches 90%.
threshold_90_precision = thresholds[(precisions >= 0.90).argmax()]  # ~7816

# Positive prediction whenever the score reaches that threshold.
y_train_pred_90 = y_scores >= threshold_90_precision

In [None]:
# Only the last expression of a cell is displayed, so return both metrics
# together as a (precision, recall) pair for the 90%-precision predictions.
(
    precision_score(y_train_5, y_train_pred_90),
    recall_score(y_train_5, y_train_pred_90),
)

Use the `roc_curve()` function to compute the TPR
and FPR for various threshold values

In [None]:
from sklearn.metrics import roc_curve

# NOTE: this rebinds `thresholds`, which previously held the output of
# precision_recall_curve; re-run that cell before reusing its thresholds.
roc_points = roc_curve(y_train_5, y_scores)
fpr, tpr, thresholds = roc_points
del roc_points  # keep the notebook namespace identical to a direct unpack

Plot the TPR (y-axis) against the FPR (x-axis)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR on the y-axis against FPR on the x-axis).

    fpr, tpr: false/true positive rates of equal length.
    label: optional legend label for the curve.

    Draws on the current pyplot axes and does not call plt.show() or
    plt.legend(), so several curves can share one figure.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
    # Axis limits, labels, and grid so the figure stands on its own.
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    plt.grid(True)

# ROC curve of the SGD classifier, drawn without a legend label.
plot_roc_curve(fpr=fpr, tpr=tpr)
plt.show()

Function to compute the ROC
AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the cross-validated SGD decision scores.
roc_auc_score(y_train_5, y_scores)

Train a RandomForestClassifier and compare its ROC curve and ROC
AUC score to those of the SGDClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Request predict_proba output from cross-validation instead of class predictions.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(
    forest_clf, X_train, y_train_5, cv=3, method="predict_proba"
)

Using the positive class’s probability as the score for the `roc_curve()` function

In [None]:
# Use column 1 (probability of the positive class) as the score.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(
    y_train_5, y_scores_forest
)

Plot the ROC curve

In [None]:
# Dotted blue line for the SGD curve, then the forest curve on the same axes.
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, label="Random Forest")
plt.legend(loc="lower right")
plt.show()

Measure the ROC AUC score of the RandomForestClassifier

In [None]:
# Area under the ROC curve for the random forest's positive-class probabilities.
roc_auc_score(y_train_5, y_scores_forest)

Using a binary classification algorithm for a multiclass classification task

In [None]:
from sklearn.svm import SVC

# Fit on the digit labels (y_train), not the binary y_train_5 targets.
svm_clf = SVC()
svm_clf.fit(X=X_train, y=y_train)
svm_clf.predict([some_digit])

Call the `decision_function()` method

In [None]:
# Decision scores of the SVM for the example digit.
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores

Seeing the highest score (the one corresponding to class 5)

In [None]:
# The fitted class labels live in `classes_` (trailing underscore).
# Only the last expression of a cell is displayed, so return all three values:
# index of the highest score, the class array, and the class at index 5.
np.argmax(some_digit_scores), svm_clf.classes_, svm_clf.classes_[5]

Forcing Scikit-Learn to use one-versus-one or one-versus-the-rest by creating an
instance and passing a classifier to its constructor. Creates a multiclass classifier using the OvR strategy, based on an SVC

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Wrap an SVC in a one-versus-the-rest multiclass classifier.
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so return the prediction
# and the number of underlying estimators together.
ovr_clf.predict([some_digit]), len(ovr_clf.estimators_)

Training an SGDClassifier

In [None]:
# Refit the existing SGD classifier in place, this time on the digit labels.
sgd_clf.fit(X=X_train, y=y_train)
sgd_clf.predict([some_digit])

Looking at the score that the SGD classifier assigned
to each class

In [None]:
# Decision scores of the refitted SGD classifier for the example digit.
sgd_clf.decision_function([some_digit])

Use the `cross_val_score()` function to evaluate the SGDClassifier’s accuracy

In [None]:
# 3-fold accuracy of the SGD classifier on the unscaled inputs.
cross_val_score(sgd_clf, X_train, y_train, scoring="accuracy", cv=3)

Scaling the inputs to increase accuracy to above 89%

In [None]:
from sklearn.preprocessing import StandardScaler

# Cast the inputs to float64 before standardizing them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

# Same 3-fold accuracy evaluation, now on the scaled inputs.
cross_val_score(sgd_clf, X_train_scaled, y_train, scoring="accuracy", cv=3)

Make predictions using the
`cross_val_predict()` function, then calling the `confusion_matrix()` function

In [None]:
# Rebinds y_train_pred: it now holds multiclass predictions on the scaled inputs.
y_train_pred = cross_val_predict(
    estimator=sgd_clf, X=X_train_scaled, y=y_train, cv=3
)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_mx

Plot an image representation
of the confusion matrix

In [None]:
# Show the confusion matrix as a grayscale image.
gray_cmap = plt.cm.gray
plt.matshow(conf_mx, cmap=gray_cmap)
del gray_cmap  # keep the notebook namespace unchanged
plt.show()

Dividing each value in the confusion
matrix by the number of images in the corresponding class

In [None]:
# Sum each row, keeping a column shape so the division applies row by row.
row_sums = conf_mx.sum(keepdims=True, axis=1)
norm_conf_mx = conf_mx / row_sums

Fill the diagonal with zeros to keep only the errors, and plot the result

In [None]:
# fill_diagonal modifies norm_conf_mx in place: zero the diagonal so only the
# off-diagonal (error) cells remain visible.
np.fill_diagonal(norm_conf_mx, val=0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

Plot examples of 3s and 5s to analyze errors

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Tile flattened 28x28 digit images into one grid on the current axes.

    instances: array-like with one flattened 784-pixel image per row.
    images_per_row: maximum number of images placed on each grid row.
    options: extra keyword arguments forwarded to plt.imshow
        (cmap defaults to "binary").

    Draws nothing (beyond hiding the axes) when `instances` is empty.
    """
    size = 28
    images = np.asarray(instances, dtype=float).reshape(-1, size * size)
    plt.axis("off")
    if len(images) == 0:
        return
    images_per_row = min(len(images), images_per_row)
    n_rows = (len(images) - 1) // images_per_row + 1
    # Pad with blank images so the last grid row is complete.
    n_empty = n_rows * images_per_row - len(images)
    padded = np.concatenate([images, np.zeros((n_empty, size * size))], axis=0)
    # (row, col, y, x) -> (row, y, col, x) -> one big 2-D image.
    grid = padded.reshape(n_rows, images_per_row, size, size)
    big_image = grid.transpose(0, 2, 1, 3).reshape(
        n_rows * size, images_per_row * size
    )
    options.setdefault("cmap", "binary")
    plt.imshow(big_image, **options)


# Split the 3s and 5s by true class (first letter) and predicted class (second).
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

# 2x2 grid of subplots, each showing up to 25 examples in rows of 5.
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)
plt.show()

A simpler example of a multilabel classification system, i.e. a classifier that outputs multiple binary tags
for each instance

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Two boolean labels per digit: "large" (7 or above) and "odd".
y_train_large = y_train >= 7
y_train_odd = y_train % 2 == 1
# Stack the two label vectors side by side as columns.
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X=X_train, y=y_multilabel)

Make a prediction

In [None]:
# Multilabel prediction for the example digit.
knn_clf.predict([some_digit])

Compute the average F1 score across all labels

In [None]:
# Cross-validated multilabel predictions, then the macro-averaged F1 score.
y_train_knn_pred = cross_val_predict(
    estimator=knn_clf, X=X_train, y=y_multilabel, cv=3
)
f1_score(y_true=y_multilabel, y_pred=y_train_knn_pred, average="macro")


Creating the training and test sets by taking the MNIST images and
adding noise to their pixel intensities

In [None]:
# Seeded generator so the noisy datasets are the same on every run.
rng = np.random.default_rng(42)

# Add integer noise in [0, 100) to every pixel of the training images.
noise = rng.integers(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
# Fresh noise for the test images.
noise = rng.integers(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
# The targets are the original, noise-free images.
y_train_mod = X_train
y_test_mod = X_test

Train the classifier and make it clean the image

In [None]:
# Which noisy test image to clean.
some_index = 0

# Refit the KNN classifier in place to map noisy images to clean ones.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])

# Display the predicted image as a 28x28 picture, like the first digit plot.
plt.imshow(clean_digit.reshape(28, 28), cmap="binary")
plt.axis("off")
plt.show()