# 1. Binary Classifier

In [2]:
import numpy as np
import sklearn 
from sklearn.linear_model import SGDClassifier 

### 1a. SGD Classifier
Apply the SGD Classifier - it can handle very large data sets efficiently because SGD deals with training instances independently, one at a time

In [None]:
# Hold-out split: the first 60000 rows are used for training, the remainder for testing.
# NOTE(review): X, y and some_digit are not defined in the visible cells - they are
# presumably created by an earlier data-loading cell; confirm before Restart & Run All.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Boolean targets for the binary problem: True where the label equals 5, False otherwise
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

# Fit the SGD (Stochastic Gradient Descent) classifier on the binary targets;
# random_state is fixed so repeated runs give the same model
sgd_clf = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)
sgd_clf.fit(X_train, y_train_5)

# Classify a single sample (predict expects a 2-D batch, hence the list wrapper)
sgd_clf.predict([some_digit])

### 1b. Performance Measures

In [None]:
# Implement stratified cross-validation by hand and print the accuracy of each fold

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Work on a clone so every fold is trained on its own copy of the estimator
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Count the matches with a vectorized NumPy call; the builtin sum() would walk the
    # comparison array one element at a time in Python
    n_correct = np.count_nonzero(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

In [None]:
# Let scikit-learn run the 3-fold cross-validation;
# the result holds one accuracy score per fold
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=sgd_clf, X=X_train, y=y_train_5, scoring="accuracy", cv=3)

### 1c. Compare Against a Base Estimator
This will compare against a base estimator (dumb classifier) that classifies every single image to be not "5"

In [None]:
from sklearn.base import BaseEstimator

# Create the dumb classifier
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" (False) for every sample."""

    def fit(self, X, y=None):
        """Do nothing, since this baseline has no parameters to learn.

        Returns:
            The estimator itself, following the scikit-learn convention that
            ``fit`` returns ``self`` so calls can be chained.
        """
        return self

    def predict(self, X):
        """Return a 1-D boolean array holding one False per sample in ``X``.

        A flat array of length ``len(X)`` is the shape scikit-learn expects from
        ``predict``; a 2-D column of shape ``(len(X), 1)`` is not.
        """
        return np.zeros(len(X), dtype=bool)
    
# Evaluate the baseline with 3-fold cross-validated accuracy
never_5_clf = Never5Classifier()
cross_val_score(estimator=never_5_clf, X=X_train, y=y_train_5, scoring="accuracy", cv=3)

Note: accuracy is not the preferred performance measure for classifiers when dealing with skewed datasets

### 1d. Confusion Matrix
A confusion matrix is a much better way to evaluate classifier performance. The `cross_val_predict()` function also performs K-fold cross-validation, but instead of evaluation scores it returns the predictions made on each test fold

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Collect the predictions made on each test fold of a 3-fold cross-validation,
# so they can be compared against the actual training labels
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# Tabulate actual labels (y_train_5) against the predicted ones
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

* Each Row = actual class
* Each Column = predicted class
* Top Left = true negatives ; Top Right = false positives
* Bottom Left = false negatives ; Bottom Right = true positives
* Precision = TP / (TP + FP) -- accuracy of the positive predictions
* Recall (also called Sensitivity or True Positive Rate) = TP / (TP + FN) -- ratio of positive instances that are correctly detected by the classifier  

In [None]:
# Calculate precision and recall with sklearn
from sklearn.metrics import precision_score, recall_score

# A notebook cell only displays its last expression, so both metrics are returned
# together; as two bare calls the precision would be computed and silently discarded
{
    "precision": precision_score(y_train_5, y_train_pred),
    "recall": recall_score(y_train_5, y_train_pred),
}

* The F1 score combines precision and recall into a single metric, which is useful when you need to compare two classifiers. It is the harmonic mean of precision and recall. The harmonic mean gives much more weight to low values, so a classifier only gets a high F1 score if both recall and precision are high 
* Formula: F1 = 2 $\cdot$ $\frac{precision \times recall}{precision + recall}$

In [None]:
# F1 score of the cross-validated predictions against the actual binary labels
from sklearn.metrics import f1_score
f1_score(y_true=y_train_5, y_pred=y_train_pred)

### 1e. Precision/Recall Tradeoff
The code below can be used to visualize this tradeoff and select a threshold to use in your classifier

In [None]:
from sklearn.metrics import precision_recall_curve

# Request the decision score of every data point instead of a class label
y_scores = cross_val_predict(
    sgd_clf, X_train, y_train_5, method="decision_function", cv=3
)

# Precision and recall for all possible thresholds on those scores
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    Plots onto the current matplotlib figure. The last element of ``precisions``
    and ``recalls`` is dropped before plotting (presumably because
    ``precision_recall_curve`` returns one more precision/recall value than
    thresholds - confirm against the scikit-learn docs). The axes are fixed to
    thresholds in [-50000, 50000] and values in [0, 1].
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, fmt, name in curves:
        plt.plot(thresholds, values[:-1], fmt, label=name, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    plt.axis([-50000, 50000, 0, 1])
    
# Index of the first point on the curve whose precision is at least 90%
idx_90_precision = np.argmax(precisions >= 0.90)
recall_90_precision = recalls[idx_90_precision]
threshold_90_precision = thresholds[idx_90_precision]

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# Red dotted guide lines plus markers highlighting the 90%-precision threshold
# and the recall obtained at that threshold
for guide_x, guide_y, guide_fmt in (
    ([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:"),
    ([-50000, threshold_90_precision], [0.9, 0.9], "r:"),
    ([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:"),
    ([threshold_90_precision], [0.9], "ro"),
    ([threshold_90_precision], [recall_90_precision], "ro"),
):
    plt.plot(guide_x, guide_y, guide_fmt)
# NOTE(review): save_fig is not defined in the visible cells - confirm it is provided earlier
save_fig("precision_recall_vs_threshold_plot")
plt.show()

Another way to select a good precision/recall trade-off is to plot precision directly against recall

In [None]:
# Plot precision against recall
def plot_precision_vs_recall(precisions, recalls):
    """Draw precision (y axis) against recall (x axis) on the current figure.

    Both axes are fixed to the [0, 1] range and a grid is enabled.
    """
    plt.plot(recalls, precisions, "b-", linewidth=2)
    for set_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
        set_label(text, fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
# Red dotted guide lines and a marker at the 90%-precision operating point
for guide_x, guide_y, guide_fmt in (
    ([recall_90_precision, recall_90_precision], [0., 0.9], "r:"),
    ([0.0, recall_90_precision], [0.9, 0.9], "r:"),
    ([recall_90_precision], [0.9], "ro"),
):
    plt.plot(guide_x, guide_y, guide_fmt)
# NOTE(review): save_fig is not defined in the visible cells - confirm it is provided earlier
save_fig("precision_vs_recall_plot")
plt.show()

### 1f. ROC Curve
This plots the TPR (Sensitivity or Recall) against the FPR, where the FPR is the ratio of negative instances that are incorrectly classified as positive; in other terms, FPR = 1 - TNR = 1 - specificity
* **Note:** you should use a precision-recall curve if your data has very few instances of positive values or when you care more about false-positives than false negatives

In [None]:
# Plot ROC Curve
from sklearn.metrics import roc_curve

# Store the ROC thresholds under their own name. Reusing `thresholds` would overwrite the
# precision/recall thresholds, so re-running the precision/recall cells after this one
# would pair `precisions`/`recalls` with an unrelated array.
fpr, tpr, roc_thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve (TPR against FPR) on the current figure.

    A dashed black diagonal from (0, 0) to (1, 1) is added for reference, both
    axes are fixed to [0, 1], and ``label`` is attached to the curve so a later
    ``plt.legend()`` call can pick it up.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
# FPR at the first ROC point whose TPR reaches the recall found at 90% precision
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]
# Red dotted guide lines and a marker at that operating point
for guide_x, guide_y, guide_fmt in (
    ([fpr_90, fpr_90], [0., recall_90_precision], "r:"),
    ([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:"),
    ([fpr_90], [recall_90_precision], "ro"),
):
    plt.plot(guide_x, guide_y, guide_fmt)
# NOTE(review): save_fig is not defined in the visible cells - confirm it is provided earlier
save_fig("roc_curve_plot")
plt.show()

### 1g. Area Under the Curve (AUC)

In [4]:
# Area under the ROC curve for the cross-validated decision scores.
# Requires y_train_5 and y_scores from the earlier cells to be defined first.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true=y_train_5, y_score=y_scores)

NameError: name 'y_train_5' is not defined

# 2. Multiclass Classification

* Logistic Regression, RF, Naive Bayes classifiers can handle multiclass natively 
* SGD, SVM classifiers are strictly binary classifiers, but you can perform multiclass classification with multiple binary classifiers
* One-vs-Rest (OvR) strategy is typically preferred over One-vs-One (OvO) 

### 2a. SVM

In [None]:
from sklearn.svm import SVC

# SVM example for multi-class - actually does OvO under the hood
# 45 binary classifiers (OvO is usually done just for SVM)
svm_clf = SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5
svm_digit_pred = svm_clf.predict([some_digit])

# Get the decision function scores for each digit
some_digit_scores = svm_clf.decision_function([some_digit])

# Only the last expression of a cell is displayed, so show the prediction and the
# scores together; a bare predict call in the middle of the cell is thrown away
{"prediction": svm_digit_pred, "decision_scores": some_digit_scores}

Force sklearn to use OvR rather than OvO method for SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Wrap the SVC in OneVsRestClassifier to force the OvR method,
# training on the first 1000 samples only
ovr_clf = OneVsRestClassifier(estimator=SVC(random_state=42, gamma="auto"))
ovr_clf.fit(X_train[:1000], y_train[:1000])
ovr_clf.predict([some_digit])

### 2b. SGD Multiclass (OvR)

In [None]:
from sklearn.preprocessing import StandardScaler

# Same logic as the binary classification problem, but fit on the full labels
# (y_train, not y_train_5). Does OvR under the hood.
sgd_clf.fit(X_train, y_train)
sgd_digit_pred = sgd_clf.predict([some_digit])

# Decision function value for each class
sgd_digit_scores = sgd_clf.decision_function([some_digit])

# Cross-validated accuracy on the raw inputs
cv_accuracy_raw = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

# Scaling the inputs will increase the accuracy
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cv_accuracy_scaled = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

# Only the last expression of a cell is displayed. As bare mid-cell calls the prediction,
# the decision scores and the unscaled accuracy were all computed and then discarded,
# so collect every result here to make the raw-vs-scaled comparison visible.
{
    "prediction": sgd_digit_pred,
    "decision_scores": sgd_digit_scores,
    "cv_accuracy_raw": cv_accuracy_raw,
    "cv_accuracy_scaled": cv_accuracy_scaled,
}

### 2c. Error Analysis
* This is done after you have selected a model and want to analyze the types of errors that it makes

In [None]:
# Predictions made on each test fold for the multiclass problem (scaled inputs),
# tabulated against the actual labels
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

# Image of the raw counts - ABSOLUTE ERRORS
# NOTE(review): save_fig is not defined in the visible cells - confirm it is provided earlier
plt.matshow(conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_plot", tight_layout=False)
plt.show()

# Image of the ERROR RATES - what you want: divide every row by its total, then zero
# the diagonal so that only the misclassifications stand out
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = np.divide(conf_mx, row_sums)
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_errors_plot", tight_layout=False)
plt.show()