In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
from matplotlib import pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the MNIST CSV from the Kaggle input directory into a DataFrame
mnist_csv_path = "/kaggle/input/mnist-784/mnist_784.csv"
mnist = pd.read_csv(mnist_csv_path)

In [None]:
# Peek at the first few rows of the data
mnist.head()

In [None]:
# Separate the predictors (every column except 'class') from the label column
label_col = 'class'
X = mnist.drop(columns=label_col)
y = mnist[label_col]
print(X.shape, y.shape, sep='\n')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as a test set. The split is seeded (with the same
# seed the classifiers use) so that train/test membership, and therefore every
# metric computed downstream, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Further exploring data: display the first digit (row 0) to see what we are working with
random_digit = X.loc[0, :].to_numpy()
# Lay the row's pixel values out as a 28x28 image
display_digit = random_digit.reshape(28, 28)

plt.imshow(display_digit, cmap="binary")
plt.axis('off')
# show must be *called*; the bare attribute `plt.show` only leaves a function repr as the cell output
plt.show()

In [None]:
# The image above looks like a 5; looking up the stored label for row 0 confirms it is indeed a 5
y.loc[0]

In [None]:
# Let's perform a binary classification: is a digit an 8 or not?
# Labels equal to 8 become True, every other label becomes False
y_train_8, y_test_8 = y_train.eq(8), y_test.eq(8)

In [None]:
# Candidate models: SGD, Decision Tree, and Random Forest
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Every model is built with the same fixed seed
model_seed = 42
sgd_clf, dTree_clf, ranFor_clf = (
    model_cls(random_state=model_seed)
    for model_cls in (SGDClassifier, DecisionTreeClassifier, RandomForestClassifier)
)

In [None]:
# Cross-validate each model with K=3 folds and keep the resulting predictions.
# They are used afterwards to measure model performance with metrics such as
# precision, recall, F1, ROC, and AUC.
from sklearn.model_selection import cross_val_predict

n_folds = 3
y_pred_SGD, y_pred_dTree, y_pred_ranFor = (
    cross_val_predict(clf, X_train, y_train_8, cv=n_folds)
    for clf in (sgd_clf, dTree_clf, ranFor_clf)
)

In [None]:
# Precision, Recall, and F1
from sklearn.metrics import precision_score, recall_score, f1_score


def _precision_recall_f1(y_true, y_pred):
    """Return [precision, recall, F1] for one set of predictions."""
    return [scorer(y_true, y_pred) for scorer in (precision_score, recall_score, f1_score)]


sgd_metrics = _precision_recall_f1(y_train_8, y_pred_SGD)
dTree_metrics = _precision_recall_f1(y_train_8, y_pred_dTree)
ranFor_metrics = _precision_recall_f1(y_train_8, y_pred_ranFor)

# Collect the results in a data frame: one column per model, one row per metric
df_metrics = pd.DataFrame(
    {
        'SGD_Metrics': sgd_metrics,
        'dTree_metrics': dTree_metrics,
        'ranFor_metrics': ranFor_metrics,
    },
    index=['Precision', 'Recall', 'F1'],
)
df_metrics

In [None]:
# Explore how precision and recall respond to each model's decision threshold.
# We can't change a model's threshold directly, but we can collect its decision
# scores / predicted probabilities and try different thresholds on them ourselves.

# Same data and 3-fold cross-validation for every model; only the scoring method differs
shared_cv_args = dict(X=X_train, y=y_train_8, cv=3)
y_score_SGD = cross_val_predict(sgd_clf, method='decision_function', **shared_cv_args)
y_score_dTree = cross_val_predict(dTree_clf, method='predict_proba', **shared_cv_args)
y_score_ranFor = cross_val_predict(ranFor_clf, method='predict_proba', **shared_cv_args)

In [None]:
# Keep only the predicted probability of the positive class (column 1 of the
# predict_proba output); it is used the same way as SGD's decision score.
# The ndim guard makes this cell safe to re-run: once an array has been
# reduced to 1-D, slicing it with [:, 1] again would raise an IndexError.
if y_score_dTree.ndim == 2:
    y_score_dTree = y_score_dTree[:, 1]
if y_score_ranFor.ndim == 2:
    y_score_ranFor = y_score_ranFor[:, 1]

In [None]:
# Sweep all possible thresholds and record the precision and recall obtained at each one
from sklearn.metrics import precision_recall_curve

(
    (prec_SGD, recall_SGD, threshold_SGD),
    (prec_dTree, recall_dTree, threshold_dTree),
    (prec_ranFor, recall_ranFor, threshold_ranFor),
) = (
    precision_recall_curve(y_train_8, scores)
    for scores in (y_score_SGD, y_score_dTree, y_score_ranFor)
)

In [None]:
# Plot the precision and recall vs threshold curves

def plot_PR_vs_threshold(precisions, recalls, thresholds, title=None):
    """Plot precision and recall against the decision threshold.

    Parameters
    ----------
    precisions, recalls : array-like
        Precision and recall values. Their final entry is dropped before
        plotting so that they line up one-to-one with ``thresholds``.
    thresholds : array-like
        Threshold values used for the x-axis.
    title : str, optional
        Figure title (for example the model name). No title is set when None.
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    if title is not None:
        plt.title(title)
    plt.legend(loc='center right')


# One figure per model; the title tells the otherwise identical-looking figures apart
pr_curves = [
    ('SGD', prec_SGD, recall_SGD, threshold_SGD),
    ('Decision Tree', prec_dTree, recall_dTree, threshold_dTree),
    ('Random Forest', prec_ranFor, recall_ranFor, threshold_ranFor),
]
for model_name, model_prec, model_recall, model_thresh in pr_curves:
    plot_PR_vs_threshold(model_prec, model_recall, model_thresh, title=model_name)
    plt.show()

# The plots agree with the precision-recall trade off. That is, as precision increases, recall decreases, and vice versa

In [None]:
# Now we focus on plotting the ROC curves and determining the AUC scores

# Helper that draws a single ROC curve
def plot_ROC(fpr, tpr, label=None):
    """Draw one ROC curve, plus a dashed diagonal reference line, on the current axes."""
    ax = plt.gca()
    ax.plot(fpr, tpr, linewidth=2, label=label)
    ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate (Recall)')
    
# Compute and plot the ROC curve of each model on a single figure
from sklearn.metrics import roc_curve, roc_auc_score
# Threshold arrays follow one naming pattern: thresholds_<model>
fpr_sgd, tpr_sgd, thresholds_sgd = roc_curve(y_train_8, y_score_SGD)
fpr_dTree, tpr_dTree, thresholds_dTree = roc_curve(y_train_8, y_score_dTree)
fpr_ranFor, tpr_ranFor, thresholds_ranFor = roc_curve(y_train_8, y_score_ranFor)

plt.plot(fpr_sgd, tpr_sgd, "b:", label='SGD')
plt.plot(fpr_dTree, tpr_dTree, "g-", label='dTree')
# plot_ROC also draws the diagonal reference line and the axis labels
plot_ROC(fpr_ranFor, tpr_ranFor, label='RanForest')
plt.title('ROC curves')
plt.legend(loc='lower right')
plt.show()

In [None]:
# The ROC curve with the greatest area under it (AUC) belongs to the Random Forest model.
# That is, if we randomly pick one positive data point and one negative data point,
# the Random Forest will rank the positive point above the negative one with a higher
# probability than the other models will.

# To confirm, compute the AUC values directly
auc_sgd = round(roc_auc_score(y_train_8, y_score_SGD), 3)
auc_dTree = round(roc_auc_score(y_train_8, y_score_dTree), 3)
auc_ranFor = round(roc_auc_score(y_train_8, y_score_ranFor), 3)

print(f"SGD AUC-score: {auc_sgd}")
print(f"Decision Tree AUC-score: {auc_dTree}")
print(f"Random Forest AUC-score: {auc_ranFor}")

Out of our 3 models, the best is the Random Forest. It outperforms both SGD and Decision Tree on the following performance metrics: Precision, Recall (tied with Decision Tree), F1-Score, and AUC score.

Specifically, for the random forest, out of all the digits classified as 8 in the train set, 99% of them are indeed 8's, and the model correctly identifies 81% of the actual 8's in the train set.

For the sake of curiosity, we can change the threshold to attempt to increase recall. This will, in turn, lower precision. Let's lower precision to 90% and see how much our recall increases.

In [None]:
# Find the lowest threshold that yields at least 90% precision.
# The precision array has one more entry than the threshold array (the plots
# above drop that final entry for the same reason), so it is sliced off here
# to keep the argmax index valid for threshold_ranFor.
reaches_90 = prec_ranFor[:-1] >= 0.90
if not reaches_90.any():
    # Fail with a clear message instead of an IndexError / a silently wrong threshold
    raise ValueError("No threshold reaches 90% precision")
# argmax on a boolean array returns the position of the first True
thresh_90_precision = threshold_ranFor[np.argmax(reaches_90)]
print(f"Lowest Threshold yielding 90% precision: {thresh_90_precision}")

In [None]:
# Classify as positive every sample whose score reaches the chosen threshold,
# then check the precision and recall obtained with that cut-off
y_pred_90 = (y_score_ranFor >= thresh_90_precision)
precision_at_90 = precision_score(y_train_8, y_pred_90)
recall_at_90 = recall_score(y_train_8, y_pred_90)
print(f"Precision: {precision_at_90}")
print(f"Recall: {recall_at_90}")

As we see, we are able to increase the recall to 94% by lowering the Random Forest's precision. Of course, which metric we prefer depends on the situation.

We can finish up by testing our final model on the test set. We will gather and report its performance using the same metrics as above.

In [None]:
# Train the Random Forest on the full training set, then predict the held-out test set
# (fit returns the fitted estimator, so the two steps can be chained)
y_pred_final = ranFor_clf.fit(X_train, y_train_8).predict(X_test)

In [None]:
# Predicted probability of the positive class (column 1) for every test sample
y_pred_score = ranFor_clf.predict_proba(X_test)[:, 1]

In [None]:
# Precision & Recall on the held-out test set
precision_final = precision_score(y_test_8, y_pred_final)
recall_final = recall_score(y_test_8, y_pred_final)
print(f"Precision Score: {precision_final} \n Recall Score: {recall_final}")

# ROC Curve & AUC Score
fpr, tpr, _ = roc_curve(y_test_8, y_pred_score)
plot_ROC(fpr, tpr, label="Random Forest")
plt.legend(loc='lower right')  # without a legend the label passed above is never displayed
plt.show()
AUC_final = round(roc_auc_score(y_test_8, y_pred_score), 3)
print(f"AUC score: {AUC_final}")
