In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn.datasets import fetch_openml

# With return_X_y=True only the (data, target) tuple of np.arrays is returned:
# mnist = fetch_openml('mnist_784', version=1, return_X_y=True)

# By default the result is an object with 'data', 'target', ... attributes.
# as_frame=False asks for plain NumPy arrays: recent scikit-learn versions
# otherwise return a pandas DataFrame here, which breaks the positional row
# indexing (X[36000], X[:60000][shuffle]) used further down in this notebook.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
# Split the loaded dataset into the feature matrix and the label vector.
X, y = mnist["data"], mnist["target"]
print(X.shape)

# Take one flattened sample and render it as a 28x28 image.
sampleDigit = X[36000]
plt.imshow(sampleDigit.reshape(28, 28), cmap=matplotlib.cm.binary, interpolation="nearest")
# plt.axis("off")
plt.show()  # must be *called*; the bare attribute `plt.show` does nothing

# Label of the sample shown above (displayed as the cell's output).
y[36000]


In [None]:
# My own experiment: assemble a 10x10 grid of digit images.

# Random ordering of the indices 0..99.
randomIndices = np.random.permutation(100)
# The rows of X selected in that random order.
digitTable = X[randomIndices, :]
digitTable.shape

def digit(i):
    """Return the i-th randomly chosen sample as a 28x28 array.

    `i` is a position in the module-level `randomIndices`; the row of `X`
    it points to is reshaped to (28, 28).
    """
    return X[randomIndices[i]].reshape(28, 28)

# print(digit(27))

# Build the full 10x10 grid: each grid row is 10 digit images placed side by
# side, and the 10 grid rows are then stacked vertically.
# (The previous range(0, 9) / range(1, 9) bounds only produced a 9x9 grid.)
gridSize = 10
gridRows = [
    np.hstack([digit(gridSize * i + j) for j in range(gridSize)])
    for i in range(gridSize)
]
digitTable = np.vstack(gridRows)

plt.imshow(digitTable, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.show()







In [None]:
# MNIST comes pre-split: the first 60,000 samples are training data and the
# remaining 10,000 are test data. The training part is shuffled so that the
# cross-validation folds below are built on a valid basis.
shuffle = np.random.permutation(60000)
x_train, y_train = X[:60000][shuffle], y[:60000][shuffle]
x_test, y_test = X[60000:], y[60000:]

In [None]:
# Binary (yes/no) classifier: is the digit a five or not?
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(x_train, y_train == '5')



In [None]:
# Hand-written 3-fold cross-validation: fit a fresh clone of the classifier on
# each training split and print its accuracy on the held-out split.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

y_train_5 = (y_train == '5')
# No random_state here: it only has an effect together with shuffle=True, and
# recent scikit-learn versions raise a ValueError when it is given without
# shuffling. The folds are the same as before (unshuffled stratified folds).
skfolds = StratifiedKFold(n_splits=3)
for train_index, test_index in skfolds.split(x_train, y_train_5):
    fitter = clone(sgd)
    fitter.fit(x_train[train_index], y_train_5[train_index])
    y_pred = fitter.predict(x_train[test_index])
    # Fraction of correct predictions on the held-out fold.
    print(np.mean(y_pred == y_train_5[test_index]))
    


In [None]:
# following is identical


In [None]:
# Accuracy is not the right measure here because only about 10% of the samples
# are a '5': the trivial predictor that always outputs False already reaches
# roughly 90% accuracy. The confusion matrix gives TP, FP, FN and TN instead.

#  # this would give the exact same result as our implementation of cross-validation
#  from sklearn.model_selection import cross_val_score
#  cross_val_score(sgd, x_train, (y_train == '5'), cv=3, scoring="accuracy")

# Use cross-validation to compute a prediction for every training sample
# (instead of a score) and build the confusion matrix from those predictions.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_train_5 = (y_train == '5')
y_train_pred = cross_val_predict(sgd, x_train, y_train_5, cv=3)
m = confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

# Layout of confusion_matrix(actual, predicted):
#
#   TN  FP    <- first row: all actually negative cases
#   FN  TP
#        ^
#        second column: cases predicted to be positive
#
print(m)


In [None]:
print(m)

# Unpack the 2x2 matrix row by row: [[TN, FP], [FN, TP]].
(TN, FP), (FN, TP) = m

print("accuracy = (TP + TN) / (TP + FP + TN + FN) = ", (TP + TN) / y_train.size)

print("precision = TP / (TP + FP) = ", TP / (TP + FP))
print("TPR = recall = TP / (TP + FN)", TP / (TP + FN))
print("F1 = harm. mean(precision,TPR) = 2*TP/(2TP+FN+FP)", 2 * TP / (2 * TP + FN + FP))

# The same metrics as computed by sklearn.metrics.
from sklearn.metrics import precision_score, recall_score, f1_score

print("precision: ", precision_score(y_train_5, y_train_pred))
print("recall: ", recall_score(y_train_5, y_train_pred))
print("f1: ", f1_score(y_train_5, y_train_pred))



In [None]:
# The yes/no prediction of SGD is based on the score being higher than
# the threshold, which is set at zero. We can improve precision at the cost of
# recall by setting a higher threshold.
# sklearn has no implementation to calculate the confusion matrix
# for different thresholds, so we implement it ourselves, using
# decision_function to make the decision ourselves.

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

def getConfusion(threshold):
    """Cross-validated confusion matrix of the '5' detector at a custom threshold.

    For each of 3 stratified folds a clone of the module-level `sgd` is fitted
    on the training split, the held-out split is scored with
    `decision_function`, and a sample counts as predicted positive when its
    score is strictly greater than `threshold`. Counts are summed over folds.

    Prints the threshold together with the resulting precision and recall
    (nan when the respective denominator is zero, e.g. when the threshold is
    so high that nothing is predicted positive).

    Returns:
        2x2 np.ndarray laid out as [[TN, FP], [FN, TP]].
    """
    y_train_5 = (y_train == '5')
    TP = FP = FN = 0

    # random_state is intentionally omitted: without shuffle=True it has no
    # effect, and recent scikit-learn versions raise a ValueError for it.
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(x_train, y_train_5):
        fitter = clone(sgd)
        fitter.fit(x_train[train_index], y_train_5[train_index])

        y_true = np.asarray(y_train_5[test_index], dtype=bool)
        y_pred = fitter.decision_function(x_train[test_index]) > threshold

        TP += np.count_nonzero(y_pred & y_true)
        FP += np.count_nonzero(y_pred & ~y_true)
        FN += np.count_nonzero(~y_pred & y_true)

    # Every sample is in exactly one held-out fold, so the rest are TN.
    TN = y_train_5.size - TP - FP - FN

    precision = TP / (TP + FP) if (TP + FP) else float("nan")
    recall = TP / (TP + FN) if (TP + FN) else float("nan")
    print("\n threshold:", threshold, "precision:", precision)
    print("recall:", recall)
    return np.array([[TN, FP], [FN, TP]])

# Confusion matrix at the default decision threshold of zero ...
m = getConfusion(threshold=0)
print(m)
# ... and at a much higher threshold for comparison.
print(getConfusion(threshold=1000))


In [None]:
# Choosing the threshold is a trade-off between precision and recall. The
# trade-off is illustrated by the precision/recall curve, which sklearn
# implements as precision_recall_curve.

# First compute the decision score of every training sample.
y_train_5 = (y_train == '5')
y_scores = cross_val_predict(
    estimator=sgd,
    X=x_train,
    y=y_train_5,
    cv=3,
    method="decision_function",
)




In [None]:
# Plot precision and recall against the threshold.
# Recall is smooth and monotonically decreasing; precision increases with the
# threshold, but not necessarily monotonically, and is therefore less smooth.

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# The last entry of precisions/recalls is dropped so both line up with
# `thresholds` on the x axis.
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], "b--", label="precision")
ax.plot(thresholds, recalls[:-1], "g-", label="recall")
ax.set_xlabel("threshold")
ax.legend()
plt.show()

In [None]:
# The other classical curve is the ROC curve, which plots
# TPR (= recall) against FPR (= 1 - specificity).

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linewidth=2)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set(xlabel='FPR', ylabel='TPR')
plt.show()

# The area under the curve is higher for a better predictor:
# AUC=0.5 for a random predictor, AUC=1.0 for a perfect predictor.
from sklearn.metrics import roc_auc_score

print("auc:", roc_auc_score(y_train_5, y_scores))

## TIP
# Use the precision vs. recall curve if FP are more important than FN;
# otherwise use the ROC curve / AUC.

In [None]:
# A second classifier for comparison: a random forest.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
y_predict = cross_val_predict(
    rfc, x_train, y_train_5, method="predict_proba", cv=3
)


In [None]:
# y_predict above was computed with predict_proba, so it is not a 60000x1
# array of True/False values or scores, but a 60000x2 array holding the
# probabilities of not being a 5 and of being a 5. The second column is
# selected as the "score" used to produce the ROC curve.
y_score = y_predict[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_train_5, y_score)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')   # diagonal reference line
ax.plot(fpr_rf, tpr_rf, "b:")    # random forest (dotted)
ax.plot(fpr, tpr, "b-")          # SGD curve computed earlier (solid)
ax.set(xlabel='FPR', ylabel='TPR')
plt.show()

print("auc random forest:", roc_auc_score(y_train_5, y_score))

# at page95, TODO: multiclass classification
