# 7 - Model Evaluation
Reading: https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
#@title Run this cell to download preprocessed data (features + labels). { display-mode: "form" }
# Install (or upgrade) the third-party `wget` package that provides the download helper used below.
!pip install -U wget
# Recreate an empty `preprocessed/` directory so a re-run starts from a clean slate.
!rm -rf preprocessed
!mkdir preprocessed

import wget
# Fetch the dataset archive from GitHub and store it as preprocessed/data.npz.
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/preprocessed/data.npz', 'preprocessed/data.npz')

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics, calibration, exceptions

In [None]:
# Read the feature matrix, the labels and the feature names out of the
# downloaded archive. The unpacking happens inside the `with` block, so all
# three arrays are materialised before the archive is closed.
with np.load('preprocessed/data.npz') as f:
    X, y, feature_names = (f[key] for key in ('X', 'y', 'feature_names'))

In [None]:
#@title Run this cell to define the two preprocessing functions. { display-mode: "form" }
#@markdown - `impute_missing_values(X)`
#@markdown - `normalize_feature_matrix(X)`

def impute_missing_values(X):
    """
    Fill in the missing entries (np.nan) of a feature matrix, one column at
    a time, using the mean of the values observed in that column.

    Args:
        X: np.array, shape (N, d). May contain np.nan entries.
    Returns:
        X: np.array, shape (N, d). Contains no missing values.
    """
    from sklearn.impute import SimpleImputer

    # With no arguments, SimpleImputer replaces np.nan by the column mean.
    mean_imputer = SimpleImputer()
    return mean_imputer.fit_transform(X)

def normalize_feature_matrix(X):
    """
    Rescale every feature column independently so that its values span
    the range [0, 1].

    Args:
        X: np.array, shape (N, d).
    Returns:
        X: np.array, shape (N, d). Each column is min-max normalized.
    """
    from sklearn.preprocessing import MinMaxScaler

    # The scaler learns each column's min and max from X itself.
    column_scaler = MinMaxScaler()
    return column_scaler.fit_transform(X)

In [None]:
# Fill in missing values first, then rescale each feature column to [0, 1].
# NOTE(review): both steps are fitted on the full matrix before the split
# below, so test-set statistics influence the preprocessing — confirm this
# simplification is acceptable for the exercise.
X = normalize_feature_matrix(impute_missing_values(X))

# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the class proportions, and fix the seed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=3,
)

In [None]:
# Sanity check: peek at the first ten labels and the first two rows of the
# (already imputed and normalized) feature matrix.
print('First 10 labels:', y[:10])
print('First 2 feature vectors:\n', X[:2])

## Review: training a classifier in python using sklearn

In [None]:
# Train a linear SVM
# (linear kernel, C=1, fitted on the training split only)
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

In [None]:
# Calculate test accuracy
# (compares the held-out test labels with the fitted classifier's predictions on X_test)
print('Test accuracy:', metrics.accuracy_score(y_test, clf.predict(X_test)))

## (1) Quantitative metrics

Why isn't accuracy enough? Read about [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix), and the related [`sklearn.metrics` submodule](https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics). 

There are many quantitative performance measures that can be derived from the _confusion matrix_:
- accuracy
- TPR, FPR, TNR, FNR
- Precision, recall, sensitivity, specificity
- AUROC, AUPR, F1-score
- ...

Thinking questions:
- What score is considered "good" for each metric?
- Which metric should we optimize?

In [None]:
## A function that calculates the following performance measures:
#     accuracy, F1-Score, AUROC, precision, sensitivity, and specificity
#     (plus the raw rates TPR, FPR, TNR and FNR).
#
import warnings
warnings.filterwarnings('ignore', category=exceptions.UndefinedMetricWarning)

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else 0.0

def calculate_performance(clf_trained, X, y_true, metric='accuracy'):
    """
    Calculates the performance metric as evaluated on the true labels
    y_true versus the predicted scores from clf_trained and X.
    Input:
        clf_trained: a fitted instance of sklearn estimator
        X : (n,d) np.array containing features
        y_true: (n,) np.array containing true labels in {0,1}
        metric: string specifying the performance metric (case-insensitive;
            '_' and '-' are interchangeable); possible options include
            'accuracy', 'f1-score', 'auroc',
            'precision', 'recall', 'sensitivity', 'specificity',
            'tpr', 'fpr', 'tnr', 'fnr'
    Returns:
        the performance measure as a float. Ratios whose denominator is zero
        (e.g. precision when nothing is predicted positive) are reported as
        0.0, mirroring sklearn's default behavior for undefined metrics.
    Raises:
        ValueError: if `metric` is not one of the supported options.
    """
    name = metric.lower().replace('_', '-')

    # AUROC is a ranking metric: it needs continuous scores, not hard labels.
    if name == 'auroc':
        if hasattr(clf_trained, 'decision_function'):
            y_score = clf_trained.decision_function(X)
        else:
            # Fall back to the estimated probability of the positive class.
            y_score = clf_trained.predict_proba(X)[:, 1]
        return float(metrics.roc_auc_score(y_true, y_score))

    # Every other metric is derived from the 2x2 confusion matrix. Passing
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from
    # y_true / y_pred, so the four-way unpacking below cannot fail.
    y_pred = clf_trained.predict(X)
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    sensitivity = _safe_ratio(tp, tp + fn)   # a.k.a. recall, TPR
    specificity = _safe_ratio(tn, tn + fp)   # a.k.a. TNR
    scores = {
        'accuracy': _safe_ratio(tp + tn, tp + tn + fp + fn),
        'f1-score': _safe_ratio(2 * tp, 2 * tp + fp + fn),
        'precision': _safe_ratio(tp, tp + fp),
        'recall': sensitivity,
        'sensitivity': sensitivity,
        'tpr': sensitivity,
        'specificity': specificity,
        'tnr': specificity,
        'fpr': _safe_ratio(fp, fp + tn),
        'fnr': _safe_ratio(fn, fn + tp),
    }
    if name not in scores:
        raise ValueError(
            'Unknown metric %r; expected one of: auroc, %s'
            % (metric, ', '.join(sorted(scores)))
        )
    return scores[name]

In [None]:
# Evaluate the fitted classifier on the held-out test set, one row per metric.
# The rows are collected first and the DataFrame is built in a single call:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
df_scores = pd.DataFrame(
    [
        {'metric': m, 'score': calculate_performance(clf, X_test, y_test, metric=m)}
        for m in ['accuracy', 'f1-score', 'auroc', 'precision', 'sensitivity', 'specificity']
    ],
    columns=['metric', 'score'],
)
display(df_scores)

## (2) Plots
Oftentimes, it is also useful to visually understand a model's predictive power through the following plots:
- ROC curve: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
- PR curve: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html
- Calibration plot: https://scikit-learn.org/stable/auto_examples/calibration/plot_calibration_curve.html

[Read and discuss.]

In [None]:
# Refit the linear SVM with probability estimates enabled so that
# predict_proba is available for the plots below.
# probability=True calibrates the scores with an internal, randomly shuffled
# cross-validation, so the seed is fixed to make y_prob reproducible across
# runs (same seed as the train/test split).
clf = SVC(kernel='linear', C=1, probability=True, random_state=3)
clf.fit(X_train, y_train)

y_true = y_test                            # ground-truth test labels
y_pred = clf.predict(X_test)               # hard class predictions
y_score = clf.decision_function(X_test)    # real-valued decision scores
y_prob = clf.predict_proba(X_test)[:,1]    # estimated probability of clf.classes_[1]

In [None]:
# TODO: Plot ROC curve and calculate AUROC score


In [None]:
# TODO: Plot precision-recall curve and calculate AUPR score


In [None]:
# TODO: Generate calibration plot (with deciles) and calculate Brier score


## (3) Compare __Linear SVM__ with **Logistic Regression**

Compare the two models using the quantitative metrics and plots we developed above.

In [None]:
# LogisticRegression is not imported by any earlier cell, so bring it into
# scope here; without this import the list below raises NameError.
from sklearn.linear_model import LogisticRegression

# Fit both models on the same training split so they can be compared directly.
# The SVM's probability calibration shuffles data internally, hence the fixed seed.
clfs = [
    SVC(kernel='linear', C=1, probability=True, random_state=3).fit(X_train, y_train),
    LogisticRegression(C=1).fit(X_train, y_train),
]

## (4*) Bootstrap confidence intervals