# Load the libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve

In [None]:
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the data

In [None]:
# Read the dataset, using the "id" column as the row index.
df = pd.read_csv('cancer_data.csv', index_col="id")
# Target: the 'diagnosis' column. Features: every remaining column.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

Make a train-test split

In [None]:
# Hold out 20% of the rows for the final evaluation.
# - random_state pins the shuffle so "Restart & Run All" reproduces the same split.
# - stratify=y keeps the class proportions of `diagnosis` equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)

# Explore & preprocess
Explore the data a bit, to know if the data needs preprocessing

In [None]:
# Make sure the training features are a DataFrame, then show the summary
# statistics with one row per feature (transposed for readability).
X_train = pd.DataFrame(data=X_train)
X_train.describe().transpose()


In [None]:
# Long format ('variable', 'value') so seaborn draws one box per feature.
p = pd.melt(X_train)
ax = sns.boxplot(x='variable', y='value', data=p)
# Rotate the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

In [None]:
# Learn the scaling parameters from the training data only, then apply the
# same transformation to both splits so no test-set statistics leak in.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Same boxplot as before, now on the scaled training features.
p = pd.melt(pd.DataFrame(X_train))
ax = sns.boxplot(x='variable', y='value', data=p)
# Rotate the tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

Conclusions: the data needs some rescaling. We could dive into the data, trying to get a better grip on correlations etc., but that is another lesson.

# Select models and compare performance
Now select some models. Before you try them, try to guess which models will work better on this dataset and which will probably work worse. This way, you can improve your own priors about these models.

In [None]:
# Compare candidate classifiers with k-fold cross-validation.
# Model selection is done on the TRAINING data only; the test set stays
# untouched so that the final evaluation further down remains unbiased.
cv = 5
classifiers = [
    ('svc-linear', SVC(kernel='linear')),
    ('svc-kernel', SVC()),
    # Seed the randomized models so a re-run reproduces the same scores.
    ('random-forest', RandomForestClassifier(random_state=42)),
    ('naive bayes', GaussianNB()),
    ('gaussian', GaussianProcessClassifier()),
    ('kNN', KNeighborsClassifier(3)),
    ('decision tree', DecisionTreeClassifier(random_state=42))
]

plt.figure(figsize=(10,10))
for i, (name, clf) in enumerate(classifiers):
    # cross_val_score clones the estimator and refits it on every fold, so no
    # prior fit is needed to obtain the scores.
    # NOTE(review): X_train was scaled on the full training set beforehand, so
    # the folds share scaling statistics; wrap scaler + model in a Pipeline if
    # a strictly leak-free estimate is required.
    result = cross_val_score(clf, X_train, y_train, cv = cv, scoring='f1_macro')

    mu = np.mean(result)
    # Standard error of the mean: sample std (ddof=1) over sqrt(number of folds).
    # len(result) is used because `cv` is not guaranteed to be an int.
    stderr = np.std(result, ddof=1)/np.sqrt(len(result))

    # One marker with error bars per model.
    plt.errorbar(i, mu, yerr=stderr, fmt='o', capsize=4, label=name)

    # Leave a fitted estimator in `classifiers` for any later cell that uses it.
    clf.fit(X_train, y_train)

plt.xticks(np.arange(len(classifiers)), [label for label, _ in classifiers], rotation=45)
plt.ylabel('macro F1 (mean ± standard error over CV folds)')
plt.title('Cross-validated performance on the training set')
# Draw the legend once, after all models have been plotted.
plt.legend(loc=3)
plt.show()

# Evaluate
Pick one model and evaluate its performance more extensively by making a precision-recall curve, a ROC curve and a confusion matrix.

In [None]:
# Report, per classifier, which continuous-score methods it exposes.
# The threshold-based curves below need one of these rather than hard labels.
score_methods = (
    ("decision_function", "decision_function:{}"),
    ("predict_proba", "predict_proba:{}"),
)
for name, clf in classifiers:
    for attribute, template in score_methods:
        if hasattr(clf, attribute):
            print(template.format(name))

In [None]:
# Model chosen for the detailed evaluation: a Gaussian process classifier,
# fitted on the (scaled) training split.
clf = GaussianProcessClassifier()
clf.fit(X=X_train, y=y_train)

In [None]:
# Class probabilities for the test split; show the first five true labels
# next to the first five probability rows to compare their formats.
proba = clf.predict_proba(X_test)
y_test.iloc[:5], proba[:5]

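The labels in `y_test` are strings, while `predict_proba` returns one probability column per class, so we need a binary version of `y_test` to compare against. Let's do that simply with `y_test == "B"`.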

In [None]:
# Score the held-out test set with the classifier that was fitted on the
# training data. Cross-validating on the test set would instead retrain fresh
# copies on test folds and evaluate a different model than the one used for
# the confusion matrix.
y_decision = clf.predict_proba(X_test)
# Columns of predict_proba follow clf.classes_; look up the 'B' column rather
# than assuming its position.
b_col = list(clf.classes_).index('B')
# 'B' is treated as the positive class, matching the `y_test == 'B'` target.
precision, recall, thresholds = precision_recall_curve(y_test == 'B', y_decision[:, b_col])

In [None]:
# Precision and recall as a function of the decision threshold.
# precision/recall hold one more entry than thresholds, so the final point
# is dropped to align the three arrays.
sns.set_theme()
data = pd.DataFrame(
    dict(precision=precision[:-1], recall=recall[:-1], thresholds=thresholds)
)
sns.lineplot(data=data, x='thresholds', y='precision', label='precision')
sns.lineplot(data=data, x='thresholds', y='recall', label='recall')

In [None]:
# ROC curve with 'B' as the positive class, scored by the first probability column.
fpr, tpr, thresholds = roc_curve(y_test == 'B', y_decision[:,0])
data = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
plot = sns.lineplot(data=data, x='fpr', y='tpr')
plot.set(xlabel='FPR', ylabel='TPR')
# Dashed diagonal as a visual reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], 'k--')

In [None]:
# Hard label predictions on the test split, tabulated against the true labels
# (rows: true class, columns: predicted class).
yhat = clf.predict(X_test)
cfm = confusion_matrix(y_true=y_test, y_pred=yhat)
cfm