# BME-336546-C06-Nonlinear classification (SVM and Random forest)


## Data loading

In [None]:

import numpy as np
import pickle
import sys
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
mpl.style.use(['ggplot']) 
%matplotlib inline
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline

In this tutorial we will use a synthetic dataset generated with `sklearn.datasets`. The chosen data are two noisy concentric circles.

In [None]:
# Import from the public `sklearn.datasets` namespace: the private
# `sklearn.datasets.samples_generator` module was deprecated and later removed
# from scikit-learn, while this path works on old and new releases alike.
from sklearn.datasets import make_circles

# 1000 samples drawn from two concentric circles (inner/outer scale factor 0.1)
# with Gaussian noise added; the fixed seed keeps the dataset reproducible.
X, y = make_circles(1000, factor=.1, noise=.2, random_state=336546)

As usual, we should split our data into training and testing sets:

In [None]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# ratio equal in both subsets, and the fixed seed makes the split repeatable.
X_train, x_test, Y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=10,
)

In [None]:
def plot_radar(clf, clf_type):
    """Draw train/validation radar charts for a fitted multi-metric grid search.

    One polar subplot is drawn for every hyper-parameter combination in
    ``clf.cv_results_['params']`` whose kernel (and gamma, when the grid
    contains one) matches ``clf_type``. Each subplot shows the mean train and
    mean validation ("test" in ``cv_results_``) scores for accuracy, F1,
    precision (PPV), recall (sensitivity) and ROC AUC.

    Parameters
    ----------
    clf : fitted search object
        Must expose ``cv_results_`` containing ``mean_train_<metric>`` and
        ``mean_test_<metric>`` for ``accuracy``, ``f1``, ``precision``,
        ``recall`` and ``roc_auc``, plus a ``'params'`` list whose dicts use
        the keys ``svm__kernel``, ``svm__C`` and optionally ``svm__gamma``.
    clf_type : sequence
        ``[kernel]`` when the grid has no gamma entry, otherwise
        ``[kernel, gamma]``; selects which combinations are plotted.
    """
    labels = np.array(['Accuracy', 'F1', 'PPV', 'Sensitivity', 'AUROC'])
    # cv_results_ suffixes, listed in the same order as `labels`.
    metric_keys = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    results = clf.cv_results_
    # Rows are metrics, columns are hyper-parameter combinations.
    score_mat_train = np.stack([results['mean_train_' + key] for key in metric_keys], axis=0)
    score_mat_val = np.stack([results['mean_test_' + key] for key in metric_keys], axis=0)

    angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False)
    # Repeat the first angle so that each polygon is closed.
    angles = np.concatenate((angles, [angles[0]]))

    cv_dict = results['params']
    has_gamma = 'svm__gamma' in cv_dict[0]
    fig = plt.figure(figsize=(18, 14))
    if has_gamma:
        new_list = [(i, item) for i, item in enumerate(cv_dict) if
                    item["svm__kernel"] == clf_type[0] and item["svm__gamma"] == clf_type[1]]
    else:
        new_list = [(i, item) for i, item in enumerate(cv_dict) if
                    item["svm__kernel"] == clf_type[0]]

    for idx, (rel_idx, rel_dict) in enumerate(new_list):
        ax = fig.add_subplot(1, len(new_list), 1 + idx, polar=True)

        stats_train = score_mat_train[:, rel_idx]
        stats_train = np.concatenate((stats_train, [stats_train[0]]))
        train_line, = ax.plot(angles, stats_train, 'o-', linewidth=2)
        ax.fill(angles, stats_train, alpha=0.25)

        stats_val = score_mat_val[:, rel_idx]
        stats_val = np.concatenate((stats_val, [stats_val[0]]))
        val_line, = ax.plot(angles, stats_val, 'o-', linewidth=2)
        ax.fill(angles, stats_val, alpha=0.25)

        ax.set_thetagrids(angles[:-1] * 180/np.pi, labels)
        if idx == 0:
            ax.set_ylabel(clf_type[0], fontsize=18)
        ax.set_title('C = %.3f' % (rel_dict['svm__C']))
        if has_gamma:
            # Raw string: '\g' is not a valid escape sequence in a normal literal.
            ax.set_xlabel(r'$\gamma = %s $' % (rel_dict['svm__gamma']))
        ax.set_ylim([0, 1])
        # Pass the two line handles explicitly. A labels-only legend pairs the
        # labels with artists in draw order, which can attach 'Validation' to
        # the train fill patch instead of the validation line.
        ax.legend([train_line, val_line], ['Train', 'Validation'])
        ax.grid(True)

    plt.show()

In [None]:
def plot_dataset(x_train, x_test, y_train, y_test, axes_range):
    """Scatter the train and test sets on two side-by-side axes.

    Samples labelled 0 are drawn as blue squares and samples labelled 1 as
    green triangles; the left panel shows the training set and the right
    panel the test set.

    Parameters
    ----------
    x_train, x_test : arrays indexed as ``[:, 0]`` and ``[:, 1]``
        Feature matrices; only the first two columns are plotted.
    y_train, y_test : arrays
        Labels compared against 0 and 1 to build the class masks.
    axes_range : sequence
        Passed unchanged to ``ax.axis`` for both panels.

    Returns
    -------
    (fig, axs)
        The figure and the array of two axes created by ``plt.subplots``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(12,6))
    panels = (
        ('Train', x_train, y_train),
        ('Test', x_test, y_test),
    )
    for ax, (title, points, targets) in zip(axs.flatten(), panels):
        is_class0 = targets==0
        is_class1 = targets==1
        ax.plot(points[:, 0][is_class0], points[:, 1][is_class0], "bs")
        ax.plot(points[:, 0][is_class1], points[:, 1][is_class1], "g^")
        ax.set_title(title)
        ax.axis(axes_range)
        ax.grid(True, which='both')
        ax.set_xlabel(r"$x_1$", fontsize=20)
        ax.set_ylabel(r"$x_2$", fontsize=20, rotation=0)
    return fig, axs

In [None]:
def plot_predictions(clf, axs, axes_range):
    """Overlay a classifier's predictions and decision function on given axes.

    A 100 x 100 grid spanning ``axes_range`` is evaluated once with
    ``clf.predict`` and once with ``clf.decision_function``; both results are
    drawn as translucent filled contours on every axis in ``axs``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict`` and ``decision_function``.
    axs : array of matplotlib axes
        The first axis is titled 'Train', every other one 'Test'.
    axes_range : sequence
        ``[x0_min, x0_max, x1_min, x1_max]`` limits of the evaluation grid.
    """
    grid_x0, grid_x1 = np.meshgrid(
        np.linspace(axes_range[0], axes_range[1], 100),
        np.linspace(axes_range[2], axes_range[3], 100),
    )
    # One row per grid point, one column per feature.
    grid_points = np.column_stack((grid_x0.ravel(), grid_x1.ravel()))
    grid_shape = grid_x0.shape
    y_pred = clf.predict(grid_points).reshape(grid_shape)
    y_decision = clf.decision_function(grid_points).reshape(grid_shape)
    for position, ax in enumerate(axs.flatten()):
        ax.contourf(grid_x0, grid_x1, y_pred, cmap=plt.cm.brg, alpha=0.2)
        ax.contourf(grid_x0, grid_x1, y_decision, cmap=plt.cm.brg, alpha=0.1)
        ax.set_title('Train' if position == 0 else 'Test')

Let's look at our data sets:

In [None]:
# Plot window given as [x1_min, x1_max, x2_min, x2_max].
axes_range = [-3, 3, -2, 2]
# The returned figure and axes are not needed here; binding them to `_`
# also keeps the notebook from echoing the return value.
_, _ = plot_dataset(
    x_train=X_train,
    x_test=x_test,
    y_train=Y_train,
    y_test=y_test,
    axes_range=axes_range,
)

This data set is clearly not linearly separable, at least not in this feature space. We will now use the `StratifiedKFold` class, as in the previous tutorial, with 3 splits, and then use `GridSearchCV` to look for the best linear SVM among the given C values (misclassification penalties). Use `Pipeline` to include scaling, and note that in the parameter grid the kernel should not be a bare string but rather a one-element list containing the string for the linear kernel.\
Again, we will select the best estimator according to AUROC, and for "running info" we will use `verbose=3`. Name the `GridSearchCV` object `svm_lin` and use the string `svm` as the name of the SVM step in the `Pipeline`. Note that `plot_radar` reads the mean train and validation scores for accuracy, F1, precision, recall and AUROC from `cv_results_`, so the search must compute all five metrics and return the training scores.

In [None]:
# Three stratified folds; shuffling with a fixed seed keeps the fold
# assignment identical from run to run.
n_splits = 3
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=10,
)

In [None]:
#C1
# Exercise scaffold: build the linear-SVM grid search between the marker lines.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# probability=True enables predict_proba, which later cells use to compute AUROC.
svc = SVC(probability=True, random_state=336546)
# Candidate misclassification penalties for the grid search.
C = np.array([0.001, 0.01, 1, 10, 100, 1000])
#--------------------------Implement your code here:-------------------------------------

#------------------------------------------------------------------------------------------

Choose the best linear model and name it `best_svm_lin`. In addition, print the parameters of the best classifier.

In [None]:
#C2
#--------------------------Implement your code here:-------------------------------------

#-----------------------------------------------------------------------------------------

### Expected output:
<center><img src="outputs/1.PNG" width="380"></center>

Now let's look at the performance as a function of the misclassification penalty:

In [None]:
# One radar chart per C value tried with the linear kernel.
clf_type = ['linear']
plot_radar(clf=svm_lin, clf_type=clf_type)

### Expected output:
<center><img src="outputs/2.PNG" width="880"></center>

And now, we would like to visualize the classification of our chosen estimator using `decision_function` method (used within `plot_predictions`).

In [None]:
# Redraw the train/test scatter plots, then overlay the linear SVM's predicted
# classes and decision-function contours on both panels.
axes_range = [-3, 3, -2, 2]
_, axs = plot_dataset(
    x_train=X_train,
    x_test=x_test,
    y_train=Y_train,
    y_test=y_test,
    axes_range=axes_range,
)
plot_predictions(clf=best_svm_lin, axs=axs, axes_range=axes_range)

### Expected output:
<center><img src="outputs/3.PNG" width="380"></center>

In [None]:
from sklearn.metrics import confusion_matrix


# Each helper recomputes the confusion matrix and returns a single entry.
# Rows index the true class and columns the predicted class.
def calc_TN(y_true, y_pred):
    """Return entry [0, 0] of the confusion matrix (true negatives)."""
    return confusion_matrix(y_true, y_pred)[0, 0]


def calc_FP(y_true, y_pred):
    """Return entry [0, 1] of the confusion matrix (false positives)."""
    return confusion_matrix(y_true, y_pred)[0, 1]


def calc_FN(y_true, y_pred):
    """Return entry [1, 0] of the confusion matrix (false negatives)."""
    return confusion_matrix(y_true, y_pred)[1, 0]


def calc_TP(y_true, y_pred):
    """Return entry [1, 1] of the confusion matrix (true positives)."""
    return confusion_matrix(y_true, y_pred)[1, 1]

Calculate the statistics and plot the confusion matrix as in previous tutorial.

In [None]:
#C3
from sklearn.metrics import roc_auc_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Alias its
    # replacement, which takes the same leading arguments (estimator, X, y),
    # so the rest of the cell works on old and new releases.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# The estimator is fed the raw test data (see the note below).
y_pred_test = best_svm_lin.predict(x_test) #NOTICE NOT TO USE THE STANDARDIZED DATA.
y_pred_proba_test = best_svm_lin.predict_proba(x_test)
#--------------------------Implement your code here:-------------------------------------

#------------------------------------------------------------------------------------------
# Column 1 of predict_proba holds the probability of the positive class (label 1).
print('AUROC is {:.3f}'.format(roc_auc_score(y_test, y_pred_proba_test[:,1])))

### Expected output:
<center><img src="outputs/4.PNG" width="380"></center>

Now we will look for a **nonlinear** SVM classifier. Use the kernels `rbf` and `poly`. For $\gamma$, use `auto` and `scale`. The degree of the polynomial should be 3 (so it won't take more than several minutes). Remember to use a single-element list for this.

In [None]:
#C4
# Exercise scaffold: build the nonlinear-SVM grid search between the marker lines.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# probability=True fits the probability model with an internally shuffled
# cross-validation, so a fixed random_state (same seed as the linear search)
# is needed for reproducible predict_proba / AUROC results.
svc = SVC(probability=True, random_state=336546)
# Smaller penalty grid than the one used for the linear search.
C = np.array([1, 100, 1000])
#--------------------------Implement your code here:-------------------------------------

#-----------------------------------------------------------------------------------------

Choose the best estimator and name it as `best_svm_nonlin`.

---
<span style="color:red">***Question:***</span> *Which kernel do you think is more proper to use?*

---

In [None]:
#C5
#--------------------------Implement your code here:-------------------------------------

#------------------------------------------------------------------------------------------

### Expected output:
<center><img src="outputs/5.PNG" width="380"></center>

Now you can plot the performance as a function of the misclassification penalty for different kernels and $\gamma$.

In [None]:
# Radar charts for the RBF kernel with gamma='scale' ...
clf_type = ['rbf', 'scale']
plot_radar(clf=svm_nonlin, clf_type=clf_type)
# ... and for the polynomial kernel with gamma='scale'.
clf_type = ['poly', 'scale']
plot_radar(clf=svm_nonlin, clf_type=clf_type)

### Expected output:
<center><img src="outputs/6.PNG" width="700"></center>

Let's visualize the decision function upon our data:

In [None]:
# Redraw the train/test scatter plots (reusing the axes_range defined in an
# earlier cell) and overlay the nonlinear SVM's decision surface.
_, axs = plot_dataset(
    x_train=X_train,
    x_test=x_test,
    y_train=Y_train,
    y_test=y_test,
    axes_range=axes_range,
)
plot_predictions(clf=best_svm_nonlin, axs=axs, axes_range=axes_range)

### Expected output:
<center><img src="outputs/7.PNG" width="380"></center>

Calculate the statistics and plot the confusion matrix as in previous tutorial.

In [None]:
#C6
from sklearn.metrics import roc_auc_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Alias its
    # replacement, which takes the same leading arguments (estimator, X, y),
    # so the rest of the cell works on old and new releases.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# The estimator is fed the raw test data (see the note below).
y_pred_test = best_svm_nonlin.predict(x_test) #NOTICE NOT TO USE THE STANDARDIZED DATA.
y_pred_proba_test = best_svm_nonlin.predict_proba(x_test)
#--------------------------Implement your code here:-------------------------------------

#------------------------------------------------------------------------------------------
# Column 1 of predict_proba holds the probability of the positive class (label 1).
print('AUROC is {:.3f}'.format(roc_auc_score(y_test, y_pred_proba_test[:,1])))

### Expected output:
<center><img src="outputs/8.PNG" width="400"></center>

Here, we will only introduce a powerful classifier named *random forest* that was also used in the last part of *HW1*. Calculate the statistics and plot the confusion matrix as in previous tutorial.

In [None]:
#C7
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    # Both plotting helpers were removed in scikit-learn 1.2. Alias their
    # replacements, which take the same leading arguments (estimator, X, y),
    # so this cell and the ROC comparison work on old and new releases.
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator

# Scaling is part of the pipeline, so the raw (unscaled) data is passed to
# both fit and predict.
rfc = Pipeline(steps=[('scale', StandardScaler()), ('rfc', RandomForestClassifier(max_depth=4, random_state=336546, criterion='gini'))])
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(x_test) #NOTICE NOT TO USE THE STANDARDIZED DATA.
y_pred_proba_test = rfc.predict_proba(x_test)
#--------------------------Implement your code here:-------------------------------------

#------------------------------------------------------------------------------------------
# Column 1 of predict_proba holds the probability of the positive class (label 1).
print('AUROC is {:.3f}'.format(roc_auc_score(y_test, y_pred_proba_test[:,1])))

### Expected output:
<center><img src="outputs/9.PNG" width="380"></center>

Finally, we will compare the classifiers according to AUROC.

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; fall back to its replacement,
# which takes the same (estimator, X, y, ax=...) arguments.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

classifiers = [best_svm_lin, best_svm_nonlin, rfc]
roc_score = []
plt.figure()
ax = plt.gca()
for clf in classifiers:
    # Draw every ROC curve on the same axes so that they can be compared.
    plot_roc_curve(clf, x_test, y_test, ax=ax)
    # np.round replaces the alias np.round_, which was removed in NumPy 2.0.
    roc_score.append(np.round(roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]), decimals=3))
# Diagonal reference line: the ROC of a random ("coin flip") classifier.
ax.plot(np.linspace(0,1,x_test.shape[0]),np.linspace(0,1,x_test.shape[0]))
# Labels are listed in drawing order: three classifiers, then the diagonal.
plt.legend(('lin_svm, AUROC = '+str(roc_score[0]),'nonlin_svm, AUROC = '+str(roc_score[1]),'rfc, AUROC = '+str(roc_score[2]),'flipping a coin'))

### Expected output:
<center><img src="outputs/10.PNG" width="380"></center>

#### *This tutorial was written by [Moran Davoodi](mailto:morandavoodi@gmail.com) & Alon Begin with the assistance of [Yuval Ben Sason](mailto:yuvalbse@gmail.com) & Kevin Kotzen*