# Experiment 02: Scattering + PCA + SVM






In [1]:
import sys
sys.path.append('../src')
#import warnings
#warnings.filterwarnings("ignore") 

from utils.reduce import reduce_pca
from utils.split import train_test_split
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

from itertools import product
import pickle
import pandas as pd
import numpy as np
import mlflow
import matplotlib.pyplot as plt


In [2]:
# Send every run logged by the cross-validation cells to this MLflow experiment.
cv_experiment_name = 'scattering_svm_pca_experimentv2'
mlflow.set_experiment(cv_experiment_name)

INFO: 'scattering_svm_pca_experimentv2' does not exist. Creating a new experiment


  and should_run_async(code)


## Feature Reduction/Selection

#### Upload Scattering Features

In [3]:
# Read the pickled scattering-feature bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open('../data/03_features/scattering_features.pickle', 'rb') as handle:
    scatter_dict = pickle.load(handle)

# Feature table stored under 'df'.
df_scattering = scatter_dict['df']
# Keep the J, M and N entries together so they can be logged with the runs.
scattering_params = {name: scatter_dict[name] for name in ('J', 'M', 'N')}

#### Apply PCA

Since sklearn is used for PCA, the dataset will be transformed into a pandas DataFrame.

# Cross Validation using SVM Classification

> Methods that exclude outliers were used to normalize the features. Patient-specific leave-one-out cross-validation (LOOCV) was applied to evaluate the classification. In each case, the test set consisted of 10 images from the same patient and the training set contained 540 images from the remaining 54 patients. For each training set, fivefold cross-validation and grid search were applied to indicate the optimal SVM classifier hyperparameters and the best kernel. To address the problem of class imbalance, the SVM hyperparameter C of each class was adjusted inversely proportional to that class frequency in the training set. Label 1 indicated the image containing a fatty liver and label −1 otherwise. 


In [4]:
# Split the scattering feature table into a training frame and a test frame
# using the project helper imported from utils.split.
df_train, df_test = train_test_split(df_scattering)

In [5]:
# Set the parameters by cross-validation
param_gamma = [1e-3, 1e-4]
param_C = [1, 10, 100, 1000] 
pca_n_components = [5,10,15]
svm_class_weight = [None, 'balanced']
rbf_params = list(product(['kernel'],param_gamma, param_C,svm_class_weight ))
linear_params = list(product(['linear'],param_C, svm_class_weight))
params = rbf_params + linear_params

In [6]:
# Pipeline switches: standardise the PCA output, and keep 5 principal components.
standardize, pca_n_components = True, 5

# Detach the label and patient-id columns from the training frame.
# pop() removes the column from df_train in place, so this cell cannot be
# re-run on the same frame without re-creating the split first.
df_train_y = df_train.pop('class')
df_train_pid = df_train.pop('id')

In [7]:
# Patient-grouped 5-fold cross-validation of every SVM configuration in `params`.
# Per-fold scores are collected in `metrics`, keyed by str(param); one MLflow run
# per configuration is logged once all folds have finished.
n_splits = 5
metrics = {}

# Re-create the split here so this cell does not depend on earlier in-place pops.
df_train, df_test = train_test_split(df_scattering)
standardize = True
pca_n_components = 5
df_train_pid = df_train.pop('id')
df_train_y = df_train.pop('class')

# The id column is passed as the group label, so all rows sharing an id stay
# on the same side of each train/validation split.
group_kfold = GroupKFold(n_splits=n_splits)

for train_index, valid_index in group_kfold.split(df_train,
                                                  df_train_y,
                                                  df_train_pid):

    X_train, X_valid = df_train.iloc[train_index], df_train.iloc[valid_index]
    y_train, y_valid = df_train_y.iloc[train_index], df_train_y.iloc[valid_index]

    # Fit PCA on the training part of the fold only, then project both parts.
    pca = PCA(n_components=pca_n_components)
    X_train = pca.fit_transform(X_train)
    X_valid = pca.transform(X_valid)

    # Standardise the projected features with statistics from the training part.
    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

    for param in tqdm(params):
        fold_scores = metrics.setdefault(
            str(param),
            {'acc': [], 'auc': [], 'sensitivity': [], 'specificity': []},
        )

        # class_weight='balanced' adjusts weights inversely proportional to class
        # frequencies: n_samples / (n_classes * np.bincount(y)).
        if param[0] == 'kernel':
            model = SVC(gamma=param[1], C=param[2], class_weight=param[3])
        elif param[0] == 'linear':
            model = LinearSVC(C=param[1], class_weight=param[2])
        else:
            # Fail loudly instead of silently re-using the previous model.
            raise ValueError(f'Unknown SVM configuration: {param!r}')

        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        # Continuous decision values: AUC needs a ranking of the samples, while
        # hard 0/1 predictions collapse it to a single operating point.
        decision_scores = model.decision_function(X_valid)

        acc = accuracy_score(y_valid, predictions)
        auc = roc_auc_score(y_valid, decision_scores)
        # Pin the label order so the matrix is always 2x2 and unpacks as
        # tn, fp, fn, tp. Assumes classes are encoded 0/1, as the test labels
        # shown later in this notebook are.
        tn, fp, fn, tp = confusion_matrix(y_valid, predictions, labels=[0, 1]).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)

        fold_scores['auc'].append(auc)
        fold_scores['acc'].append(acc)
        fold_scores['sensitivity'].append(sensitivity)
        fold_scores['specificity'].append(specificity)

# Log one MLflow run per configuration, after ALL folds are done, so every
# logged metric is the mean over the complete cross-validation.
for param in params:
    cv_scores = metrics[str(param)]
    with mlflow.start_run():
        # log params
        mlflow.log_param('pca_n', pca_n_components)
        mlflow.log_params(scattering_params)
        mlflow.log_param('model', f'svm: {param[0]}')
        if param[0] == 'kernel':
            mlflow.log_param('gamma', param[1])
            mlflow.log_param('C', param[2])
            mlflow.log_param('class weight svm', param[3])
        elif param[0] == 'linear':
            mlflow.log_param('C', param[1])
            mlflow.log_param('class weight svm', param[2])
        # log metrics
        mlflow.log_metric('accuracy', np.array(cv_scores['acc']).mean())
        mlflow.log_metric('AUC', np.array(cv_scores['auc']).mean())
        mlflow.log_metric('specificity', np.array(cv_scores['specificity']).mean())
        mlflow.log_metric('sensitivity', np.array(cv_scores['sensitivity']).mean())




100%|██████████| 24/24 [00:00<00:00, 100.40it/s]
100%|██████████| 24/24 [00:00<00:00, 118.81it/s]
100%|██████████| 24/24 [00:00<00:00, 129.77it/s]
100%|██████████| 24/24 [00:00<00:00, 151.92it/s]
100%|██████████| 24/24 [00:00<00:00, 99.53it/s] 


# Analyzing PCA




In [None]:
# Fit a PCA on the training features so the explained-variance curve can be
# inspected in the following cells.
# The original cell referenced an undefined name `p50` (NameError on execution);
# 50 components is assumed to be the intended value -- TODO confirm. This needs
# at least 50 rows and 50 feature columns in df_train.
pca_analysis_components = 50
pca = PCA(n_components=pca_analysis_components)
data = pca.fit_transform(df_train)

In [None]:
# Cumulative explained-variance ratio, with a leading 0 prepended so the curve
# starts at zero components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
variance_curve = np.insert(cumulative_variance, 0, 0)

plt.plot(variance_curve, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
# Print the running total of the explained-variance ratios of the fitted PCA.
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
print(cumulative_ratio)

# Test Prediction

In [2]:
# !mlflow ui
# Plan for this section:
#   1. Switch to a new MLflow experiment dedicated to test results.
#   2. Use the best hyperparameters to train a model on the whole training data.
#   3. Test and record the results.
test_experiment_name = 'test_results_dataset_liver_bmodes_steatosis_assessment_IJCARS'
mlflow.set_experiment(test_experiment_name)

  and should_run_async(code)


Best combination of hyperparameters
<img width="788" alt="Screen Shot 2020-09-29 at 10 46 57 PM" src="https://user-images.githubusercontent.com/23482039/94637580-f0569e80-02a5-11eb-9c4a-0d06abe20d85.png">

In [15]:
# Reload the pickled scattering-feature bundle so the test section starts from
# an unmodified feature table.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open('../data/03_features/scattering_features.pickle', 'rb') as handle:
    scatter_dict = pickle.load(handle)

# Feature table stored under 'df', plus the J, M and N entries of the bundle.
df_scattering = scatter_dict['df']
scattering_params = {name: scatter_dict[name] for name in ('J', 'M', 'N')}

In [16]:
# Re-create the train/test split from the freshly loaded feature table.
df_train, df_test = train_test_split(df_scattering)
# Show the distinct patient ids in the test set (displayed as the cell output).
df_test['id'].unique()

array([ 4,  9, 28, 33, 52, 53], dtype=uint8)

In [17]:
# Number of principal components kept for the final model.
pca_n_components = 5

# Remove the id column from both splits (in place); the returned ids are discarded.
for frame in (df_train, df_test):
    frame.pop('id')

# Separate the labels from the remaining columns.
df_train_y = df_train.pop('class')
df_test_y = df_test.pop('class')

# Fit PCA on the training data only and project both splits with it.
# From here on df_train / df_test hold the PCA outputs rather than the frames.
pca = PCA(n_components=pca_n_components)
df_train = pca.fit_transform(df_train)
df_test = pca.transform(df_test)



In [18]:
# Learn the scaling statistics from the training projection only, then apply
# the same scaling to both splits.
scaler = StandardScaler().fit(df_train)
df_train = scaler.transform(df_train)
df_test = scaler.transform(df_test)

  and should_run_async(code)


In [23]:
# Hyper-parameters of the final SVC (class_weight='balanced' is intentionally left off).
final_svm_kwargs = {'C': 1000, 'gamma': 0.0001}

# Train on the full training set, score the test set and record the result
# in a single MLflow run.
with mlflow.start_run():
    model = SVC(**final_svm_kwargs)
    model.fit(df_train, df_train_y)
    predictions = model.predict(df_test)
    acc = accuracy_score(df_test_y, predictions)
    mlflow.log_param('Model', 'Scattering features + PCA + SVM')
    mlflow.log_metric('accuracy', acc)



In [24]:
# Display the predicted class of every test row.
predictions

  and should_run_async(code)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint8)

In [14]:
# Display the true class of every test row for comparison with the predictions.
df_test_y

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    1
40    1
41    1
42    1
43    1
44    1
45    1
46    1
47    1
48    1
49    1
50    1
51    1
52    1
53    1
54    1
55    1
56    1
57    1
58    1
59    1
Name: class, dtype: uint8

In [8]:
# Report the accuracy computed on the test set by the final-model cell.
print('The test accuracy of the model is ', acc)

The test accuracy of the model is  0.8333333333333334


  and should_run_async(code)
