# Experiment 02: Scattering + PCA + SVM






In [1]:
import sys
import random
sys.path.append('../src')
import warnings
warnings.filterwarnings("ignore") 

from utils.compute_metrics import get_metrics, get_majority_vote,log_test_metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from pprint import pprint

from itertools import product
import pickle
import pandas as pd
import numpy as np
import mlflow
import matplotlib.pyplot as plt


## Feature Reduction/Selection

#### Load Scattering Features

In [2]:
# Read the pre-computed scattering features from disk.
# NOTE: pickle can run arbitrary code while loading -- only open feature files you trust.
with open('../data/03_features/scattering_features_J_3.pickle', 'rb') as handle:
    scatter_dict = pickle.load(handle)

# Feature table plus the scattering settings stored alongside it.
df_scattering = scatter_dict['df']
scattering_params = {key: scatter_dict[key] for key in ('J', 'M', 'N')}

  and should_run_async(code)


# Cross Validation using SVM Classification

> Methods that exclude outliers were used to normalize the features. Patient-specific leave-one-out cross-validation (LOOCV) was applied to evaluate the classification. In each case, the test set consisted of 10 images from the same patient and the training set contained 540 images from the remaining 54 patients. For each training set, fivefold cross-validation and grid search were applied to indicate the optimal SVM classifier hyperparameters and the best kernel. To address the problem of class imbalance, the SVM hyperparameter C of each class was adjusted inversely proportional to that class frequency in the training set. Label 1 indicated the image containing a fatty liver and label −1 otherwise.

**Note:** the cells below do not run strict LOOCV. They use an 11-fold outer split grouped by `id` (`GroupKFold(n_splits=11)`) and a 5-fold grouped inner split for the hyperparameter search.


In [3]:
# Hyperparameter grid explored by the inner cross-validation.
param_gamma = [1e-3, 1e-4]
param_C = [1, 10, 1000]
svm_class_weight = [None, 'balanced']

# SVC candidates: ('kernel', gamma, C, class_weight), gamma varying slowest.
rbf_params = [
    ('kernel', gamma, C, weight)
    for gamma in param_gamma
    for C in param_C
    for weight in svm_class_weight
]
# LinearSVC candidates: ('linear', C, class_weight).
linear_params = [
    ('linear', C, weight)
    for C in param_C
    for weight in svm_class_weight
]
# Full search space: SVC candidates first, then the linear ones.
params = [*rbf_params, *linear_params]

In [4]:
def train_valid(param, X_train,X_valid,y_train, y_valid):
    """Fit the SVM described by ``param`` and score it on a held-out set.

    Parameters
    ----------
    param : tuple
        Hyperparameter combination; its first element selects the model:

        * ``('kernel', gamma, C, class_weight)`` builds an ``SVC`` (which uses
          scikit-learn's default kernel, since none is passed here).
        * ``('linear', C, class_weight)`` builds a ``LinearSVC``.
    X_train, y_train
        Features and labels the model is fitted on.
    X_valid, y_valid
        Features and labels the fitted model is evaluated on.

    Returns
    -------
    tuple
        ``(acc, auc, specificity, sensitivity, predictions)``: the four values
        returned by ``get_metrics(y_valid, predictions)`` followed by the
        output of ``model.predict(X_valid)``.

    Raises
    ------
    ValueError
        If ``param[0]`` is neither ``'kernel'`` nor ``'linear'``. Previously
        this fell through both branches and failed later with an
        ``UnboundLocalError`` on ``model``.
    """
    model_kind = param[0]
    # In both branches class_weight='balanced' makes scikit-learn weight classes inversely
    # proportional to their frequencies: n_samples / (n_classes * np.bincount(y)).
    if model_kind == 'kernel':
        model = SVC(gamma=param[1], C=param[2], class_weight=param[3])
    elif model_kind == 'linear':
        model = LinearSVC(C=param[1], class_weight=param[2])
    else:
        raise ValueError(
            f"Unknown model kind {model_kind!r}; expected 'kernel' or 'linear'"
        )

    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    acc, auc, specificity, sensitivity = get_metrics(y_valid, predictions)
    return acc, auc, specificity, sensitivity, predictions


In [5]:
def log_val_metrics(params, metrics, test_n_splits, num_components=5, standardize=True):
    """Log one MLflow run per hyperparameter combination with its mean validation metrics.

    Parameters
    ----------
    params : iterable of tuple
        Hyperparameter combinations, either ``('kernel', gamma, C, class_weight)``
        or ``('linear', C, class_weight)``.
    metrics : dict
        Maps ``str(param)`` to a dict with the keys ``'acc'``, ``'auc'``,
        ``'specificity'`` and ``'sensitivity'``, each holding the per-fold
        values that are averaged before logging.
    test_n_splits : int
        Number of outer test folds, logged as ``'test K fold'``.
    num_components : int, default 5
        Number of PCA components, logged as ``'pca_n'``.
    standardize : bool, default True
        Whether features were standardized, logged as ``'standardize'``.
        (This argument used to be accepted but never recorded.)

    Notes
    -----
    Reads the module-level ``scattering_params`` dict, so the feature-loading
    cell must have run before this function is called.
    """
    # Important !!!! Put Correct Experiment Name
    mlflow.set_experiment('val_scattering_svm_pca_experiment')
    for param in params:
        with mlflow.start_run():
            # --- run parameters ---
            mlflow.log_param('pca_n', num_components)
            mlflow.log_param('standardize', standardize)
            mlflow.log_params(scattering_params)
            mlflow.log_param('model', f'svm: {param[0]}')
            mlflow.log_param('test K fold', test_n_splits)
            if param[0] == 'kernel':
                mlflow.log_param('gamma', param[1])
                mlflow.log_param('C', param[2])
                mlflow.log_param('class weight svm', param[3])
            if param[0] == 'linear':
                mlflow.log_param('C', param[1])
                mlflow.log_param('class weight svm', param[2])

            # --- run metrics: mean over the validation folds ---
            fold_scores = metrics[str(param)]
            mlflow.log_metric('accuracy', np.array(fold_scores['acc']).mean())
            mlflow.log_metric('AUC', np.array(fold_scores['auc']).mean())
            mlflow.log_metric('specificity', np.array(fold_scores['specificity']).mean())
            mlflow.log_metric('sensitivity', np.array(fold_scores['sensitivity']).mean())
    print("Done logging validation params in MLFlow")

In [6]:
# Nested, group-aware cross-validation: scattering features -> PCA -> (optional) standardization -> SVM.
df = df_scattering
pca_n_components = 5
standardize = True
test_metrics = {}
# majority vote results
test_metrics_mv = {}
# Outer test split ("55 and 11" in the original note -- presumably 55 patients over 11 folds; TODO confirm)
test_n_splits = 11
group_kfold_test = GroupKFold(n_splits=test_n_splits)
seed = 11
df_pid = df['id']
df_y = df['class']
fold_c = 1

for train_index, test_index in group_kfold_test.split(df, df_y, df_pid):
    # Shuffle the training rows in place; re-seeding makes the permutation repeatable.
    random.seed(seed)
    random.shuffle(train_index)
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Strip the non-feature columns; the training ids are kept as groups for the inner split.
    X_test = X_test.drop(columns=['id', 'class'])
    X_train_pid = X_train.pop('id')
    X_train = X_train.drop(columns=['class'])

    # Inner cross-validation for hyperparameter tuning, grouped by id as well.
    group_kfold_val = GroupKFold(n_splits=5)
    metrics = {}
    for subtrain_index, valid_index in group_kfold_val.split(X_train, y_train, X_train_pid):
        X_subtrain, X_valid = X_train.iloc[subtrain_index], X_train.iloc[valid_index]
        y_subtrain, y_valid = y_train.iloc[subtrain_index], y_train.iloc[valid_index]

        # Fit PCA (and the scaler) on the sub-training part only, then apply to the validation part.
        pca = PCA(n_components=pca_n_components, random_state=seed)
        X_subtrain = pca.fit_transform(X_subtrain)
        X_valid = pca.transform(X_valid)

        if standardize:
            scaler = StandardScaler()
            X_subtrain = scaler.fit_transform(X_subtrain)
            X_valid = scaler.transform(X_valid)

        for param in tqdm(params):
            # One list per metric, filled with one value per validation fold.
            fold_scores = metrics.setdefault(
                str(param), {'acc': [], 'auc': [], 'sensitivity': [], 'specificity': []}
            )
            acc, auc, specificity, sensitivity, _ = train_valid(param, X_subtrain, X_valid, y_subtrain, y_valid)
            fold_scores['auc'].append(auc)
            fold_scores['acc'].append(acc)
            fold_scores['sensitivity'].append(sensitivity)
            fold_scores['specificity'].append(specificity)

    # Log validation metrics for all combinations of params.
    log_val_metrics(params, metrics, test_n_splits, pca_n_components, standardize=standardize)

    # Select the combination with the highest mean validation AUC.
    # NOTE: the message below says "accuracy" although the criterion is AUC; the text is
    # left unchanged so new output stays comparable with previously recorded runs.
    index_param_max = np.array([np.array(metrics[str(param)]['auc']).mean() for param in params]).argmax()
    print('From all the combinations, the highest accuracy was achieved with', params[index_param_max])

    # Refit the preprocessing on the whole training fold and evaluate on the test fold.
    # random_state is set here too (it was only set in the inner loop) so that the
    # test-fold projection is reproducible if PCA uses a randomized solver.
    pca = PCA(n_components=pca_n_components, random_state=seed)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    acc, auc, specificity, sensitivity, predictions = train_valid(params[index_param_max], X_train, X_test, y_train, y_test)

    # Compute majority vote metrics.
    acc_mv, auc_mv, specificity_mv, sensitivity_mv = get_majority_vote(y_test, predictions)

    print('FOLD '+ str(fold_c) + ':  acc ' + str(acc) +  ', auc ' +  str(auc) +  ', specificity '+ str(specificity)
          + ', sensitivity ' + str(sensitivity))
    print('FOLD '+ str(fold_c) + ':  MV acc ' + str(acc_mv) +  ', MV auc ' +  str(auc_mv) +  ', MV specificity '+ str(specificity_mv)
          + ', MV sensitivity ' + str(sensitivity_mv))

    test_metrics[fold_c] = {'acc': acc, 'auc': auc, 'sensitivity': sensitivity, 'specificity': specificity, 'param': params[index_param_max]}
    test_metrics_mv[fold_c] = {'acc': acc_mv, 'auc': auc_mv, 'sensitivity': sensitivity_mv, 'specificity': specificity_mv, 'param': params[index_param_max]}

    fold_c += 1

log_test_metrics(test_metrics, test_metrics_mv, test_n_splits, 'Scattering features + PCA + SVM', None, seed, pca_n_components, standardize=standardize)


100%|██████████| 18/18 [00:00<00:00, 77.01it/s]
100%|██████████| 18/18 [00:00<00:00, 92.74it/s]
100%|██████████| 18/18 [00:00<00:00, 110.56it/s]
100%|██████████| 18/18 [00:00<00:00, 115.95it/s]
100%|██████████| 18/18 [00:00<00:00, 110.99it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 1:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 1:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 110.21it/s]
100%|██████████| 18/18 [00:00<00:00, 115.10it/s]
100%|██████████| 18/18 [00:00<00:00, 99.89it/s]
100%|██████████| 18/18 [00:00<00:00, 109.37it/s]
100%|██████████| 18/18 [00:00<00:00, 109.26it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 1000, 'balanced')
FOLD 2:  acc 0.86, auc 0.8833333333333333, specificity 0.7666666666666667, sensitivity 1.0
FOLD 2:  MV acc 0.8, MV auc 0.8333333333333334, MV specificity 0.6666666666666666, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 66.01it/s]
100%|██████████| 18/18 [00:00<00:00, 57.30it/s]
100%|██████████| 18/18 [00:00<00:00, 117.92it/s]
100%|██████████| 18/18 [00:00<00:00, 118.18it/s]
100%|██████████| 18/18 [00:00<00:00, 120.26it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('linear', 10, None)
FOLD 3:  acc 0.6, auc 0.5833333333333333, specificity 0.5, sensitivity 0.6666666666666666
FOLD 3:  MV acc 0.6, MV auc 0.5833333333333333, MV specificity 0.5, MV sensitivity 0.6666666666666666


100%|██████████| 18/18 [00:00<00:00, 107.87it/s]
100%|██████████| 18/18 [00:00<00:00, 114.28it/s]
100%|██████████| 18/18 [00:00<00:00, 42.58it/s]
100%|██████████| 18/18 [00:00<00:00, 119.94it/s]
100%|██████████| 18/18 [00:00<00:00, 128.73it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 4:  acc 0.8, auc 0.5, specificity 0.0, sensitivity 1.0
FOLD 4:  MV acc 0.8, MV auc 0.5, MV specificity 0.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 104.29it/s]
100%|██████████| 18/18 [00:00<00:00, 118.00it/s]
100%|██████████| 18/18 [00:00<00:00, 124.36it/s]
100%|██████████| 18/18 [00:00<00:00, 122.74it/s]
100%|██████████| 18/18 [00:00<00:00, 95.07it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 5:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 5:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 104.55it/s]
100%|██████████| 18/18 [00:00<00:00, 62.38it/s]
100%|██████████| 18/18 [00:00<00:00, 134.98it/s]
100%|██████████| 18/18 [00:00<00:00, 126.35it/s]
100%|██████████| 18/18 [00:00<00:00, 116.32it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 6:  acc 0.8, auc 0.875, specificity 1.0, sensitivity 0.75
FOLD 6:  MV acc 0.8, MV auc 0.875, MV specificity 1.0, MV sensitivity 0.75


100%|██████████| 18/18 [00:00<00:00, 43.13it/s]
100%|██████████| 18/18 [00:00<00:00, 111.78it/s]
100%|██████████| 18/18 [00:00<00:00, 119.66it/s]
100%|██████████| 18/18 [00:00<00:00, 139.19it/s]
100%|██████████| 18/18 [00:00<00:00, 57.86it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 7:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 7:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 115.54it/s]
100%|██████████| 18/18 [00:00<00:00, 106.06it/s]
100%|██████████| 18/18 [00:00<00:00, 119.06it/s]
100%|██████████| 18/18 [00:00<00:00, 124.24it/s]
100%|██████████| 18/18 [00:00<00:00, 113.11it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 8:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 8:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 126.83it/s]
100%|██████████| 18/18 [00:00<00:00, 111.23it/s]
100%|██████████| 18/18 [00:00<00:00, 40.66it/s]
100%|██████████| 18/18 [00:00<00:00, 132.19it/s]
100%|██████████| 18/18 [00:00<00:00, 114.36it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 9:  acc 0.6, auc 0.75, specificity 1.0, sensitivity 0.5
FOLD 9:  MV acc 0.6, MV auc 0.75, MV specificity 1.0, MV sensitivity 0.5


100%|██████████| 18/18 [00:00<00:00, 120.65it/s]
100%|██████████| 18/18 [00:00<00:00, 60.05it/s]
100%|██████████| 18/18 [00:00<00:00, 149.11it/s]
100%|██████████| 18/18 [00:00<00:00, 149.41it/s]
100%|██████████| 18/18 [00:00<00:00, 135.56it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 10, 'balanced')
FOLD 10:  acc 0.4, auc 0.25, specificity 0.0, sensitivity 0.5
FOLD 10:  MV acc 0.4, MV auc 0.25, MV specificity 0.0, MV sensitivity 0.5


100%|██████████| 18/18 [00:00<00:00, 118.05it/s]
100%|██████████| 18/18 [00:00<00:00, 109.86it/s]
100%|██████████| 18/18 [00:00<00:00, 107.40it/s]
100%|██████████| 18/18 [00:00<00:00, 43.97it/s]
100%|██████████| 18/18 [00:00<00:00, 117.02it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 11:  acc 0.8, auc 0.75, specificity 1.0, sensitivity 0.5
FOLD 11:  MV acc 0.8, MV auc 0.75, MV specificity 1.0, MV sensitivity 0.5
0.8054545454545454 0.7810606060606061
0.8 0.7765151515151516
Experiment done


In [7]:
# Nested, group-aware cross-validation: scattering features -> PCA -> (optional) standardization -> SVM.
df = df_scattering
pca_n_components = 8
standardize = True
test_metrics = {}
# majority vote results
test_metrics_mv = {}
# Outer test split ("55 and 11" in the original note -- presumably 55 patients over 11 folds; TODO confirm)
test_n_splits = 11
group_kfold_test = GroupKFold(n_splits=test_n_splits)
seed = 11
df_pid = df['id']
df_y = df['class']
fold_c = 1

for train_index, test_index in group_kfold_test.split(df, df_y, df_pid):
    # Shuffle the training rows in place; re-seeding makes the permutation repeatable.
    random.seed(seed)
    random.shuffle(train_index)
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Strip the non-feature columns; the training ids are kept as groups for the inner split.
    X_test = X_test.drop(columns=['id', 'class'])
    X_train_pid = X_train.pop('id')
    X_train = X_train.drop(columns=['class'])

    # Inner cross-validation for hyperparameter tuning, grouped by id as well.
    group_kfold_val = GroupKFold(n_splits=5)
    metrics = {}
    for subtrain_index, valid_index in group_kfold_val.split(X_train, y_train, X_train_pid):
        X_subtrain, X_valid = X_train.iloc[subtrain_index], X_train.iloc[valid_index]
        y_subtrain, y_valid = y_train.iloc[subtrain_index], y_train.iloc[valid_index]

        # Fit PCA (and the scaler) on the sub-training part only, then apply to the validation part.
        pca = PCA(n_components=pca_n_components, random_state=seed)
        X_subtrain = pca.fit_transform(X_subtrain)
        X_valid = pca.transform(X_valid)

        if standardize:
            scaler = StandardScaler()
            X_subtrain = scaler.fit_transform(X_subtrain)
            X_valid = scaler.transform(X_valid)

        for param in tqdm(params):
            # One list per metric, filled with one value per validation fold.
            fold_scores = metrics.setdefault(
                str(param), {'acc': [], 'auc': [], 'sensitivity': [], 'specificity': []}
            )
            acc, auc, specificity, sensitivity, _ = train_valid(param, X_subtrain, X_valid, y_subtrain, y_valid)
            fold_scores['auc'].append(auc)
            fold_scores['acc'].append(acc)
            fold_scores['sensitivity'].append(sensitivity)
            fold_scores['specificity'].append(specificity)

    # Log validation metrics for all combinations of params.
    log_val_metrics(params, metrics, test_n_splits, pca_n_components, standardize=standardize)

    # Select the combination with the highest mean validation AUC.
    # NOTE: the message below says "accuracy" although the criterion is AUC; the text is
    # left unchanged so new output stays comparable with previously recorded runs.
    index_param_max = np.array([np.array(metrics[str(param)]['auc']).mean() for param in params]).argmax()
    print('From all the combinations, the highest accuracy was achieved with', params[index_param_max])

    # Refit the preprocessing on the whole training fold and evaluate on the test fold.
    # random_state is set here too (it was only set in the inner loop) so that the
    # test-fold projection is reproducible if PCA uses a randomized solver.
    pca = PCA(n_components=pca_n_components, random_state=seed)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    acc, auc, specificity, sensitivity, predictions = train_valid(params[index_param_max], X_train, X_test, y_train, y_test)

    # Compute majority vote metrics.
    acc_mv, auc_mv, specificity_mv, sensitivity_mv = get_majority_vote(y_test, predictions)

    print('FOLD '+ str(fold_c) + ':  acc ' + str(acc) +  ', auc ' +  str(auc) +  ', specificity '+ str(specificity)
          + ', sensitivity ' + str(sensitivity))
    print('FOLD '+ str(fold_c) + ':  MV acc ' + str(acc_mv) +  ', MV auc ' +  str(auc_mv) +  ', MV specificity '+ str(specificity_mv)
          + ', MV sensitivity ' + str(sensitivity_mv))

    test_metrics[fold_c] = {'acc': acc, 'auc': auc, 'sensitivity': sensitivity, 'specificity': specificity, 'param': params[index_param_max]}
    test_metrics_mv[fold_c] = {'acc': acc_mv, 'auc': auc_mv, 'sensitivity': sensitivity_mv, 'specificity': specificity_mv, 'param': params[index_param_max]}

    fold_c += 1

log_test_metrics(test_metrics, test_metrics_mv, test_n_splits, 'Scattering features + PCA + SVM', None, seed, pca_n_components, standardize=standardize)


  and should_run_async(code)
100%|██████████| 18/18 [00:00<00:00, 71.42it/s]
100%|██████████| 18/18 [00:00<00:00, 70.46it/s]
100%|██████████| 18/18 [00:00<00:00, 40.81it/s]
100%|██████████| 18/18 [00:00<00:00, 74.71it/s]
100%|██████████| 18/18 [00:00<00:00, 76.17it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 1:  acc 0.8, auc 0.8333333333333333, specificity 1.0, sensitivity 0.6666666666666666
FOLD 1:  MV acc 0.8, MV auc 0.8333333333333333, MV specificity 1.0, MV sensitivity 0.6666666666666666


100%|██████████| 18/18 [00:00<00:00, 74.03it/s]
100%|██████████| 18/18 [00:00<00:00, 77.38it/s]
100%|██████████| 18/18 [00:00<00:00, 34.88it/s]
100%|██████████| 18/18 [00:00<00:00, 85.11it/s]
100%|██████████| 18/18 [00:00<00:00, 85.39it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 1000, 'balanced')
FOLD 2:  acc 0.84, auc 0.8666666666666667, specificity 0.7333333333333333, sensitivity 1.0
FOLD 2:  MV acc 0.8, MV auc 0.8333333333333334, MV specificity 0.6666666666666666, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 75.61it/s]
100%|██████████| 18/18 [00:00<00:00, 78.59it/s]
100%|██████████| 18/18 [00:00<00:00, 33.58it/s]
100%|██████████| 18/18 [00:00<00:00, 82.85it/s]
100%|██████████| 18/18 [00:00<00:00, 89.27it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 3:  acc 0.66, auc 0.7166666666666667, specificity 1.0, sensitivity 0.43333333333333335
FOLD 3:  MV acc 0.6, MV auc 0.6666666666666666, MV specificity 1.0, MV sensitivity 0.3333333333333333


100%|██████████| 18/18 [00:00<00:00, 76.37it/s]
100%|██████████| 18/18 [00:00<00:00, 63.95it/s]
100%|██████████| 18/18 [00:00<00:00, 76.01it/s]
100%|██████████| 18/18 [00:00<00:00, 76.96it/s]
100%|██████████| 18/18 [00:00<00:00, 78.24it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 4:  acc 0.8, auc 0.5, specificity 0.0, sensitivity 1.0
FOLD 4:  MV acc 0.8, MV auc 0.5, MV specificity 0.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 70.79it/s]
100%|██████████| 18/18 [00:00<00:00, 75.97it/s]
100%|██████████| 18/18 [00:00<00:00, 77.00it/s]
100%|██████████| 18/18 [00:00<00:00, 58.21it/s]
100%|██████████| 18/18 [00:00<00:00, 73.33it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 5:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 5:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 73.01it/s]
100%|██████████| 18/18 [00:00<00:00, 57.75it/s]
100%|██████████| 18/18 [00:00<00:00, 78.14it/s]
100%|██████████| 18/18 [00:00<00:00, 86.40it/s]
100%|██████████| 18/18 [00:00<00:00, 69.16it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 6:  acc 0.8, auc 0.875, specificity 1.0, sensitivity 0.75
FOLD 6:  MV acc 0.8, MV auc 0.875, MV specificity 1.0, MV sensitivity 0.75


100%|██████████| 18/18 [00:00<00:00, 69.40it/s]
100%|██████████| 18/18 [00:00<00:00, 72.40it/s]
100%|██████████| 18/18 [00:00<00:00, 80.94it/s]
100%|██████████| 18/18 [00:00<00:00, 37.23it/s]
100%|██████████| 18/18 [00:00<00:00, 69.36it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 7:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 7:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 71.39it/s]
100%|██████████| 18/18 [00:00<00:00, 33.67it/s]
100%|██████████| 18/18 [00:00<00:00, 81.09it/s]
100%|██████████| 18/18 [00:00<00:00, 90.26it/s]
100%|██████████| 18/18 [00:00<00:00, 70.51it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 8:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 8:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 41.79it/s]
100%|██████████| 18/18 [00:00<00:00, 76.45it/s]
100%|██████████| 18/18 [00:00<00:00, 81.03it/s]
100%|██████████| 18/18 [00:00<00:00, 81.74it/s]
100%|██████████| 18/18 [00:00<00:00, 76.39it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 9:  acc 0.6, auc 0.75, specificity 1.0, sensitivity 0.5
FOLD 9:  MV acc 0.6, MV auc 0.75, MV specificity 1.0, MV sensitivity 0.5


100%|██████████| 18/18 [00:00<00:00, 89.71it/s]
100%|██████████| 18/18 [00:00<00:00, 90.20it/s]
100%|██████████| 18/18 [00:00<00:00, 96.61it/s]
100%|██████████| 18/18 [00:00<00:00, 98.03it/s]
100%|██████████| 18/18 [00:00<00:00, 88.85it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 10:  acc 0.4, auc 0.25, specificity 0.0, sensitivity 0.5
FOLD 10:  MV acc 0.4, MV auc 0.25, MV specificity 0.0, MV sensitivity 0.5


100%|██████████| 18/18 [00:00<00:00, 44.82it/s]
100%|██████████| 18/18 [00:00<00:00, 75.64it/s]
100%|██████████| 18/18 [00:00<00:00, 76.92it/s]
100%|██████████| 18/18 [00:00<00:00, 76.29it/s]
100%|██████████| 18/18 [00:00<00:00, 45.65it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 10, 'balanced')
FOLD 11:  acc 0.8, auc 0.75, specificity 1.0, sensitivity 0.5
FOLD 11:  MV acc 0.8, MV auc 0.75, MV specificity 1.0, MV sensitivity 0.5
0.790909090909091 0.7765151515151516
0.7818181818181817 0.7689393939393938
Experiment done


In [None]:
# Nested, group-aware cross-validation: scattering features -> PCA -> (optional) standardization -> SVM.
df = df_scattering
pca_n_components = 7
standardize = True
test_metrics = {}
# majority vote results
test_metrics_mv = {}
# Outer test split ("55 and 11" in the original note -- presumably 55 patients over 11 folds; TODO confirm)
test_n_splits = 11
group_kfold_test = GroupKFold(n_splits=test_n_splits)
seed = 11
df_pid = df['id']
df_y = df['class']
fold_c = 1

for train_index, test_index in group_kfold_test.split(df, df_y, df_pid):
    # Shuffle the training rows in place; re-seeding makes the permutation repeatable.
    random.seed(seed)
    random.shuffle(train_index)
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Strip the non-feature columns; the training ids are kept as groups for the inner split.
    X_test = X_test.drop(columns=['id', 'class'])
    X_train_pid = X_train.pop('id')
    X_train = X_train.drop(columns=['class'])

    # Inner cross-validation for hyperparameter tuning, grouped by id as well.
    group_kfold_val = GroupKFold(n_splits=5)
    metrics = {}
    for subtrain_index, valid_index in group_kfold_val.split(X_train, y_train, X_train_pid):
        X_subtrain, X_valid = X_train.iloc[subtrain_index], X_train.iloc[valid_index]
        y_subtrain, y_valid = y_train.iloc[subtrain_index], y_train.iloc[valid_index]

        # Fit PCA (and the scaler) on the sub-training part only, then apply to the validation part.
        pca = PCA(n_components=pca_n_components, random_state=seed)
        X_subtrain = pca.fit_transform(X_subtrain)
        X_valid = pca.transform(X_valid)

        if standardize:
            scaler = StandardScaler()
            X_subtrain = scaler.fit_transform(X_subtrain)
            X_valid = scaler.transform(X_valid)

        for param in tqdm(params):
            # One list per metric, filled with one value per validation fold.
            fold_scores = metrics.setdefault(
                str(param), {'acc': [], 'auc': [], 'sensitivity': [], 'specificity': []}
            )
            acc, auc, specificity, sensitivity, _ = train_valid(param, X_subtrain, X_valid, y_subtrain, y_valid)
            fold_scores['auc'].append(auc)
            fold_scores['acc'].append(acc)
            fold_scores['sensitivity'].append(sensitivity)
            fold_scores['specificity'].append(specificity)

    # Log validation metrics for all combinations of params.
    log_val_metrics(params, metrics, test_n_splits, pca_n_components, standardize=standardize)

    # Select the combination with the highest mean validation AUC.
    # NOTE: the message below says "accuracy" although the criterion is AUC; the text is
    # left unchanged so new output stays comparable with previously recorded runs.
    index_param_max = np.array([np.array(metrics[str(param)]['auc']).mean() for param in params]).argmax()
    print('From all the combinations, the highest accuracy was achieved with', params[index_param_max])

    # Refit the preprocessing on the whole training fold and evaluate on the test fold.
    # random_state is set here too (it was only set in the inner loop) so that the
    # test-fold projection is reproducible if PCA uses a randomized solver.
    pca = PCA(n_components=pca_n_components, random_state=seed)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    acc, auc, specificity, sensitivity, predictions = train_valid(params[index_param_max], X_train, X_test, y_train, y_test)

    # Compute majority vote metrics.
    acc_mv, auc_mv, specificity_mv, sensitivity_mv = get_majority_vote(y_test, predictions)

    print('FOLD '+ str(fold_c) + ':  acc ' + str(acc) +  ', auc ' +  str(auc) +  ', specificity '+ str(specificity)
          + ', sensitivity ' + str(sensitivity))
    print('FOLD '+ str(fold_c) + ':  MV acc ' + str(acc_mv) +  ', MV auc ' +  str(auc_mv) +  ', MV specificity '+ str(specificity_mv)
          + ', MV sensitivity ' + str(sensitivity_mv))

    test_metrics[fold_c] = {'acc': acc, 'auc': auc, 'sensitivity': sensitivity, 'specificity': specificity, 'param': params[index_param_max]}
    test_metrics_mv[fold_c] = {'acc': acc_mv, 'auc': auc_mv, 'sensitivity': sensitivity_mv, 'specificity': specificity_mv, 'param': params[index_param_max]}

    fold_c += 1

log_test_metrics(test_metrics, test_metrics_mv, test_n_splits, 'Scattering features + PCA + SVM', None, seed, pca_n_components, standardize=standardize)


  and should_run_async(code)
100%|██████████| 18/18 [00:00<00:00, 118.33it/s]
100%|██████████| 18/18 [00:00<00:00, 102.50it/s]
100%|██████████| 18/18 [00:00<00:00, 61.35it/s]
100%|██████████| 18/18 [00:00<00:00, 126.82it/s]
100%|██████████| 18/18 [00:00<00:00, 123.63it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 1:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 1:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 108.43it/s]
100%|██████████| 18/18 [00:00<00:00, 55.14it/s]
100%|██████████| 18/18 [00:00<00:00, 106.13it/s]
100%|██████████| 18/18 [00:00<00:00, 109.66it/s]
100%|██████████| 18/18 [00:00<00:00, 118.65it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 1000, 'balanced')
FOLD 2:  acc 0.86, auc 0.8833333333333333, specificity 0.7666666666666667, sensitivity 1.0
FOLD 2:  MV acc 0.8, MV auc 0.8333333333333334, MV specificity 0.6666666666666666, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 61.83it/s]
100%|██████████| 18/18 [00:00<00:00, 128.73it/s]
100%|██████████| 18/18 [00:00<00:00, 105.72it/s]
100%|██████████| 18/18 [00:00<00:00, 118.72it/s]
100%|██████████| 18/18 [00:00<00:00, 51.30it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 3:  acc 0.66, auc 0.7166666666666667, specificity 1.0, sensitivity 0.43333333333333335
FOLD 3:  MV acc 0.6, MV auc 0.6666666666666666, MV specificity 1.0, MV sensitivity 0.3333333333333333


100%|██████████| 18/18 [00:00<00:00, 111.73it/s]
100%|██████████| 18/18 [00:00<00:00, 122.62it/s]
100%|██████████| 18/18 [00:00<00:00, 42.93it/s]
100%|██████████| 18/18 [00:00<00:00, 120.88it/s]
100%|██████████| 18/18 [00:00<00:00, 133.51it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 4:  acc 0.8, auc 0.5, specificity 0.0, sensitivity 1.0
FOLD 4:  MV acc 0.8, MV auc 0.5, MV specificity 0.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 119.71it/s]
100%|██████████| 18/18 [00:00<00:00, 124.31it/s]
100%|██████████| 18/18 [00:00<00:00, 140.23it/s]
100%|██████████| 18/18 [00:00<00:00, 146.40it/s]
100%|██████████| 18/18 [00:00<00:00, 126.79it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 5:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 5:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 106.46it/s]
100%|██████████| 18/18 [00:00<00:00, 112.08it/s]
100%|██████████| 18/18 [00:00<00:00, 116.43it/s]
100%|██████████| 18/18 [00:00<00:00, 133.87it/s]
100%|██████████| 18/18 [00:00<00:00, 120.46it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 6:  acc 0.8, auc 0.875, specificity 1.0, sensitivity 0.75
FOLD 6:  MV acc 0.8, MV auc 0.875, MV specificity 1.0, MV sensitivity 0.75


100%|██████████| 18/18 [00:00<00:00, 124.31it/s]
100%|██████████| 18/18 [00:00<00:00, 118.15it/s]
100%|██████████| 18/18 [00:00<00:00, 124.29it/s]
100%|██████████| 18/18 [00:00<00:00, 124.18it/s]
100%|██████████| 18/18 [00:00<00:00, 124.08it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 1, 'balanced')
FOLD 7:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 7:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 108.29it/s]
100%|██████████| 18/18 [00:00<00:00, 101.26it/s]
100%|██████████| 18/18 [00:00<00:00, 150.27it/s]
100%|██████████| 18/18 [00:00<00:00, 159.21it/s]
100%|██████████| 18/18 [00:00<00:00, 121.84it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.001, 10, 'balanced')
FOLD 8:  acc 1.0, auc 1.0, specificity 1.0, sensitivity 1.0
FOLD 8:  MV acc 1.0, MV auc 1.0, MV specificity 1.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 128.00it/s]
100%|██████████| 18/18 [00:00<00:00, 130.50it/s]
100%|██████████| 18/18 [00:00<00:00, 121.47it/s]
100%|██████████| 18/18 [00:00<00:00, 165.10it/s]
100%|██████████| 18/18 [00:00<00:00, 133.99it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 1000, None)
FOLD 9:  acc 0.8, auc 0.5, specificity 0.0, sensitivity 1.0
FOLD 9:  MV acc 0.8, MV auc 0.5, MV specificity 0.0, MV sensitivity 1.0


100%|██████████| 18/18 [00:00<00:00, 143.35it/s]
100%|██████████| 18/18 [00:00<00:00, 136.12it/s]
100%|██████████| 18/18 [00:00<00:00, 148.91it/s]
100%|██████████| 18/18 [00:00<00:00, 176.58it/s]
100%|██████████| 18/18 [00:00<00:00, 161.15it/s]


Done logging validation params in MLFlow
From all the combinations, the highest accuracy was achieved with ('kernel', 0.0001, 10, 'balanced')
