In [39]:
import sys
sys.path.insert(0, '../')

import os
import pandas as pd 
import numpy as np

from sklearn.preprocessing import StandardScaler
from src.permutation_test import pair_permutation_test

from glob import glob
from pickle import load, dump
from tqdm import tqdm

from src.config import models_features_per


In [27]:
def normalize_create_training_data(train, test, blank_norm=False, remove_outlier=None):

    # Remove outlier only from the training dataset
    if remove_outlier=='all':
        train = train[train['file'].apply(lambda x: False if (x.split('/')[-1].replace('.txt', '') in ouliners_to_remove) else True)]
        test  = test[test['file'].apply(lambda x: False if (x.split('/')[-1].replace('.txt', '') in ouliners_to_remove) else True)]

    elif remove_outlier=='train_only':
        train = train[train['file'].apply(lambda x: False if (x.split('/')[-1].replace('.txt', '') in ouliners_to_remove) else True)]
        
    train = train.reset_index(drop=True)
    test  = test.reset_index(drop=True)
    
    X_train = train.drop(columns=['file']).copy()
    X_test  = test.drop(columns=['file']).copy()

    columns       = X_train.columns
    
    y_train = train['file'].apply(lambda x: int(x.split('_')[-2].replace('cbz','')))
    y_test  = test['file'].apply(lambda x: int(x.split('_')[-2].replace('cbz','')))

    assert (X_train.index.values == y_train.index.values).all()

    scaler  = StandardScaler()

    if blank_norm: scaler.fit(X_train[y_train==0])
    else: scaler.fit(X_train)

    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=columns)
    X_test  = pd.DataFrame(scaler.transform(X_test),  columns=columns)

    X_train.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \
                        'dS_dV_area':'univariate, area(dS/dV)', 'dS_dV_max_peak':'univariate, max(dS/dV)', 'dS_dV_min_peak':'univariate, min(dS/dV)',\
                    'dS_dV_peak_diff':'univariate, max(dS/dV) - min(dS/dV)', \
                    'peak V':'univariate, V_max(S)', 'dS_dV_max_V':'univariate, V_max(dS/dV)', 'dS_dV_min_V':'univariate, V_min(dS/dV)',\
        }, inplace = True)

    X_test.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \
                        'dS_dV_area':'univariate, area(dS/dV)', 'dS_dV_max_peak':'univariate, max(dS/dV)', 'dS_dV_min_peak':'univariate, min(dS/dV)',\
                    'dS_dV_peak_diff':'univariate, max(dS/dV) - min(dS/dV)', \
                    'peak V':'univariate, V_max(S)', 'dS_dV_max_V':'univariate, V_max(dS/dV)', 'dS_dV_min_V':'univariate, V_min(dS/dV)',\
        }, inplace = True)

    return (X_train, X_test, y_train, y_test), scaler

def load_dataset_train_test_splitted(filename):
    ML1 = pd.read_excel(f'/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/{filename}.xlsx')
    ML2 = pd.read_excel(f'/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_22_ML2/{filename}.xlsx')
    ML4 = pd.read_excel(f'/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML4/{filename}.xlsx')

    return ML1, ML2, ML4

In [28]:
ML1_noisy_train, ML2_noisy_train, ML4_noisy_train = load_dataset_train_test_splitted('extracted_features_noisy_training_dataset')
ML1_train, ML2_train, ML4_train = load_dataset_train_test_splitted('extracted_features_training_dataset')
ML1_test, ML2_test, ML4_test = load_dataset_train_test_splitted('extracted_features_testing_dataset')

# Test if training and testing has common dataset
assert len(set(ML1_train['file'].values.tolist()) & set(ML1_test['file'].values.tolist()))==0

In [29]:
# Dataset Preparation

data_propery   = 'noiseless' # noisy, both, and noiseless
blank_norm     = False
remove_outlier = ''

if data_propery=='noisy':
    (ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), ML1_scalar = normalize_create_training_data(ML1_noisy_train, ML1_test, blank_norm, remove_outlier)
    (ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), ML2_scalar = normalize_create_training_data(ML2_noisy_train, ML2_test, blank_norm, remove_outlier)
    (ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), ML4_scalar = normalize_create_training_data(ML4_noisy_train, ML4_test, blank_norm, remove_outlier)
    
elif data_propery=='both':
    ML1_train_combined = pd.concat([ML1_noisy_train, ML1_train])
    ML2_train_combined = pd.concat([ML2_noisy_train, ML2_train])
    ML4_train_combined = pd.concat([ML4_noisy_train, ML4_train])
    
    (ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), ML1_scalar = normalize_create_training_data(ML1_train_combined, ML1_test, blank_norm, remove_outlier)
    (ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), ML2_scalar = normalize_create_training_data(ML2_train_combined, ML2_test, blank_norm, remove_outlier)
    (ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), ML4_scalar = normalize_create_training_data(ML4_train_combined, ML4_test, blank_norm, remove_outlier)
    
else:
    (ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), ML1_scalar = normalize_create_training_data(ML1_train, ML1_test, blank_norm, remove_outlier)
    (ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), ML2_scalar = normalize_create_training_data(ML2_train, ML2_test, blank_norm, remove_outlier)
    (ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), ML4_scalar = normalize_create_training_data(ML4_train, ML4_test, blank_norm, remove_outlier)


X_train = pd.concat([ML1_X_train, ML2_X_train, ML4_X_train], axis=0)
y_train = pd.concat([ML1_y_train, ML2_y_train, ML4_y_train], axis=0)

indx_shuffle = np.random.permutation(range(len(X_train)))
X_train      = X_train.iloc[indx_shuffle]
y_train      = y_train.iloc[indx_shuffle]

X_test  = pd.concat([ML1_X_test,  ML2_X_test,  ML4_X_test], axis=0)
y_test  = pd.concat([ML1_y_test,  ML2_y_test,  ML4_y_test], axis=0)
    

In [30]:
model_root_path = os.path.realpath('../models')

model_1_path  = os.path.join(model_root_path, 'Data_noiseless_outlier_remove_None')
model_2_path  = os.path.join(model_root_path, 'Data_augmentation_outlier_remove_None')

models_1      =  glob(f'{model_1_path}/*.pickle')
models_2      =  glob(f'{model_2_path}/*.pickle')

common_models = set([os.path.basename(i) for i in models_1]) & set([os.path.basename(i) for i in models_2])

In [41]:
permutation_test_results = pd.DataFrame(columns=['Comp Models', 'R2 Diff', '% error Diff', 'p value' ])
y_LOD = 0.9117010154341669

for model_file_name in tqdm(common_models):
    model_name = model_file_name.replace('.pickle', '')
    with open(os.path.join(model_1_path, model_file_name), 'rb') as f:
        model_1 = load(f)

    with open(os.path.join(model_2_path, model_file_name), 'rb') as f:
        model_2 = load(f)

    model1_pred = model_1.predict(X_test[models_features_per[model_name]])
    model2_pred = model_2.predict(X_test[models_features_per[model_name]])

    observed_r2_score, observed_statistic, p_value, _, _ = pair_permutation_test(model1_pred, model2_pred, y_test, y_LOD)
    temp = pd.DataFrame({'Comp Models':['() Without Aug | With Aug'], 'R2 Diff':[observed_r2_score], '% error Diff':[observed_statistic], 'p value':[p_value]})

    permutation_test_results = pd.concat([permutation_test_results, temp], axis=0)
    

  permutation_test_results = pd.concat([permutation_test_results, temp], axis=0)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.94s/it]


In [42]:
permutation_test_results

Unnamed: 0,Comp Models,R2 Diff,% error Diff,p value
0,Without Aug | With Aug,0.002564,1.958394,0.3406
0,Without Aug | With Aug,0.008627,0.070601,0.9749
0,Without Aug | With Aug,0.000262,2.136466,0.0076
0,Without Aug | With Aug,0.01953,10.684698,0.0241
0,Without Aug | With Aug,0.026555,13.347617,0.001


In [24]:
pair_permutation_test?

[0;31mSignature:[0m
[0mpair_permutation_test[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel1_pred[0m[0;34m:[0m [0;34m<[0m[0mbuilt[0m[0;34m-[0m[0;32min[0m [0mfunction[0m [0marray[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel2_pred[0m[0;34m:[0m [0;34m<[0m[0mbuilt[0m[0;34m-[0m[0;32min[0m [0mfunction[0m [0marray[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mground_truth[0m[0;34m:[0m [0;34m<[0m[0mbuilt[0m[0;34m-[0m[0;32min[0m [0mfunction[0m [0marray[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_LOD[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Calculates Statistical Significance level p-value for the given pair
[0;31mFile:[0m      ~/Desktop/Epilepsey/Co