In this experiment, I train the models with noisy data and test the performance on test data.<br>
- If the performance on the test dataset is comparable to that of models trained on the noise-free dataset, then the data augmentation is helpful.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
import os
import pandas as pd
import numpy as np

sys.path.insert(0, '../')

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.base import clone
from sklearn.metrics import r2_score

from pickle import load, dump

from src.load_dataset import load_dataset
from src.utils import tsen_pca_viz, verify_batch_label_dist, calculate_r2_score, calculate_per_diff, per_error, find_adj_score, combine_all_batches, split_batches_back, perform_combat_normalization
from src.load_models import select_model
from src.graph_visualization import visualization_testing_dataset
from src.config import *

from pyod.models.iforest import IForest
from pyod.models.lof import LOF

In [2]:
def select_outlier_removal(o_removal='iforest'):
    """Build a new, unfitted outlier detector selected by name.

    Parameters
    ----------
    o_removal : str
        ``'iforest'`` selects ``IForest(contamination=0.03, n_estimators=100,
        random_state=0)``; ``'lof'`` selects ``LOF(contamination=0.03,
        n_neighbors=5)``.

    Returns
    -------
    The freshly constructed detector instance.

    Raises
    ------
    ValueError
        If ``o_removal`` is not one of the supported names.
    """
    if o_removal == 'iforest':
        return IForest(contamination=0.03, n_estimators=100, random_state=0)
    if o_removal == 'lof':
        return LOF(contamination=0.03, n_neighbors=5)

    # Fail here with a clear message instead of implicitly returning None,
    # which would only surface later as an AttributeError on `.fit`.
    raise ValueError(
        f"Unknown outlier removal algorithm {o_removal!r}; expected 'iforest' or 'lof'"
    )

def remove_outlier_box_plot():
    """Placeholder for box-plot based outlier removal; not implemented yet."""
    return None
    
    
def find_outliers_in_data(all_combined, labels, outlier_removal_algo, outlier_threshold=0.7):
    """Collect the names of files flagged as outliers, one label group at a time.

    Parameters
    ----------
    all_combined : pandas.DataFrame
        Must contain a ``'file'`` column, a ``'y'`` column and the feature
        columns; every column other than ``'file'`` and ``'y'`` is fed to
        the detector.
    labels : iterable
        Values of ``'y'``; a separate detector is fitted for each one.
    outlier_removal_algo : str
        Name passed to ``select_outlier_removal``. The special value
        ``'IQR'`` is accepted but not implemented, so nothing is flagged.
    outlier_threshold : float
        Rows whose ``predict`` output is strictly greater than this value
        are treated as outliers.
        NOTE(review): pyod's ``predict`` presumably returns 0/1 labels, in
        which case any threshold in [0, 1) behaves the same - confirm.

    Returns
    -------
    list of str
        File names with the directory part and the ``'.txt'`` suffix removed.
    """
    flagged_names = []

    for label in labels:
        group = all_combined[all_combined['y'] == label].copy()
        features = group.drop(columns=['file', 'y'])

        if outlier_removal_algo == 'IQR':
            # IQR-based detection is not implemented; nothing is flagged.
            continue

        # A fresh detector per label, fitted and scored on the same rows.
        detector = select_outlier_removal(outlier_removal_algo)
        detector.fit(features)
        flags = pd.Series(detector.predict(features), index=features.index)

        flagged_rows = group.loc[flags[flags > outlier_threshold].index]
        flagged_names.extend(
            path.split('/')[-1].replace('.txt', '') for path in flagged_rows['file']
        )

    return flagged_names
    
def normalize_create_training_data(train, test, blank_norm=False, remove_outlier=None):
    """Standardise one batch's features and derive its labels from the file names.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature tables with a ``'file'`` column; all other columns are
        treated as features.
    blank_norm : bool
        When true the scaler is fitted only on training rows whose label is
        0; otherwise it is fitted on all training rows.
    remove_outlier : str or None
        ``'all'`` drops the files listed in the module-level
        ``ouliners_to_remove`` from both ``train`` and ``test``;
        ``'train_only'`` drops them from ``train`` only; any other value
        leaves both tables untouched.

    Returns
    -------
    ((X_train, X_test, y_train, y_test), scaler)
        Scaled feature frames (columns renamed to their display names), the
        integer labels, and the fitted ``StandardScaler``.
    """
    def _is_kept(path):
        # Outlier names are stored without directory and without '.txt'.
        return path.split('/')[-1].replace('.txt', '') not in ouliners_to_remove

    def _label_from_path(path):
        # The second-to-last '_' separated field, with 'cbz' stripped, is the label.
        return int(path.split('_')[-2].replace('cbz', ''))

    display_names = {
        "PH": 'univariate, max(S)',
        'signal_std': 'univariate, std(S)',
        'signal_mean': 'univariate, mean(S)',
        'peak area': 'univariate, area(S)',
        'dS_dV_area': 'univariate, area(dS/dV)',
        'dS_dV_max_peak': 'univariate, max(dS/dV)',
        'dS_dV_min_peak': 'univariate, min(dS/dV)',
        'dS_dV_peak_diff': 'univariate, max(dS/dV) - min(dS/dV)',
        'peak V': 'univariate, V_max(S)',
        'dS_dV_max_V': 'univariate, V_max(dS/dV)',
        'dS_dV_min_V': 'univariate, V_min(dS/dV)',
    }

    # Optional outlier filtering: both splits for 'all', training split only
    # for 'train_only'.
    if remove_outlier == 'all':
        train = train[train['file'].apply(_is_kept)]
        test = test[test['file'].apply(_is_kept)]
    elif remove_outlier == 'train_only':
        train = train[train['file'].apply(_is_kept)]

    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    y_train = train['file'].apply(_label_from_path)
    y_test = test['file'].apply(_label_from_path)

    X_train = train.drop(columns=['file']).copy()
    X_test = test.drop(columns=['file']).copy()
    columns = X_train.columns

    # Features and labels must stay row-aligned for the boolean mask below.
    assert (X_train.index.values == y_train.index.values).all()

    scaler = StandardScaler()
    fit_rows = X_train[y_train == 0] if blank_norm else X_train
    scaler.fit(fit_rows)

    # The scaler is fitted on training rows only and applied to both splits.
    X_train = pd.DataFrame(scaler.transform(X_train), columns=columns).rename(columns=display_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=columns).rename(columns=display_names)

    return (X_train, X_test, y_train, y_test), scaler

def load_dataset_train_test_splitted(filename, dataset_root='/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset'):
    """Read the same-named feature workbook for the ML1, ML2 and ML4 batches.

    Parameters
    ----------
    filename : str
        Workbook name without the ``.xlsx`` extension.
    dataset_root : str
        Directory that contains the ``ML1_ML2`` and ``ML4`` folders. The
        default is the original author's local path, so existing calls keep
        working; pass another root to run the notebook on a different machine.

    Returns
    -------
    (ML1, ML2, ML4) : tuple of pandas.DataFrame
        One table per batch, in that order.
    """
    ML1 = pd.read_excel(f'{dataset_root}/ML1_ML2/2024_02_19_ML1/{filename}.xlsx')
    ML2 = pd.read_excel(f'{dataset_root}/ML1_ML2/2024_02_22_ML2/{filename}.xlsx')
    ML4 = pd.read_excel(f'{dataset_root}/ML4/{filename}.xlsx')

    return ML1, ML2, ML4



In [3]:
# ouliners_to_remove = ['2024_03_08_cbz16_36', '2024_03_08_cbz16_15']

# ouliners_to_remove = ['2024_02_19_cbz08_43',
#                       '2024_02_19_cbz08_37',
#                       '2024_02_22_cbz08_10']

In [4]:
# Load the noisy-training, clean-training and testing feature tables of every batch.
ML1_noisy_train, ML2_noisy_train, ML4_noisy_train = load_dataset_train_test_splitted('feature_extraction_vwidth_0.15_training_noisy')
ML1_train, ML2_train, ML4_train = load_dataset_train_test_splitted('feature_extraction_vwidth_0.15_training')
ML1_test, ML2_test, ML4_test = load_dataset_train_test_splitted('feature_extraction_vwidth_0.15_testing')

# Guard against train/test leakage: within each batch, no file may appear in
# both a training table (clean or noisy) and the testing table. All three
# batches are checked, not only ML1.
for batch_name, clean_train, noisy_train, batch_test in (
    ('ML1', ML1_train, ML1_noisy_train, ML1_test),
    ('ML2', ML2_train, ML2_noisy_train, ML2_test),
    ('ML4', ML4_train, ML4_noisy_train, ML4_test),
):
    test_files = set(batch_test['file'].values.tolist())
    assert len(set(clean_train['file'].values.tolist()) & test_files) == 0, \
        f'{batch_name}: clean training and testing files overlap'
    assert len(set(noisy_train['file'].values.tolist()) & test_files) == 0, \
        f'{batch_name}: noisy training and testing files overlap'


In [5]:
# Pool the clean training batches into one frame with a fresh 0..n-1 index.
all_combined_data = pd.concat([ML1_train, ML2_train, ML4_train], axis=0, ignore_index=True)

# Label parsed from the file name: second-to-last '_' field with 'cbz' stripped.
all_combined_data['y'] = all_combined_data['file'].map(
    lambda path: int(path.split('_')[-2].replace('cbz', ''))
)

# Run the isolation-forest detector separately on every label group.
label_values = all_combined_data['y'].unique()
ouliners_to_remove = find_outliers_in_data(all_combined_data, label_values, outlier_removal_algo='iforest')

ouliners_to_remove

['2024_02_22_cbz00_31',
 '2024_02_22_cbz00_18',
 '2024_03_08_cbz00_38',
 '2024_02_19_cbz08_37',
 '2024_02_22_cbz08_01',
 '2024_03_08_cbz08_04',
 '2024_02_19_cbz16_28',
 '2024_02_19_cbz16_01',
 '2024_03_08_cbz16_15']

In [14]:
# Experiment switches.
data_propery   = 'noiseless' # noisy, augmentation, and noiseless
blank_norm     = False
remove_outlier = ''   #None, train_only, all

# Choose the training table of each batch according to `data_propery`;
# any value other than 'noisy' or 'augmentation' selects the clean tables.
if data_propery == 'noisy':
    selected_train = (ML1_noisy_train, ML2_noisy_train, ML4_noisy_train)

elif data_propery == 'augmentation':
    # Augmentation = noisy rows stacked on top of the clean rows.
    ML1_train_combined = pd.concat([ML1_noisy_train, ML1_train])
    ML2_train_combined = pd.concat([ML2_noisy_train, ML2_train])
    ML4_train_combined = pd.concat([ML4_noisy_train, ML4_train])
    selected_train = (ML1_train_combined, ML2_train_combined, ML4_train_combined)

else:
    selected_train = (ML1_train, ML2_train, ML4_train)

ml1_source, ml2_source, ml4_source = selected_train

# Each batch is normalised independently and keeps its own fitted scaler.
(ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), ML1_scalar = normalize_create_training_data(ml1_source, ML1_test, blank_norm, remove_outlier)
(ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), ML2_scalar = normalize_create_training_data(ml2_source, ML2_test, blank_norm, remove_outlier)
(ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), ML4_scalar = normalize_create_training_data(ml4_source, ML4_test, blank_norm, remove_outlier)

In [15]:
# Pool the per-batch training features and labels.
X_train = pd.concat([ML1_X_train, ML2_X_train, ML4_X_train], axis=0)
y_train = pd.concat([ML1_y_train, ML2_y_train, ML4_y_train], axis=0)

# Shuffle the pooled rows so the unshuffled KFold used later does not split
# along batch boundaries. A seeded generator keeps the shuffle (and therefore
# the cross-validation folds) identical between runs.
shuffle_rng  = np.random.default_rng(0)
indx_shuffle = shuffle_rng.permutation(len(X_train))
X_train      = X_train.iloc[indx_shuffle]
y_train      = y_train.iloc[indx_shuffle]

# The pooled test set is left in batch order.
X_test  = pd.concat([ML1_X_test,  ML2_X_test,  ML4_X_test], axis=0)
y_test  = pd.concat([ML1_y_test,  ML2_y_test,  ML4_y_test], axis=0)

In [16]:
# Sanity check: ML2 training features and labels share the same row index.
ml2_feature_index = ML2_X_train.index.to_numpy()
ml2_label_index   = ML2_y_train.index.to_numpy()
(ml2_feature_index == ml2_label_index).all()

True

In [17]:
# Show (rows, columns) of the pooled training and testing feature tables.
X_train.shape, X_test.shape

((216, 13), (145, 13))

In [18]:
# Names of the models to evaluate, as understood by select_model
models = ['Linear', 'KNN', 'SVM', 'RF', 'GP']

# Fixed y_LOD passed to the percent-error helpers; the commented-out call
# suggests it was originally computed from data (presumably the limit of
# detection - TODO confirm).
y_LOD = 0.9117010154341669 #calculate_y_LOD(X_testing, y_testing)
kf = KFold(n_splits=5)

# Per-model cross-validation and held-out test scores, filled in the loop below.
r2_score_val,  per_diff_val  = {'Models':[], 'Scores':[]}, {'Models':[], 'Scores':[]}
r2_score_test, per_diff_test = {'Models':[], 'Scores':[]}, {'Models':[], 'Scores':[]}

# The output directory does not depend on the model, so create it once up front.
save_model_name = f'../models/Data_{data_propery}_outlier_remove_{remove_outlier}_vwidth_0.15'
os.makedirs(save_model_name, exist_ok=True)

for model_name in models:
    model      = select_model(model_name)

    # Each metric uses its own per-model feature subset.
    features_r2  = models_features_r2[model_name]
    features_per = models_features_per[model_name]

    # Cross-validated scores on the training data.
    val_r2     = calculate_r2_score(model, X_train[features_r2],  y_train, kf)
    val_per    = calculate_per_diff(model, X_train[features_per], y_train, kf, y_LOD)

    r2_score_val['Models'].append(model_name)
    r2_score_val['Scores'].append(val_r2)
    per_diff_val['Models'].append(model_name)
    per_diff_val['Scores'].append(val_per)

    # R2 on the test set from a fresh clone fitted on all training rows.
    model_r2  = clone(model)
    model_r2.fit(X_train[features_r2], y_train)
    y_pred_r2 = model_r2.predict(X_test[features_r2])

    r2_test_score = r2_score(y_test, y_pred_r2)
    adj_r2_test   = find_adj_score(len(y_pred_r2), len(features_r2), r2_test_score)

    r2_score_test['Models'].append(model_name)
    r2_score_test['Scores'].append((r2_test_score, adj_r2_test))

    # Percent error on the test set from a second clone using the per-error features.
    model_per_diff = clone(model)
    model_per_diff.fit(X_train[features_per], y_train)
    y_pred_per_diff = model_per_diff.predict(X_test[features_per])

    per_diff_test['Models'].append(model_name)
    per_diff_test['Scores'].append(per_error(y_test, y_pred_per_diff, y_LOD))

    # Only the percent-error model is persisted; the R2 model is not saved.
    with open(f'{save_model_name}/{model_name}.pickle', 'wb') as f:
        dump(model_per_diff, f)

In [19]:
print(save_model_name)

# Round-trip check: reload the pickled GP model written by the training loop
# and score it on the pooled test set.
model_name  = 'GP'
pickle_path = f'{save_model_name}/{model_name}.pickle'
with open(pickle_path, 'rb') as f:
    model = load(f)

gp_features = models_features_per[model_name]
y_pred = model.predict(X_test[gp_features])

per_error(y_test, y_pred, y_LOD)

../models/Data_noiseless_outlier_remove__vwidth_0.15


26.986712074857145

In [20]:
# Display the pooled test labels. The index restarts for every batch because
# the per-batch label series were concatenated without resetting the index.
y_test

0     16
1      0
2      8
3     16
4     16
      ..
41    16
42    16
43     8
44     0
45     8
Name: file, Length: 145, dtype: int64

In [21]:
# Output folder encodes the experiment switches used for this run.
savedir   = f'../results/Noisy_Training_Dataset/data_property_{data_propery}_blank_norm_{blank_norm}_outlier_remove_{remove_outlier}_vwidth_0.15'
adj_score = False

os.makedirs(savedir, exist_ok=True)

# (scores, output file name, plot-specific keyword arguments), in plotting order:
# validation R2, validation percent error, test R2, test percent error.
plot_specs = [
    (r2_score_val,  'r2_score_val.png',   {'adj_score': adj_score}),
    (per_diff_val,  'per_error_val.png',  {'r2_score': False, 'adj_score': False}),
    (r2_score_test, 'r2_score_test.png',  {'adj_score': adj_score}),
    (per_diff_test, 'per_error_test.png', {'r2_score': False, 'adj_score': False}),
]

for plot_scores, plot_file, plot_kwargs in plot_specs:
    visualization_testing_dataset(plot_scores, f'{savedir}/{plot_file}', model_name_conversion,
                                  only_one_multivariate=False, legends=True, **plot_kwargs)

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

## Outlier plot with box plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
feature = 'PH'

# For every label value, list the files whose `feature` value falls outside
# the 1.5*IQR fences computed within that label group.
for concentration in all_combined_data['y'].unique():
    in_group = all_combined_data['y'] == concentration

    Q1, Q3 = np.percentile(all_combined_data[in_group][feature], q=[25, 75])
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR

    feature_and_file = all_combined_data[[feature, 'file']]
    above_upper_bound = feature_and_file[(all_combined_data[feature] > upper_bound) & in_group]
    # Despite its name, this holds the rows *below* the lower fence.
    above_lower_bound = feature_and_file[(all_combined_data[feature] < lower_bound) & in_group]

    print(f"Con:{concentration}", "Above_upper_bound:", above_upper_bound['file'].values, " | ","Lower_bound",  above_lower_bound['file'].values)

# Box plot of the feature per label value.
plt.figure(figsize=(7,4))
sns.boxplot(all_combined_data, x='y', y=feature)
plt.show()


## Inference

In [None]:
# Pin the model name explicitly so the estimator and its feature subset always
# match, instead of relying on whatever `model_name` an earlier cell left behind.
model_name = 'GP'
model_r2   = select_model(model_name)
model_r2.fit(X_train[models_features_r2[model_name]], y_train)

In [None]:
filename  = '2024_02_22_cbz00_01'

# Keep the ML2 test rows whose path contains the recording name, without the path column.
matches_recording = ML2_test['file'].apply(lambda path: filename in path)
test_data = ML2_test[matches_recording].drop(columns='file')
columns   = test_data.columns

# Scale with the scaler fitted on the ML2 training batch.
test_data = pd.DataFrame(ML2_scalar.transform(test_data), columns=columns)

# Same raw-name -> display-name mapping that is applied to the training features.
test_data = test_data.rename(columns={
    "PH": 'univariate, max(S)',
    'signal_std': 'univariate, std(S)',
    'signal_mean': 'univariate, mean(S)',
    'peak area': 'univariate, area(S)',
    'dS_dV_area': 'univariate, area(dS/dV)',
    'dS_dV_max_peak': 'univariate, max(dS/dV)',
    'dS_dV_min_peak': 'univariate, min(dS/dV)',
    'dS_dV_peak_diff': 'univariate, max(dS/dV) - min(dS/dV)',
    'peak V': 'univariate, V_max(S)',
    'dS_dV_max_V': 'univariate, V_max(dS/dV)',
    'dS_dV_min_V': 'univariate, V_min(dS/dV)',
})

test_data.shape
model_r2.predict(test_data[models_features_r2[model_name]])