In [None]:
import sys
sys.path.insert(0, '../')

import pandas as pd
import numpy as np

from src.load_dataset import load_dataset
from src.load_models  import select_model
from src.config import *
from src.utils import per_error

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

In [None]:
# Raw feature column name -> display name; applied identically to the train and test frames.
_FEATURE_DISPLAY_NAMES = {
    "PH":              'univariate, max(S)',
    'signal_std':      'univariate, std(S)',
    'signal_mean':     'univariate, mean(S)',
    'peak area':       'univariate, area(S)',
    'dS_dV_area':      'univariate, area(dS/dV)',
    'dS_dV_max_peak':  'univariate, max(dS/dV)',
    'dS_dV_min_peak':  'univariate, min(dS/dV)',
    'dS_dV_peak_diff': 'univariate, max(dS/dV) - min(dS/dV)',
    'peak V':          'univariate, V_max(S)',
    'dS_dV_max_V':     'univariate, V_max(dS/dV)',
    'dS_dV_min_V':     'univariate, V_min(dS/dV)',
}


def _drop_outlier_files(frame):
    """Return the rows of ``frame`` whose file stem is not in the global ``ouliners_to_remove``."""
    # Stem = last path component with '.txt' removed, e.g. 'a/b/x_1.txt' -> 'x_1'.
    stems = frame['file'].map(lambda path: path.split('/')[-1].replace('.txt', ''))
    # isin() always yields a boolean mask, including for a frame with no rows.
    return frame[~stems.isin(ouliners_to_remove)]


def _label_from_file(path):
    """Parse the integer label from the second-to-last '_'-separated token, ignoring 'cbz'."""
    return int(path.split('_')[-2].replace('cbz', ''))


def normalize_create_training_data(train, test, blank_norm=False, remove_outlier=None):
    """Standardise the feature columns of a train/test pair and derive integer labels.

    Both frames must contain a ``'file'`` column; every other column is treated as a
    feature. Labels are parsed from the file name (see ``_label_from_file``).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature tables including the ``'file'`` column.
    blank_norm : bool, default False
        If True the scaler is fitted only on training rows whose label is 0;
        otherwise it is fitted on all training rows.
        NOTE(review): label 0 presumably denotes blank samples - confirm.
    remove_outlier : {None, 'all', 'train_only'}, default None
        ``'all'`` drops files listed in the global ``ouliners_to_remove`` from both
        frames, ``'train_only'`` drops them from ``train`` only. Any other value
        leaves both frames untouched. ``ouliners_to_remove`` must be defined before
        calling with one of the two removing options.

    Returns
    -------
    ((X_train, X_test, y_train, y_test), scaler)
        Scaled feature frames (columns renamed via ``_FEATURE_DISPLAY_NAMES``), the
        label Series, and the fitted ``StandardScaler``.
    """
    if remove_outlier == 'all':
        train = _drop_outlier_files(train)
        test  = _drop_outlier_files(test)
    elif remove_outlier == 'train_only':
        train = _drop_outlier_files(train)

    # Fresh 0..n-1 index so the label Series and the feature frame stay aligned.
    train = train.reset_index(drop=True)
    test  = test.reset_index(drop=True)

    X_train = train.drop(columns=['file'])
    X_test  = test.drop(columns=['file'])
    columns = X_train.columns

    y_train = train['file'].apply(_label_from_file)
    y_test  = test['file'].apply(_label_from_file)

    assert (X_train.index.values == y_train.index.values).all()

    scaler = StandardScaler()
    scaler.fit(X_train[y_train == 0] if blank_norm else X_train)

    # The test frame is scaled with the statistics learned from the training frame.
    X_train = pd.DataFrame(scaler.transform(X_train), columns=columns).rename(columns=_FEATURE_DISPLAY_NAMES)
    X_test  = pd.DataFrame(scaler.transform(X_test),  columns=columns).rename(columns=_FEATURE_DISPLAY_NAMES)

    return (X_train, X_test, y_train, y_test), scaler

def load_dataset_train_test_splitted(filename, dataset_dir='/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset'):
    """Read the same-named Excel workbook from the ML1, ML2 and ML4 dataset folders.

    Parameters
    ----------
    filename : str
        Workbook name without the ``.xlsx`` extension.
    dataset_dir : str, optional
        Root folder holding ``ML1_ML2/2024_02_19_ML1``, ``ML1_ML2/2024_02_22_ML2``
        and ``ML4``. Defaults to the original author's local checkout; pass your own
        path (e.g. a relative ``'../dataset'``) to run the notebook elsewhere.

    Returns
    -------
    (ML1, ML2, ML4) : tuple of pandas.DataFrame
    """
    ML1 = pd.read_excel(f'{dataset_dir}/ML1_ML2/2024_02_19_ML1/{filename}.xlsx')
    ML2 = pd.read_excel(f'{dataset_dir}/ML1_ML2/2024_02_22_ML2/{filename}.xlsx')
    ML4 = pd.read_excel(f'{dataset_dir}/ML4/{filename}.xlsx')

    return ML1, ML2, ML4

In [None]:
# File stems (no directory, no '.txt' suffix) that normalize_create_training_data
# filters out when called with remove_outlier='all' or 'train_only'.
ouliners_to_remove = [
    '2024_02_19_cbz00_15',
    '2024_02_19_cbz08_40',
    '2024_02_19_cbz08_37',
    '2024_02_22_cbz00_01',
    '2024_02_22_cbz16_21',
    '2024_02_22_cbz08_10',
    '2024_02_22_cbz08_01',
]

In [None]:
# --- Experiment configuration ---------------------------------------------------
# data_propery selects the training source:
#   'noisy'        -> noisy training features only
#   'augmentation' -> noisy + noiseless training features concatenated
#   anything else  -> noiseless training features only
data_propery   = 'augmentation'
blank_norm     = False   # True: fit each scaler only on training rows labelled 0
remove_outlier = None    # None, 'all' or 'train_only' (see normalize_create_training_data)
shuffle_seed   = 42      # fixes the training-row shuffle so reruns see the same row order

# --- Load the per-subset feature tables -------------------------------------------
ML1_noisy_train, ML2_noisy_train, ML4_noisy_train = load_dataset_train_test_splitted('extracted_features_noisy_training_dataset')
ML1_train, ML2_train, ML4_train = load_dataset_train_test_splitted('extracted_features_training_dataset')
ML1_test, ML2_test, ML4_test = load_dataset_train_test_splitted('extracted_features_testing_dataset')

# Guard against leakage: no file may appear in both the training and the testing
# table of the same subset (checked for ML1, ML2 and ML4).
for subset_name, train_part, test_part in (('ML1', ML1_train, ML1_test),
                                           ('ML2', ML2_train, ML2_test),
                                           ('ML4', ML4_train, ML4_test)):
    shared_files = set(train_part['file']) & set(test_part['file'])
    assert not shared_files, f"{subset_name}: {len(shared_files)} file(s) appear in both the training and testing sets"

# --- Pick the training source for each subset -------------------------------------
if data_propery == 'noisy':
    ML1_fit, ML2_fit, ML4_fit = ML1_noisy_train, ML2_noisy_train, ML4_noisy_train

elif data_propery == 'augmentation':
    ML1_train_combined = pd.concat([ML1_noisy_train, ML1_train])
    ML2_train_combined = pd.concat([ML2_noisy_train, ML2_train])
    ML4_train_combined = pd.concat([ML4_noisy_train, ML4_train])
    ML1_fit, ML2_fit, ML4_fit = ML1_train_combined, ML2_train_combined, ML4_train_combined

else:
    ML1_fit, ML2_fit, ML4_fit = ML1_train, ML2_train, ML4_train

# --- Normalise every subset with its own scaler -----------------------------------
(ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), ML1_scalar = normalize_create_training_data(ML1_fit, ML1_test, blank_norm, remove_outlier)
(ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), ML2_scalar = normalize_create_training_data(ML2_fit, ML2_test, blank_norm, remove_outlier)
(ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), ML4_scalar = normalize_create_training_data(ML4_fit, ML4_test, blank_norm, remove_outlier)

# --- Pool the subsets ---------------------------------------------------------------
X_train = pd.concat([ML1_X_train, ML2_X_train, ML4_X_train], axis=0)
y_train = pd.concat([ML1_y_train, ML2_y_train, ML4_y_train], axis=0)

# Seeded positional shuffle; the same permutation is applied to features and labels.
indx_shuffle = np.random.default_rng(shuffle_seed).permutation(len(X_train))
X_train      = X_train.iloc[indx_shuffle].reset_index(drop=True)
y_train      = y_train.iloc[indx_shuffle].reset_index(drop=True)

X_test  = pd.concat([ML1_X_test,  ML2_X_test,  ML4_X_test], axis=0).reset_index(drop=True)
y_test  = pd.concat([ML1_y_test,  ML2_y_test,  ML4_y_test], axis=0).reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) of the pooled training and testing feature frames.
X_train.shape, X_test.shape

In [None]:
# Model keys: each is passed to select_model and used to index models_features_per
models = ['Linear', 'KNN', 'SVM', 'RF', 'GP']

# y_LOD is hard-coded here and later passed to per_error.
# NOTE(review): presumably a cached result of calculate_y_LOD(X_testing, y_testing) - confirm.
y_LOD = 0.9117010154341669 #calculate_y_LOD(X_testing, y_testing)
# Seeded 5-fold splitter used to obtain out-of-fold predictions on X_train
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Knowledge distillation: for every model, out-of-fold teacher predictions ("soft"
# labels) are blended with the true ("hard") labels, a fresh student is fitted on
# the blend, and the student is scored on the held-out test set.
r2_score_val   = {'Models': [], 'Scores': []}
per_diff_val   = {'Models': [], 'Scores': []}
r2_score_test  = {'Models': [], 'Scores': []}
per_diff_test  = {'Models': [], 'Scores': []}

# Weight given to the teacher's prediction in the blended target
mix_par = 0.01

for model_name in models:
    model         = select_model(model_name)
    teacher_model = clone(model)
    feature_cols  = models_features_per[model_name]

    hard_labels, soft_labels, X_test_kfold = [], [], []

    # Each fold: fit the teacher on the other folds, predict the held-out one
    for train_index, test_index in kf.split(X_train):
        X_training, y_training = X_train.iloc[train_index], y_train.iloc[train_index]
        X_testing,  y_testing  = X_train.iloc[test_index],  y_train.iloc[test_index]

        teacher_model.fit(X_training[feature_cols], y_training)
        soft_output_labels = teacher_model.predict(X_testing[feature_cols])

        X_test_kfold.append(X_testing)
        soft_labels.extend(soft_output_labels.tolist())
        hard_labels.extend(y_testing.values.tolist())

    # Rows are kept in fold order so features and blended labels line up
    y_new_labels = mix_par * np.array(soft_labels) + (1 - mix_par) * np.array(hard_labels)
    X_test_kfold = pd.concat(X_test_kfold, axis=0)

    # Student: a fresh clone trained on the blended targets
    student_model = clone(model)
    student_model.fit(X_test_kfold[feature_cols], y_new_labels)

    y_pred          = student_model.predict(X_test[feature_cols])
    per_error_final = per_error(y_test, y_pred, y_LOD)

    print(f"Model Name: {model_name}", f" | % error: {per_error_final}")
    

## Outlier Detection with Knowledge Distillation

In [None]:
def find_outlier(t, st):
    """Flag samples whose error ``t - st`` is improbably large under a Gaussian model.

    The error spread is estimated robustly as ``sigma = 1.4826 * MAD`` where MAD is
    the median *absolute* deviation of the errors from their median. A sample is an
    outlier when ``|t - st|`` exceeds the point ``epsilon`` at which a zero-mean
    Gaussian density with that sigma drops to ``alpha / B`` (``B = len(t)``,
    ``alpha = 1``):

        epsilon = sigma * sqrt(-2 * log(sqrt(2*pi) * sigma * alpha / B))

    Parameters
    ----------
    t : array-like supporting ``t - st`` (e.g. numpy array or pandas Series)
        Reference values.
    st : array-like
        Predicted values, same length as ``t``.

    Returns
    -------
    Boolean container of the same kind as ``t - st``; True marks an outlier.
    When the threshold is undefined (empty input, zero spread, or a spread so
    large that the log argument exceeds 1) nothing is flagged.
    """
    st_err  = (t - st)
    abs_err = np.abs(st_err)
    B       = len(t)

    if B == 0:
        # Nothing to compare; an infinite threshold yields an empty all-False result.
        return abs_err > np.inf

    median_err = np.median(st_err)
    print("Median", median_err)

    # The absolute value is essential: the median of signed deviations from the
    # median is ~0, which would collapse sigma and disable the detector.
    MAD = np.median(np.abs(st_err - median_err))
    print(MAD)

    sigma = 1.4826 * MAD
    alpha = 1

    density_ratio = np.sqrt(2 * np.pi) * sigma * alpha / B
    if sigma > 0 and density_ratio <= 1:
        epsilon_outlier = sigma * np.sqrt(-2 * np.log(density_ratio))
    else:
        # log/sqrt would produce NaN here; make "flag nothing" explicit instead.
        epsilon_outlier = np.inf

    outlier_flag = abs_err > epsilon_outlier

    return outlier_flag

In [None]:
# Work in progress: inspect which held-out samples find_outlier flags when the
# teacher's predictions are compared with the true labels. Only the first fold of
# the first model is examined (see the two `break`s below).
r2_score_val,  per_diff_val  = {'Models':[], 'Scores':[]}, {'Models':[], 'Scores':[]}
r2_score_test, per_diff_test = {'Models':[], 'Scores':[]}, {'Models':[], 'Scores':[]}

mix_par = 0.01


for model_name in models:
    model      = select_model(model_name)

    teacher_model = clone(model)

    # Accumulators for the distillation step that is not wired in yet (see TODO)
    hard_labels  = []
    soft_labels  = []

    X_test_kfold = []

    # Fit the teacher on the training folds and predict the held-out fold
    for (train_index, test_index) in kf.split(X_train):

        X_training, X_testing = X_train.iloc[train_index], X_train.iloc[test_index]
        y_training, y_testing = y_train.iloc[train_index], y_train.iloc[test_index]

        teacher_model.fit(X_training[models_features_per[model_name]], y_training)
        soft_output_labels = teacher_model.predict(X_testing[models_features_per[model_name]])

        outlier_flag = find_outlier(y_testing, soft_output_labels)
        if outlier_flag.any():
            print(outlier_flag)

        # Debug: stop after the first fold
        break

    # Debug: stop after the first model
    break

# TODO: once outlier handling is decided, remove the two `break`s and restore the
# distillation steps from the previous cell inside the loops: accumulate X_testing,
# soft labels and hard labels per fold, blend them with mix_par, fit the student,
# and print a % error computed in THIS cell. The former trailing print was
# unreachable and referred to `per_error_final` left over from the previous cell.