In [None]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.svm import SVR

sys.path.insert(0, '../')

from src.load_dataset import load_dataset
from src.config import *
from src.utils import per_error, calculate_r2_score, calculate_per_diff, find_adj_score
from src.graph_visualization import visualization_testing_dataset

In [None]:
# Load dataset
normalization     = True                                      # normalize the features while loading
standardize_type  = 'mean_std' if normalization else 'none'   # select normalization type (min_max, mean_std)
split             = True                                      # split each batch dataset into training and testing set
combat_norm       = False                                     # apply combat normalization (not passed to load_dataset in this cell)
normalize_blanks  = False
test_nor_separate = False

# Root folder that holds the batch datasets. The default is the location that
# was previously hard-coded here; set VGRAMREG_DATASET_DIR to run elsewhere.
dataset_root = os.environ.get('VGRAMREG_DATASET_DIR', '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset')

ML1_path = os.path.join(dataset_root, 'ML1_ML2', '2024_02_19_ML1')
ML2_path = os.path.join(dataset_root, 'ML1_ML2', '2024_02_22_ML2')
ML4_path = os.path.join(dataset_root, 'ML4')

# Keyword arguments shared by every load_dataset call in both branches
load_kwargs = dict(normalization=normalization, standardize_type=standardize_type, split=split)

if split:
    # The blank / separate-test normalization flags are only passed when loading pre-split data
    split_kwargs = dict(load_kwargs, normalize_blanks=normalize_blanks, test_nor_separate=test_nor_separate)

    (ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), _ = load_dataset(ML1_path, **split_kwargs)
    (ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), _ = load_dataset(ML2_path, **split_kwargs)
    (ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), _ = load_dataset(ML4_path, **split_kwargs)

else:
    ML1_X, ML1_y = load_dataset(ML1_path, **load_kwargs)
    ML2_X, ML2_y = load_dataset(ML2_path, **load_kwargs)
    ML4_X, ML4_y = load_dataset(ML4_path, **load_kwargs)

In [None]:
# Prepare the inputs for the PCA / t-SNE batch visualization
if split:
    batch_names      = ['ML1', 'ML2', 'ML4']
    train_features   = [ML1_X_train, ML2_X_train, ML4_X_train]
    test_features    = [ML1_X_test,  ML2_X_test,  ML4_X_test]

    # All training rows stacked in batch order
    data_train = pd.concat(train_features)

    # One batch name per row, in the same order as the stacked frames
    batch_labels_train = np.repeat(batch_names, repeats=[len(frame) for frame in train_features])
    batch_labels_test  = np.repeat(batch_names, repeats=[len(frame) for frame in test_features])

    # Target values flattened into plain lists, again in batch order
    labels_train = [value for target in (ML1_y_train, ML2_y_train, ML4_y_train) for value in target.values.tolist()]
    labels_test  = [value for target in (ML1_y_test,  ML2_y_test,  ML4_y_test)  for value in target.values.tolist()]

    # tsen_pca_viz([ML1_X_train, ML2_X_train, ML4_X_train], batch_labels_train, labels_train)
    # tsen_pca_viz([ML1_X_test,  ML2_X_test,  ML4_X_test],  batch_labels_test,  labels_test)

In [None]:
# Build the combined train/test sets from the three batches.
# Seed shared by the stratified split and the row shuffling so a fresh
# "Restart & Run All" reproduces the same row order (and hence the same CV folds).
shuffle_seed = 20

if not split:
    # Batches were loaded whole: perform a stratified train/test split here
    X                = pd.concat([ML1_X, ML2_X, ML4_X], axis=0)
    y                = ML1_y.values.tolist() + ML2_y.values.tolist() + ML4_y.values.tolist()
    # Stratify on "<batch>_<label>" so every batch/label combination is represented in both sets
    stratified_label = ML1_y.map(lambda x: f'ML1_{x}').values.tolist() + ML2_y.map(lambda x: f'ML2_{x}').values.tolist() + ML4_y.map(lambda x: f'ML4_{x}').values.tolist()

    # Split the numeric targets together with the stratification labels instead of
    # rebuilding them from the label strings with eval().
    X_train, X_test, y_train, y_test, y_train_strat, y_test_strat = train_test_split(
        X, y, stratified_label, test_size=0.4, shuffle=True, random_state=shuffle_seed, stratify=stratified_label
    )

    # Give features and targets the same positional index
    X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
    y_train, y_test = pd.Series(y_train), pd.Series(y_test)

else:
    # Built before shuffling, so these follow the concatenation order (ML1, ML2, ML4)
    y_train_strat = ML1_y_train.map(lambda x: f'ML1_{x}').values.tolist() + ML2_y_train.map(lambda x: f'ML2_{x}').values.tolist() + ML4_y_train.map(lambda x: f'ML4_{x}').values.tolist()
    y_test_strat  = ML1_y_test.map(lambda x: f'ML1_{x}').values.tolist() + ML2_y_test.map(lambda x: f'ML2_{x}').values.tolist() + ML4_y_test.map(lambda x: f'ML4_{x}').values.tolist()

    # Combine ML1, ML2, and ML4 dataset
    X_train = pd.concat([ML1_X_train, ML2_X_train, ML4_X_train], axis=0).reset_index(drop=True)
    X_test  = pd.concat([ML1_X_test,  ML2_X_test,  ML4_X_test],  axis=0).reset_index(drop=True)
    y_train = pd.concat([ML1_y_train, ML2_y_train, ML4_y_train], axis=0).reset_index(drop=True)
    y_test  = pd.concat([ML1_y_test,  ML2_y_test,  ML4_y_test],  axis=0).reset_index(drop=True)

    # Seeded permutations: every row position appears exactly once
    rng               = np.random.default_rng(shuffle_seed)
    shuffle_ind_train = rng.permutation(len(X_train))
    shuffle_ind_test  = rng.permutation(len(X_test))

    # Shuffle features and targets with the same permutation
    X_train, y_train = X_train.iloc[shuffle_ind_train], y_train.iloc[shuffle_ind_train]
    X_test,  y_test  = X_test.iloc[shuffle_ind_test],   y_test.iloc[shuffle_ind_test]

    # Check if the labels and features have same index after shuffling
    assert X_train.index.tolist()==y_train.index.tolist()
    assert X_test.index.tolist()==y_test.index.tolist()
    

In [None]:
def FindBestGridSearch(model, features, X_train, y_train, param_grid, metric):
    """Grid-search `param_grid` for `model` with 5-fold cross-validation.

    Parameters
    ----------
    model : estimator
        scikit-learn estimator used as a template; it is cloned before the
        search, so the object passed in is not fitted by this function.
    features : list
        Column labels selected from `X_train` for fitting.
    X_train : DataFrame-like
        Training features; must support `X_train[features]`.
    y_train : array-like
        Training targets.
    param_grid : dict
        Hyper-parameter grid handed to `GridSearchCV`.
    metric : str
        'r2' scores with `r2_score` (higher is better); any other value
        scores with `per_error` (lower is better).

    Returns
    -------
    tuple
        `(best_estimator, best_score)` where `best_score` is the mean
        cross-validated value of the chosen metric in its natural units
        (i.e. a positive-is-worse error for `per_error`, not its negation).
    """
    higher_is_better = metric == 'r2'
    score_func = r2_score if higher_is_better else per_error
    scorer = make_scorer(score_func, greater_is_better=higher_is_better)

    # Instantiate the grid search model on a fresh copy of the estimator
    grid_search = GridSearchCV(estimator=clone(model), param_grid=param_grid, cv=5, verbose=1, n_jobs=-1, scoring=scorer)

    # Fit the grid search to the data
    grid_search.fit(X_train[features], y_train)

    # make_scorer sign-flips lower-is-better metrics so GridSearchCV can always
    # maximise; flip it back so the reported value is the actual error.
    best_score = grid_search.best_score_ if higher_is_better else -grid_search.best_score_

    # Print the best parameters and best score
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {best_score}")

    return grid_search.best_estimator_, best_score

In [None]:
# Candidate hyper-parameter values shared across the kernel grids
C_all    = [0.1, 1, 10, 100, 200]
coef_all = [0.1, 0, 1, 2]
degrees  = [2, 3, 4]
gammas   = [0.01, 0.1, 1, 2]

# One search grid per kernel, keyed by the kernel's display name
param_grids = dict(
    Linear=dict(C=C_all),
    Poly=dict(coef0=coef_all, degree=degrees, gamma=gammas, C=C_all),
    RBF=dict(gamma=gammas, C=C_all),
    Sigmoid=dict(coef0=coef_all, C=C_all),
)

# Base SVR estimators, one per kernel
svr_linear  = SVR(kernel='linear')
svr_poly    = SVR(kernel='poly', gamma='scale', degree=3, coef0=1)
svr_rbf     = SVR(kernel='rbf', gamma=0.01, C=100)
svr_sigmoid = SVR(kernel='sigmoid', coef0=1)

In [None]:
# Third argument given to per_error on the test set
# (presumably the limit of detection of the target — TODO confirm)
y_LOD = 0.9117010154341669
kf    = KFold(n_splits=5)

# Result containers: parallel lists of model names and their scores
r2_score_val, per_diff_val, r2_score_test, per_diff_test = (
    {'Models': [], 'Scores': []} for _ in range(4)
)

svm_names  = ['Linear', 'Poly', 'RBF', 'Sigmoid']
svm_models = [svr_linear, svr_poly, svr_rbf, svr_sigmoid]

# Feature subsets used for the R2-optimised and the percent-error-optimised models
features_r2  = models_features_r2['SVM']
features_per = models_features_per['SVM']

for model_name, model in zip(svm_names, svm_models):

    param_grid = param_grids[model_name]

    # Tune the kernel once per metric and record the validation scores
    model_r2, val_r2        = FindBestGridSearch(model, features_r2,  X_train, y_train, param_grid, metric='r2')
    model_per_diff, val_per = FindBestGridSearch(model, features_per, X_train, y_train, param_grid, metric='per_error')

    r2_score_val['Scores'].append(val_r2)
    per_diff_val['Scores'].append(val_per)

    # R2 model: fit on the training set, then score on the held-out test set
    y_pred_r2     = model_r2.fit(X_train[features_r2], y_train).predict(X_test[features_r2])
    r2_test_score = r2_score(y_test, y_pred_r2)
    adj_r2_test   = find_adj_score(len(y_pred_r2), len(features_r2), r2_test_score)

    r2_score_test['Scores'].append((r2_test_score, adj_r2_test))

    # Percent-error model: same procedure with its own feature subset
    y_pred_per_diff = model_per_diff.fit(X_train[features_per], y_train).predict(X_test[features_per])

    per_diff_test['Scores'].append(per_error(y_test, y_pred_per_diff, y_LOD))

    # Record the kernel name alongside each score just appended
    for results in (r2_score_val, per_diff_val, r2_score_test, per_diff_test):
        results['Models'].append(model_name)


In [None]:
# Output folder for the result figures.
# NOTE(review): the folder is named "RandomForest_..." although this notebook
# evaluates SVR kernels — confirm this is the intended location.
savedir   = '../results/RandomForest_Kernel_Selection'
adj_score = False

os.makedirs(savedir, exist_ok=True)

# Keyword arguments for the R2 plots and for the percent-error plots
r2_plot_kwargs  = dict(only_one_multivariate=False, adj_score=adj_score, legends=True)
per_plot_kwargs = dict(only_one_multivariate=False, r2_score=False, adj_score=False, legends=True)

# (scores, output file name, plot options) — validation figures first, then test
plot_specs = [
    (r2_score_val,  'r2_score_val.png',   r2_plot_kwargs),
    (per_diff_val,  'per_error_val.png',  per_plot_kwargs),
    (r2_score_test, 'r2_score_test.png',  r2_plot_kwargs),
    (per_diff_test, 'per_error_test.png', per_plot_kwargs),
]

for scores, filename, plot_kwargs in plot_specs:
    visualization_testing_dataset(scores, f'{savedir}/{filename}', model_name_conversion, **plot_kwargs)