## Experiments performed in this notebook
 1) Use Features extracted from ML12
 2) Training ML12
 3) Check the generalizability when ML12 is normalized using only the blanks and ML4 is also normalized using only the blanks
 4) 5-fold testing <br>
        -> Calculate the variance and mean only from 4 folds (only from the blanks) and test on 1 fold <br>
		-> Repeat for every fold.

## Blank feature comparison between ML12 and ML4
```
                                        Mean ML12         Mean ML4
 univariate, area(S)                     0.625470         0.338291
 peak curvature                         35.631810         36.161643
 univariate, V_max(S)                    1.012192         1.007822
 vcenter                                 1.015360         1.016870
 univariate, max(S)                      0.008779         0.006264
 univariate, mean(S)                     0.000892         0.000487
 univariate, std(S)                      0.002334         0.002030
 univariate, max(dS/dV)                  0.296886         0.307639
 univariate, min(dS/dV)                 -0.284828         -0.265513
 univariate, max(dS/dV) - min(dS/dV)     0.581714         0.573135       
 univariate, V_max(dS/dV)                0.981520         0.998957
 univariate, V_min(dS/dV)                1.045040         1.025391
 univariate, area(dS/dV)                 0.017548         0.012530
```

For the blanks, the mean feature values of the two datasets (ML12 and ML4) are close to each other.

## Label 8 feature comparison between ML12 and ML4
```
                                        Mean ML12        Mean ML4
 univariate, area(S)                     3.544772        7.607660
 peak curvature                         80.466083        192.092580
 univariate, V_max(S)                    1.056402        1.048924
 vcenter                                 1.056426        1.050648
 univariate, max(S)                      0.054969        0.122838
 univariate, mean(S)                     0.005053        0.010868
 univariate, std(S)                      0.013428        0.029184
 univariate, max(dS/dV)                  1.221215        2.677500
 univariate, min(dS/dV)                 -1.391168        -3.465284
 univariate, max(dS/dV) - min(dS/dV)     2.612391        6.142800
 univariate, V_max(dS/dV)                1.025362        1.021280
 univariate, V_min(dS/dV)                1.080255        1.075200
 univariate, area(dS/dV)                 0.109943        0.245688
```

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.gaussian_process.kernels import Matern, RBF

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from typing import Tuple
from pycombat import Combat

from src.load_models import select_model
from src.load_dataset import load_dataset, select_normalizer
from src.config import models_features_r2, models_features_per, model_name_conversion
from src.graph_visualization import visualize_highest_score_feature_selection, feature_selection_tabularize, visualization_testing_dataset

from src.utils import find_adj_score, calculate_y_LOD, per_error

In [2]:
# Load Training Dataset
normalize_blanks   = False
normalization      = True
use_training_scale = False
standardize_type   = 'mean_std'

%matplotlib
ML1_X, ML1_y  = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML1_ML2/2024_02_19_ML1', normalization=normalization, standardize_type=standardize_type, split=False)
ML2_X, ML2_y  = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML1_ML2/2024_02_22_ML2', normalization=normalization, standardize_type=standardize_type, split=False)
ML4_X, ML4_y      = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML4', normalization=normalization, standardize_type=standardize_type, split=False)

Using matplotlib backend: <object object at 0x13fb759a0>


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univar

In [3]:
# Combine ML1 and ML2 into a single training set.
# Currently disabled: the cross-validation cell below concatenates ML1 and ML2
# itself (together with the ML4 folds), so no combined X / y is created here.
# X, y     = pd.concat([ML1_X, ML2_X], axis=0), pd.concat([ML1_y, ML2_y], axis=0)

In [4]:
# Names handed to `select_model` and used as keys into the per-model feature tables.
# The 'univariate, ...' entries carry the same names as individual feature columns.
models = [
    'Linear', 'KNN', 'SVM', 'RF', 'GP', 'Ridge', 'Lasso',
    'univariate, std(S)',
    'univariate, max(dS/dV)',
    'univariate, area(dS/dV)',
    'univariate, area(S)',
    'univariate, max(S)',
]
metric = 'per_error'
nor_with_blank = False

# R2 is a score to maximise; the percent-error metric is a loss to minimise.
if metric == 'r2':
    scorer = make_scorer(r2_score, greater_is_better=True)
else:
    scorer = make_scorer(per_error, greater_is_better=False)

# Result accumulators: one entry per model, filled by the evaluation loop.
score_dict_r2  = dict(Models=[], Scores=[])
score_dict_per = dict(Models=[], Scores=[])

# Dataset tags (used to label the saved figures).
training_dataset, testing_dataset = 'ML12', 'ML4'

# y_LOD is hard-coded here rather than recomputed in this notebook
# (`calculate_y_LOD` is imported above but not called).
y_LOD = 0.9117010154341676
print("y_LOD", y_LOD)

# 5 folds with scikit-learn's default settings (no shuffling).
kf = KFold(n_splits=5)

y_LOD 0.9117010154341676


In [5]:
# Cross-dataset evaluation with ComBat harmonisation.
#
# For every model and every one of the `kf` folds over ML4:
#   1. fit ComBat on ML1 + ML2 + the ML4 training folds (batch labels 'ML1', 'ML2', 'ML4'),
#   2. apply the fitted ComBat to ML1 + ML2 + the held-out ML4 fold,
#   3. train the model on the harmonised ML1 + ML2 rows only,
#   4. predict the harmonised held-out ML4 fold.
# The out-of-fold predictions are pooled over all folds before scoring.

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every model's scores to the dictionaries that are plotted later.
score_dict_r2  = {'Models': [], 'Scores': []}
score_dict_per = {'Models': [], 'Scores': []}

# Fold-independent pieces, built once instead of once per model and per fold.
ML12_X       = pd.concat([ML1_X, ML2_X])            # training features: ML1 rows followed by ML2 rows
y_training   = pd.concat([ML1_y, ML2_y], axis=0)    # training labels in the same row order
n_ML12       = len(ML12_X)
ML4_y_values = ML4_y.to_numpy()

for model_name in models:

    model     = select_model(model_name)

    # Two independent copies: one trained on the R2-selected feature set,
    # one on the percent-error-selected feature set.
    model_r2  = clone(model)
    model_per = clone(model)

    y_pred_r2  = []
    y_pred_per = []
    y_gt       = []

    for train_ind, test_ind in kf.split(ML4_X):
        # ML4 features / labels for this split
        X_train_temp = ML4_X.iloc[train_ind]
        X_test_temp  = ML4_X.iloc[test_ind]

        y_train_temp = ML4_y_values[train_ind]
        y_test_temp  = ML4_y_values[test_ind]

        # Fit ComBat on ML1 + ML2 + the ML4 training folds
        fit_batches   = np.repeat(['ML1', 'ML2', 'ML4'], repeats=[len(ML1_X), len(ML2_X), len(y_train_temp)])
        combat        = Combat()
        combat_scaler = combat.fit(pd.concat([ML12_X, X_train_temp]).values, fit_batches)

        # Apply the fitted ComBat to ML1 + ML2 + the held-out ML4 fold
        eval_batches  = np.repeat(['ML1', 'ML2', 'ML4'], repeats=[len(ML1_X), len(ML2_X), len(y_test_temp)])
        harmonized    = combat_scaler.transform(pd.concat([ML12_X, X_test_temp]).values, eval_batches)
        harmonized    = pd.DataFrame(harmonized, columns=ML1_X.columns)

        # The first n_ML12 rows are the harmonised training set, the rest the harmonised test fold
        X_training   = harmonized.iloc[:n_ML12]
        X_test_temp  = harmonized.iloc[n_ML12:]

        assert len(X_test_temp) == len(y_test_temp)

        model_r2.fit(X_training[models_features_r2[model_name]],   y_training)                 # Fit on ML1 + ML2 with the R2-selected features
        model_per.fit(X_training[models_features_per[model_name]], y_training)                 # Fit on ML1 + ML2 with the percent-error-selected features

        y_pred_r2  += model_r2.predict(X_test_temp[models_features_r2[model_name]]).tolist()      # Predict the held-out ML4 fold (R2 features)
        y_pred_per += model_per.predict(X_test_temp[models_features_per[model_name]]).tolist()    # Predict the held-out ML4 fold (percent-error features)

        y_gt       += y_test_temp.tolist()

    # Score the pooled out-of-fold predictions
    r2Score      = r2_score(y_gt, y_pred_r2)
    adj_r2_Score = find_adj_score(len(y_gt), len(models_features_r2[model_name]), r2Score)

    score_dict_r2['Models'].append(model_name)
    score_dict_per['Models'].append(model_name)

    score_dict_r2['Scores'].append((r2Score, adj_r2_Score))
    score_dict_per['Scores'].append(per_error(pd.Series(y_gt), y_pred_per, y_LOD=y_LOD))

score_dict_r2, score_dict_per

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


({'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univariate, max(S)'],
  'Scores': [(0.8295665632298912, 0.8217485156716293),
   (0.8937724631352659, 0.8899096436129119),
   (0.8988743374052509, 0.8922586398523233),
   (0.8857362965074588, 0.8836958732308062),
   (0.8920369517348361, 0.8910815265289497),
   (0.8362742456375137, 0.8255632149782857),
   (0.8301777007446831, 0.822387687017375),
   (0.8754192692874051, 0.8743167849448158),
   (0.8832851302875492, 0.8822522553343417),
   (0.877876768618915, 0.8767960320580204),
   (0.8732562811476315, 0.8721346553170797),
   (0.8778745954565963, 0.8767938396641768)]},
 {'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univariate, max(S)'],
  'Score

In [6]:
# Output folder for the figures; the file names encode the experiment settings.
savedir = 'Generability_output_diff_mean_std'

# Part of the file name shared by both figures.
run_suffix = (
    f'D_{training_dataset}_T_{testing_dataset}'
    f'_sTrain_{use_training_scale}_nBlank_{normalize_blanks}'
    f'_nType_{standardize_type}_ComBat.png'
)
r2_savefile  = f'{savedir}/r2_score_{run_suffix}'
per_savefile = f'{savedir}/per_error_score_{run_suffix}'

os.makedirs(savedir, exist_ok=True)

# Figure for the (R2, adjusted R2) pairs collected per model
visualization_testing_dataset(
    score_dict_r2,
    r2_savefile,
    model_name_conversion,
    only_one_multivariate=False,
    adj_score=True,
    legends=True,
)

# Figure for the percent-error scores collected per model
visualization_testing_dataset(
    score_dict_per,
    per_savefile,
    model_name_conversion,
    only_one_multivariate=False,
    r2_score=False,
    adj_score=False,
    legends=True,
)

In [None]:
# NOTE(review): `X_testing` is not assigned anywhere in the visible notebook, so this
# cell would raise NameError on a fresh kernel — confirm it is defined elsewhere or drop the cell.
X_testing

In [None]:
# Leftover inspection cell: element-wise mask of the entries of `y_train_temp` equal to 0.
# `y_train_temp` is only assigned inside the cross-validation loop above, so here it holds
# whatever the last fold left behind (presumably 0 marks the blanks — TODO confirm).
y_train_temp==0