## The following experiments are performed in this notebook
 1) Use Features extracted from ML12
 2) Training ML12
 3) Check the generalizability when ML12 is normalized using only its blanks and ML4 is normalized using only its blanks
 4) 5-fold testing <br>
        -> Calculate the mean and variance from 4 folds only (using only the blanks) and test on the remaining fold <br>
		-> Repeat for every fold.

## Blank feature comparison between ML12 and ML4
```
                                        Mean ML12        Mean ML4
 univariate, area(S)                     0.625470         0.338291
 peak curvature                         35.631810         36.161643
 univariate, V_max(S)                    1.012192         1.007822
 vcenter                                 1.015360         1.016870
 univariate, max(S)                      0.008779         0.006264
 univariate, mean(S)                     0.000892         0.000487
 univariate, std(S)                      0.002334         0.002030
 univariate, max(dS/dV)                  0.296886         0.307639
 univariate, min(dS/dV)                 -0.284828         -0.265513
 univariate, max(dS/dV) - min(dS/dV)     0.581714         0.573135       
 univariate, V_max(dS/dV)                0.981520         0.998957
 univariate, V_min(dS/dV)                1.045040         1.025391
 univariate, area(dS/dV)                 0.017548         0.012530
```

For the blanks, the feature values of the two datasets are close to each other.

## Label 8 feature comparison between ML12 and ML4
```
                                        Mean ML12        Mean ML4
 univariate, area(S)                     3.544772        7.607660
 peak curvature                         80.466083        192.092580
 univariate, V_max(S)                    1.056402        1.048924
 vcenter                                 1.056426        1.050648
 univariate, max(S)                      0.054969        0.122838
 univariate, mean(S)                     0.005053        0.010868
 univariate, std(S)                      0.013428        0.029184
 univariate, max(dS/dV)                  1.221215        2.677500
 univariate, min(dS/dV)                 -1.391168        -3.465284
 univariate, max(dS/dV) - min(dS/dV)     2.612391        6.142800
 univariate, V_max(dS/dV)                1.025362        1.021280
 univariate, V_min(dS/dV)                1.080255        1.075200
 univariate, area(dS/dV)                 0.109943        0.245688
```

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.gaussian_process.kernels import Matern, RBF

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from typing import Tuple

from src.load_models import select_model
from src.load_dataset import load_dataset, select_normalizer
from src.config import models_features_r2, models_features_per, model_name_conversion
from src.graph_visualization import visualize_highest_score_feature_selection, feature_selection_tabularize, visualization_testing_dataset

from src.utils import find_adj_score, calculate_y_LOD, per_error

In [3]:
# Load Training Dataset
normalize_blanks   = False
normalization      = False
use_training_scale = False
standardize_type   = ''

%matplotlib
(X_train, X_test, y_train, y_test), scaler_trainer      = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML1_ML2', normalization= normalization, 
                                                        normalize_blanks=normalize_blanks, standardize_type=standardize_type)
(X_train_4, X_test_4, y_train_4, y_test_4), scaler_4    = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML4', normalization=False, normalize_blanks=False)

Using matplotlib backend: <object object at 0x28f8a18c0>
######Data Distribution:#########
Training {0: 50, 16: 50, 8: 47}
Testing {0: 34, 8: 31, 16: 34}
#################################
######Data Distribution:#########
Training {0: 23, 8: 25, 16: 21}
Testing {0: 15, 8: 16, 16: 15}
#################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


In [4]:
# Merge the train/test partitions of each dataset back into one set, because
# the experiment below trains on one whole dataset and tests on the other.
X   = pd.concat([X_train, X_test], axis=0)
y   = pd.concat([y_train, y_test], axis=0)

X_4 = pd.concat([X_train_4, X_test_4], axis=0)
y_4 = pd.concat([y_train_4, y_test_4], axis=0)

In [5]:
# Names passed to `select_model`; the same names key the feature lookup
# tables `models_features_r2` / `models_features_per`.
models = [
    'Linear', 'KNN', 'SVM', 'RF', 'GP', 'Ridge', 'Lasso',
    'univariate, std(S)',
    'univariate, max(dS/dV)',
    'univariate, area(dS/dV)',
    'univariate, area(S)',
    'univariate, max(S)',
]
metric = 'per_error'
nor_with_blank = False

# R2 is maximized; any other metric falls back to percent error, which is minimized.
scorer = make_scorer(r2_score if metric == 'r2' else per_error,
                     greater_is_better=(metric == 'r2'))

# Result containers filled by the evaluation loop (one entry per model).
score_dict_r2  = dict(Models=[], Scores=[])
score_dict_per = dict(Models=[], Scores=[])

training_dataset = 'ML12'
testing_dataset  = 'ML4'

# Pick which dataset is used for fitting and which one for testing.
if training_dataset == 'ML12':
    X_training, y_training = X, y
    X_testing,  y_testing  = X_4, y_4
else:
    X_training, y_training = X_4, y_4
    X_testing,  y_testing  = X, y

# Calculate y_LOD on the testing dataset
y_LOD = calculate_y_LOD(X_testing, y_testing)
print("y_LOD", y_LOD)

# 5 folds over the testing dataset, without shuffling (KFold default)
kf = KFold(n_splits=5)

y_LOD 0.9117010154341676


In [6]:
# Train every model on the full training dataset, then evaluate it fold by
# fold on the testing dataset. When `normalization` is on, each validation fold
# is scaled either with the training scaler or with a scaler fitted on the
# other four folds (optionally on their blank samples only).

# Start from empty containers so that re-running this cell does not append
# duplicate entries to the results.
score_dict_r2  = {'Models': [], 'Scores': []}
score_dict_per = {'Models': [], 'Scores': []}

for model_name in models:

    model     = select_model(model_name)
    model_r2  = clone(model)
    model_per = clone(model)

    # Feature subsets selected for this model under each metric
    r2_features  = models_features_r2[model_name]
    per_features = models_features_per[model_name]

    model_r2.fit(X_training[r2_features],   y_training)    # Fit with the R2 features on the training dataset
    model_per.fit(X_training[per_features], y_training)    # Fit with the percent-error features on the training dataset

    y_pred_r2  = []
    y_pred_per = []
    y_gt       = []

    for train_ind, test_ind in kf.split(X_testing):
        # Features
        X_train_temp = X_testing.iloc[train_ind].copy()
        X_test_temp  = X_testing.iloc[test_ind].copy()

        # Labels. They must come from `y_testing`: `y_train_temp == 0` is used
        # below as a row mask selecting the blank samples of the four folds.
        y_train_temp = y_testing.to_numpy()[train_ind].copy()
        y_test_temp  = y_testing.to_numpy()[test_ind].copy()

        if normalization:
            if use_training_scale:
                # Reuse the scaler returned when loading the ML1_ML2 dataset
                scaler = scaler_trainer
            else:
                scaler = select_normalizer(standardize_type)
                if normalize_blanks:
                    # Fit the scaler only on the blank (label 0) samples of the four folds
                    scaler.fit(X_train_temp[y_train_temp == 0])
                else:
                    # Fit the scaler on all samples of the four folds
                    scaler.fit(X_train_temp)

            # Normalize the held-out fold and restore the column names so the
            # feature subsets can still be selected by name.
            X_test_temp = scaler.transform(X_test_temp)
            X_test_temp = pd.DataFrame(X_test_temp, columns=X_testing.columns)

        y_pred_r2  += model_r2.predict(X_test_temp[r2_features]).tolist()      # Predict the held-out fold with the R2-feature model
        y_pred_per += model_per.predict(X_test_temp[per_features]).tolist()    # Predict the held-out fold with the percent-error-feature model

        y_gt       += y_test_temp.tolist()

    # Scores are computed once over the predictions pooled from all folds
    r2Score      = r2_score(y_gt, y_pred_r2)
    adj_r2_Score = find_adj_score(len(y_gt), len(r2_features), r2Score)

    score_dict_r2['Models'].append(model_name)
    score_dict_per['Models'].append(model_name)

    score_dict_r2['Scores'].append((r2Score, adj_r2_Score))
    score_dict_per['Scores'].append(per_error(pd.Series(y_gt), y_pred_per, y_LOD=y_LOD))

score_dict_r2, score_dict_per

({'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univariate, max(S)'],
  'Scores': [(0.18823781584456145, 0.15100101840623859),
   (0.4611366766316214, 0.4415416466909531),
   (0.41225797998552294, 0.37380756746121135),
   (0.4776878862033582, 0.4683608841712753),
   (-0.26479029762760153, -0.27598313211988135),
   (-0.25905667247478936, -0.3414248660011776),
   (-0.5776123040535401, -0.6499798409367301),
   (-1.5347043089922816, -1.557135320576284),
   (-1.703851087991513, -1.7277789737259512),
   (-1.566691763603365, -1.5894058500069344),
   (-1.0840249423417538, -1.1024676409465481),
   (-1.5664074568001194, -1.5891190272142799)]},
 {'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univaria

In [7]:
# Output folder and figure names; the names encode the full experiment
# configuration so different runs do not overwrite each other.
savedir = 'Generability_output_diff_mean_std'
run_tag = f'D_{training_dataset}_T_{testing_dataset}_sTrain_{use_training_scale}_nBlank_{normalize_blanks}_nType_{standardize_type}.png'

r2_savefile  = f'{savedir}/r2_score_{run_tag}'
per_savefile = f'{savedir}/per_error_score_{run_tag}'

os.makedirs(savedir, exist_ok=True)

# R2 figure (adjusted scores shown)
visualization_testing_dataset(
    score_dict_r2, r2_savefile, model_name_conversion,
    only_one_multivariate=False, adj_score=True, legends=True,
)

# Percent-error figure
visualization_testing_dataset(
    score_dict_per, per_savefile, model_name_conversion,
    only_one_multivariate=False, r2_score=False, adj_score=False, legends=True,
)

In [None]:
# Preview the testing features; show only the first rows instead of dumping the whole frame
X_testing.head()

In [None]:
# Debug inspection: element-wise comparison with 0 of `y_train_temp`, the
# variable left over from the final fold iteration of the evaluation loop.
# NOTE(review): depends on loop state from an earlier cell — consider removing
# this cell from the final notebook.
y_train_temp==0