## Experiments performed in this notebook
 1) Use the features extracted from ML12
 2) Train on ML12
 3) Check the generalizability when ML12 is normalized using only its blanks and ML4 is normalized using only its blanks
 4) 5-fold testing <br>
        -> Calculate the mean and variance from 4 folds only (using only the blanks) and test on the remaining fold <br>
		-> Repeat for every fold.

## Blank feature comparison between ML12 and ML4
```
                                        Mean ML12         Mean ML4
 univariate, area(S)                     0.625470         0.338291
 peak curvature                         35.631810         36.161643
 univariate, V_max(S)                    1.012192         1.007822
 vcenter                                 1.015360         1.016870
 univariate, max(S)                      0.008779         0.006264
 univariate, mean(S)                     0.000892         0.000487
 univariate, std(S)                      0.002334         0.002030
 univariate, max(dS/dV)                  0.296886         0.307639
 univariate, min(dS/dV)                 -0.284828         -0.265513
 univariate, max(dS/dV) - min(dS/dV)     0.581714         0.573135       
 univariate, V_max(dS/dV)                0.981520         0.998957
 univariate, V_min(dS/dV)                1.045040         1.025391
 univariate, area(dS/dV)                 0.017548         0.012530
```

For the blanks, the feature values of the two datasets are close to each other.

## Label 8 feature comparison between ML12 and ML4
```
                                        Mean ML12        Mean ML4
 univariate, area(S)                     3.544772        7.607660
 peak curvature                         80.466083        192.092580
 univariate, V_max(S)                    1.056402        1.048924
 vcenter                                 1.056426        1.050648
 univariate, max(S)                      0.054969        0.122838
 univariate, mean(S)                     0.005053        0.010868
 univariate, std(S)                      0.013428        0.029184
 univariate, max(dS/dV)                  1.221215        2.677500
 univariate, min(dS/dV)                 -1.391168        -3.465284
 univariate, max(dS/dV) - min(dS/dV)     2.612391        6.142800
 univariate, V_max(dS/dV)                1.025362        1.021280
 univariate, V_min(dS/dV)                1.080255        1.075200
 univariate, area(dS/dV)                 0.109943        0.245688
```

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.gaussian_process.kernels import Matern, RBF

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from typing import Tuple

from src.load_models import select_model
from src.load_dataset import load_dataset
from src.config import models_features_r2, models_features_per, model_name_conversion
from src.graph_visualization import visualize_highest_score_feature_selection, feature_selection_tabularize, visualization_testing_dataset

from src.utils import find_adj_score, calculate_y_LOD, per_error

In [3]:
# Load Training Dataset
normalize_blanks = False
normalization    = True
standardize_type = 'mean_std'

%matplotlib
(X_train, X_test, y_train, y_test), scaler      = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML1_ML2', normalization= normalization, 
                                                        normalize_blanks=normalize_blanks, standardize_type=standardize_type)
(X_train_4, X_test_4, y_train_4, y_test_4), scaler_4   = load_dataset('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/ML4', normalization=False, normalize_blanks=False)

Using matplotlib backend: MacOSX


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


######Data Distribution:#########
Training {0: 50, 16: 50, 8: 47}
Testing {0: 34, 8: 31, 16: 34}
#################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


######Data Distribution:#########
Training {0: 23, 8: 25, 16: 21}
Testing {0: 15, 8: 16, 16: 15}
#################################


In [4]:
# Show the per-feature mean and scale stored on the scaler
(scaler.mean_, scaler.scale_)

(array([ 3.53515238e+00,  8.54187605e+01,  1.04138027e+00,  1.04299320e+00,
         5.41466612e-02,  5.04761905e-03,  1.32197279e-02,  1.23252993e+00,
        -1.36619728e+00,  2.59872993e+00,  1.01115646e+00,  1.06908844e+00,
         1.08291156e-01]),
 array([2.66174099e+00, 5.27730758e+01, 2.75560619e-02, 2.35275194e-02,
        4.11144284e-02, 3.80524760e-03, 9.79721354e-03, 8.67778885e-01,
        9.97902816e-01, 1.86112419e+00, 2.93317178e-02, 2.96503261e-02,
        8.22300754e-02]))

In [6]:
# Pool the train and test splits of each dataset back into a single set

# ML4
X_4 = pd.concat([X_train_4, X_test_4], axis=0)
y_4 = pd.concat([y_train_4, y_test_4], axis=0)

# ML1_ML2
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [7]:
# Names of the estimators to evaluate; each name is passed to select_model and
# used as a key into models_features_r2 / models_features_per.
models = [
    'Linear', 'KNN', 'SVM', 'RF', 'GP', 'Ridge', 'Lasso',
    'univariate, std(S)',
    'univariate, max(dS/dV)',
    'univariate, area(dS/dV)',
    'univariate, area(S)',
    'univariate, max(S)',
]
metric = 'per_error'
nor_with_blank = False

# R2 is a score (higher is better); percent error is a loss (lower is better)
if metric == 'r2':
    scorer = make_scorer(r2_score, greater_is_better=True)
else:
    scorer = make_scorer(per_error, greater_is_better=False)

# Result tables filled by the evaluation loop: one entry per model
score_dict_r2  = dict(Models=[], Scores=[])
score_dict_per = dict(Models=[], Scores=[])

training_dataset = 'ML12'
testing_dataset  = 'ML4'

# Pick which dataset is used for fitting and which for testing
if training_dataset == 'ML12':
    X_training, y_training, X_testing, y_testing = X, y, X_4, y_4
else:
    X_training, y_training, X_testing, y_testing = X_4, y_4, X, y

# Calculate y_LOD on the testing dataset
y_LOD = calculate_y_LOD(X_testing, y_testing)
print("y_LOD", y_LOD)

# 5 consecutive folds (no shuffling)
kf = KFold(n_splits=5)

y_LOD 0.9117010154341676


In [8]:
# Generalization test.
# Every model is fitted once on the full training dataset, then evaluated on the
# testing dataset fold by fold: the normalization statistics of each held-out fold
# are computed from the other four folds only.

# Start from empty result tables so that re-running this cell does not append
# duplicate rows to the dictionaries read by the plotting cell.
score_dict_r2  = {'Models': [], 'Scores': []}
score_dict_per = {'Models': [], 'Scores': []}

# The fold split and its scaler depend only on the testing data, not on the model,
# so each held-out fold is normalized once here and reused for every model.
normalized_folds = []
for train_ind, test_ind in kf.split(X_testing):
    # Features
    X_train_temp = X_testing.iloc[train_ind].copy()
    X_test_temp  = X_testing.iloc[test_ind].copy()

    # Labels. They must come from y_testing: y_train_temp is used below as the
    # row mask that selects the blanks (label 0) of the four fitting folds.
    y_train_temp = y_testing.to_numpy()[train_ind].copy()
    y_test_temp  = y_testing.to_numpy()[test_ind].copy()

    # Use a dedicated name so the scaler returned by load_dataset is not overwritten
    fold_scaler = StandardScaler() if standardize_type == 'mean_std' else MinMaxScaler()

    # Fit the scaler on the blanks only when requested, otherwise on all four folds
    if normalize_blanks and standardize_type == 'mean_std':
        fold_scaler.fit(X_train_temp[y_train_temp == 0].copy())
    else:
        fold_scaler.fit(X_train_temp)

    # mean_ exists only on StandardScaler; printing it for MinMaxScaler would raise
    if isinstance(fold_scaler, StandardScaler):
        print(fold_scaler.mean_, fold_scaler.scale_)

    # Normalize the held-out fold and restore the column names used for feature selection
    X_test_temp = pd.DataFrame(fold_scaler.transform(X_test_temp), columns=X_testing.columns)
    normalized_folds.append((X_test_temp, y_test_temp))

for model_name in models:

    model     = select_model(model_name)
    model_r2  = clone(model)
    model_per = clone(model)

    model_r2.fit(X_training[models_features_r2[model_name]],   y_training)   # Fit model with the R2 feature set on the training dataset
    model_per.fit(X_training[models_features_per[model_name]], y_training)   # Fit model with the percent-error feature set on the training dataset

    y_pred_r2  = []
    y_pred_per = []
    y_gt       = []

    for X_test_temp, y_test_temp in normalized_folds:
        y_pred_r2  += model_r2.predict(X_test_temp[models_features_r2[model_name]]).tolist()      # Inference with the R2 feature set on the held-out fold
        y_pred_per += model_per.predict(X_test_temp[models_features_per[model_name]]).tolist()    # Inference with the percent-error feature set on the held-out fold

        y_gt       += y_test_temp.tolist()

    # Scores are computed once over the predictions pooled from all folds
    r2Score      = r2_score(y_gt, y_pred_r2)
    adj_r2_Score = find_adj_score(len(y_gt), len(models_features_r2[model_name]), r2Score)

    score_dict_r2['Models'].append(model_name)
    score_dict_per['Models'].append(model_name)

    score_dict_r2['Scores'].append((r2Score, adj_r2_Score))
    score_dict_per['Scores'].append(per_error(pd.Series(y_gt), y_pred_per, y_LOD=y_LOD))

score_dict_r2, score_dict_per

[ 6.54620000e+00  1.69678267e+02  1.03529130e+00  1.04212391e+00
  1.07293094e-01  9.35760870e-03  2.55304348e-02  2.41983152e+00
 -3.09523696e+00  5.51507717e+00  1.01160870e+00  1.06126087e+00
  2.14584783e-01] [5.17662883e+00 1.18007611e+02 3.32540212e-02 2.07313424e-02
 8.48600913e-02 7.39174383e-03 1.99927038e-02 1.82026278e+00
 2.44173696e+00 4.25916638e+00 3.22115819e-02 3.02104465e-02
 1.69718649e-01]
[ 6.72070652e+00  1.71942567e+02  1.03791630e+00  1.04162609e+00
  1.09484585e-01  9.60543478e-03  2.61369565e-02  2.46485109e+00
 -3.14709130e+00  5.61195326e+00  1.01078261e+00  1.06291304e+00
  2.18969565e-01] [5.35115356e+00 1.17963412e+02 2.86374499e-02 2.02871606e-02
 8.68624418e-02 7.64424143e-03 2.05331477e-02 1.84806397e+00
 2.47326472e+00 4.31830014e+00 3.09374636e-02 3.01134277e-02
 1.73725338e-01]
[ 6.86402609e+00  1.74563263e+02  1.03869130e+00  1.04332174e+00
  1.12036656e-01  9.80978261e-03  2.66923913e-02  2.51858043e+00
 -3.22719457e+00  5.74578478e+00  1.01134783

  model = cd_fast.enet_coordinate_descent(


({'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univariate, max(S)'],
  'Scores': [(0.8354192924613687, 0.8278697187210645),
   (0.9059127530626641, 0.9024913986285791),
   (0.8988181205656839, 0.8921987452755884),
   (0.8958999879362434, 0.8940410591493906),
   (0.9002758157488678, 0.8993933008439906),
   (0.8411509727365516, 0.830758980298756),
   (0.8355207680046207, 0.8279758491057501),
   (0.8657898203828633, 0.8646021196782869),
   (0.8711078307209306, 0.869967192054744),
   (0.8684504913679727, 0.8672863364243264),
   (0.8632565171457972, 0.8620463978285033),
   (0.8684466262919064, 0.8672824371440472)]},
 {'Models': ['Linear',
   'KNN',
   'SVM',
   'RF',
   'GP',
   'Ridge',
   'Lasso',
   'univariate, std(S)',
   'univariate, max(dS/dV)',
   'univariate, area(dS/dV)',
   'univariate, area(S)',
   'univariate, max(S)'],
  'Score

In [9]:
# Output folder for the generalization figures (created if missing)
savedir = 'Generability_output_diff_mean_std'
os.makedirs(savedir, exist_ok=True)

# R2 figure: adjusted scores are plotted
visualization_testing_dataset(
    score_dict_r2,
    f'{savedir}/r2_score_D_{training_dataset}_T_{testing_dataset}.png',
    model_name_conversion,
    only_one_multivariate=False,
    adj_score=True,
    legends=True,
)

# Percent-error figure
visualization_testing_dataset(
    score_dict_per,
    f'{savedir}/per_error_score_D_{training_dataset}_T_{testing_dataset}.png',
    model_name_conversion,
    only_one_multivariate=False,
    r2_score=False,
    adj_score=False,
    legends=True,
)

In [None]:
# Preview the first rows of the testing features instead of displaying the whole frame
X_testing.head()

In [None]:
# Debug check of the blank mask (entries equal to 0).
# NOTE(review): y_train_temp is a loop variable left over from the last fold of the
# cross-validation cell, so this cell only works after that cell has been run.
y_train_temp==0