In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.gaussian_process.kernels import Matern, RBF

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from typing import Tuple

from src.load_models import select_model
from src.load_dataset import load_dataset
from src.config import models_features_r2, models_features_per, model_name_conversion
from src.graph_visualization import visualize_highest_score_feature_selection, feature_selection_tabularize, visualization_testing_dataset

from src.utils import find_adj_score, calculate_y_LOD, per_error

In [60]:
# Hyper-parameter grids handed to GridSearchCV; the outer key is the model name
# and each inner mapping lists the candidate values per estimator parameter.
param_grids = dict(
    SVM=dict(
        C=[1, 10, 15, 20, 100],
        gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01],
        kernel=['rbf'],
    ),
    RF=dict(
        n_estimators=[80, 100, 120],
        max_depth=[5, 10, 15],
        min_samples_split=[6, 8, 10, 16, 20],
        min_samples_leaf=[2, 4, 6],
    ),
    KNN=dict(
        n_neighbors=[3, 5, 6, 7, 8, 9],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    GP=dict(
        # Five RBF kernels that differ only in length scale, followed by one Matern kernel.
        kernel=[1.0 * RBF(length_scale=scale) for scale in (1.0, 0.5, 1.5, 2.0, 2.5)]
               + [1.0 * Matern(length_scale=1.0, nu=1.5)],
        alpha=[0.001, 0.01, 0.1, 1, 1.5, 2, 2.5],
    ),
    Ridge=dict(alpha=[0.0001, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.1, 1.0]),
    Lasso=dict(alpha=[0.00001, 0.00003, 0.00005, 0.0001, 0.0003, 0.0005, 0.001]),
)

In [61]:
class ModelSelectionEnsemble():
    """Greedy forward feature selection for a single regression model.

    The estimator comes from ``select_model(model_name)``. Candidate feature
    sets are scored with cross-validation on the training data, either by R2
    (higher is better) or by a percentage error (lower is better).

    Attributes written by the selection methods:
        selected_features  -- features accepted so far, in selection order.
        all_feature_scores -- one dict per accepted round, mapping
                              ``str(candidate_feature_list)`` to its score.
        best_score         -- score of the final selected feature set.
    """

    def __init__(self, model_name:str, X_train:pd.DataFrame, y_train:pd.Series):
        """Store the training data and build the estimator for ``model_name``.

        Args:
            model_name: key understood by ``select_model`` and used to look up
                the grid in ``param_grids`` during tuned selection.
            X_train: training features; its columns are the candidate features.
            y_train: training targets.
        """
        # Kept on the instance so tuned selection does not rely on a notebook global.
        self.model_name = model_name
        self.X_train, self.y_train = X_train, y_train
        self.model = select_model(model_name)
        self.selected_features  = []
        self.all_feature_scores = []

        # Scale used for the error of zero-valued targets in calculate_per_diff
        # (presumably a limit of detection -- see src.utils.calculate_y_LOD).
        self.y_LOD = calculate_y_LOD(self.X_train, self.y_train)

    def save(self, path:str) -> None:
        """Pickle ``self.model`` to ``path`` (the file is overwritten)."""
        with open(path, 'wb') as f:
            # pickle.dump needs the open binary file object, not the path string.
            pickle.dump(self.model, f)

    def find_score(self, kf:KFold, features:list) -> np.ndarray:
        """Return ``array([mean R2, mean adjusted R2])`` for ``features`` under ``kf``."""
        return np.array(self.calculate_r2_score(self.model, self.X_train[features], self.y_train, kf))

    def find_per_diff(self, kf:KFold, features:list) -> np.ndarray:
        """Return the mean cross-validated percentage error for ``features`` as a 0-d array."""
        return np.array(self.calculate_per_diff(self.model, self.X_train[features], self.y_train, kf))

    def calculate_per_diff(self, model:BaseEstimator, X:pd.DataFrame, y:pd.Series, kf:KFold) -> float:
        """Cross-validated percentage error of ``model`` on ``(X, y)``.

        For every fold a fresh clone of ``model`` is fitted, predictions are
        clipped at zero, and the per-sample error is

        * ``|y - y_hat| / (0.5 * (y + y_hat))`` where ``y != 0``;
        * ``|y - y_hat| / self.y_LOD``          where ``y == 0``.

        The fold value is the mean of those errors times 100.

        Returns:
            The mean of the fold values.

        Raises:
            AssertionError: if any per-sample or per-fold error is NaN.
        """
        y_values     = y.to_numpy()
        per_diff_all = []

        for train_index, test_index in kf.split(X):
            model_ = clone(model)

            # Split the data into training and testing sets
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y_values[train_index], y_values[test_index]

            model_.fit(X_train, y_train)

            mask      = (y_test != 0)    # Non Zero Concentration
            zero_mask = ~(mask)          # Zero Concentration

            # Negative predictions are clipped to zero before scoring.
            y_pred = np.maximum(model_.predict(X_test), 0.0)

            # Only for non zero concentration: symmetric relative difference
            non_zero_per_error = np.abs(y_test[mask] - y_pred[mask])/(0.5*(y_test[mask] + y_pred[mask]))

            # Zero concentration: a relative error is undefined, so scale by y_LOD
            zero_per_error     = np.abs(y_test[zero_mask] - y_pred[zero_mask]) / self.y_LOD

            assert not(np.isnan(zero_per_error).any())
            assert not(np.isnan(non_zero_per_error).any())

            fold_error = np.mean(np.concatenate((non_zero_per_error, zero_per_error))) * 100

            assert not(np.isnan(fold_error)) # To check if any output is invalid or nan
            per_diff_all.append(fold_error)

        return np.array(per_diff_all).mean()

    def calculate_r2_score(self, model:BaseEstimator, X:pd.DataFrame, y:pd.Series, kf:KFold) -> Tuple[float, float]:
        """Cross-validated R2 and adjusted R2 of ``model`` on ``(X, y)``.

        A fresh clone of ``model`` is fitted per fold and predictions are
        clipped at zero before scoring. The adjusted value is computed by
        ``find_adj_score(n_test_samples, n_features, r2)``.

        Returns:
            ``(mean R2, mean adjusted R2)`` over the folds.
        """
        y_values           = y.to_numpy()
        scores, adj_scores = [], []

        for train_index, test_index in kf.split(X):
            model_ = clone(model)

            # Split the data into training and testing sets
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y_values[train_index], y_values[test_index]

            model_.fit(X_train, y_train)

            y_pred = np.maximum(model_.predict(X_test), 0.0)
            score  = r2_score(y_test, y_pred)

            scores.append(score)
            adj_scores.append(find_adj_score(len(y_pred), X_train.shape[1], score)) # N, P, R2 score

        return np.array(scores).mean(), np.array(adj_scores).mean()

    def fit(self, features:list) -> None:
        """Fit ``self.model`` in place on the given training columns."""
        self.model.fit(self.X_train[features], self.y_train)

    # NOTE: a commented-out backward-selection prototype was removed from here;
    # it was unreachable and called the scoring helpers with a different signature.

    def find_best_features_forward(self, kf:KFold, is_r2_score:bool) -> tuple:
        """Forward feature selection with a hyper-parameter grid search per candidate.

        In each round every not-yet-selected feature is appended to the current
        selection and a ``GridSearchCV`` over ``param_grids[self.model_name]`` is
        run with ``kf``. The candidate with the best grid-search score is kept
        when it improves on the best score so far (strictly greater for R2, less
        than or equal for percentage error); otherwise selection stops.

        Args:
            kf: cross-validation splitter handed to ``GridSearchCV``.
            is_r2_score: truthy to maximise R2, falsy to minimise ``per_error``.

        Returns:
            ``(best_score, selected_features, best_param)``. ``best_score`` is a
            scalar: the mean CV R2, or the mean CV percentage error.
            ``best_param`` is the grid-search optimum for the selected feature
            set, or ``None`` if no feature was accepted.
        """
        estimator    = clone(self.model)
        all_features = self.X_train.columns.values

        self.selected_features  = []
        self.all_feature_scores = []

        # Acceptance thresholds: R2 must exceed 0, percentage error must not exceed 100.
        best_score = 0.0 if is_r2_score else 100.0
        best_param = None

        scorer = make_scorer(r2_score , greater_is_better=True) if is_r2_score \
                 else make_scorer(per_error , y_LOD=self.y_LOD, greater_is_better=False)

        while len(self.selected_features) != len(all_features):
            # Per-round buffers: the index of the round winner must address the
            # parameters found in this round, not those of an earlier round.
            round_scores, round_features, round_params = [], [], []

            for feature in all_features:
                if feature in self.selected_features:
                    continue

                testing_feature = self.selected_features + [feature]

                # Select best parameters for the given feature combination
                grid_search = GridSearchCV(estimator=estimator, param_grid=param_grids[self.model_name],
                                           cv=kf, verbose=1, n_jobs=-1, scoring=scorer)
                grid_search.fit(self.X_train[testing_feature], self.y_train)

                # make_scorer(greater_is_better=False) negates the metric, so undo the
                # sign for the error; R2 keeps its sign so a negative R2 stays bad.
                score = grid_search.best_score_ if is_r2_score else -grid_search.best_score_

                round_scores.append(score)
                round_features.append(feature)
                round_params.append(grid_search.best_params_)

            best_ind         = int(np.argmax(round_scores)) if is_r2_score else int(np.argmin(round_scores))
            round_best_score = round_scores[best_ind]

            # Scores of every candidate set tried in this round, keyed by the feature list.
            round_record = {str(self.selected_features + [candidate]): score
                            for candidate, score in zip(round_features, round_scores)}

            if is_r2_score:
                improved = round_best_score > best_score
            else:
                improved = round_best_score <= best_score

            if not improved:
                break

            best_score = round_best_score
            best_param = round_params[best_ind]
            self.selected_features.append(round_features[best_ind])
            self.all_feature_scores.append(round_record)

        self.best_score = best_score
        return best_score, self.selected_features, best_param

    def find_best_features(self, kf:KFold, r2_score:bool) -> tuple:
        """Forward feature selection with the estimator's current parameters (no tuning).

        In each round every not-yet-selected feature is appended to the current
        selection and scored with ``calculate_r2_score`` or ``calculate_per_diff``.
        The best candidate is kept when it improves on the best score so far
        (strictly greater R2, or percentage error less than or equal); otherwise
        selection stops.

        Args:
            kf: cross-validation splitter.
            r2_score: truthy to maximise R2, falsy to minimise percentage error.
                (The name shadows ``sklearn.metrics.r2_score`` inside this method
                only; the function is not used here.)

        Returns:
            ``(best_score, selected_features, params)``. ``best_score`` is
            ``[R2, adjusted R2]`` in R2 mode and a scalar percentage error
            otherwise; ``params`` is ``get_params()`` of the cloned estimator.
        """
        model        = clone(self.model)
        all_features = self.X_train.columns.values

        self.selected_features  = []
        self.all_feature_scores = []

        best_score = [0, 0] if r2_score else 100.0

        while len(self.selected_features) != len(all_features):
            round_scores, round_features = [], []

            for feature in all_features:
                if feature in self.selected_features:
                    continue

                testing_feature = self.selected_features + [feature]

                if r2_score:
                    score = self.calculate_r2_score(model, self.X_train[testing_feature], self.y_train, kf)
                else:
                    score = self.calculate_per_diff(model, self.X_train[testing_feature], self.y_train, kf)

                round_scores.append(score)
                round_features.append(feature)

            if r2_score:
                # Rows are (R2, adjusted R2); candidates are ranked on plain R2.
                round_scores     = np.array(round_scores)
                best_ind         = np.argmax(round_scores[:, 0])
                round_best_score = round_scores[best_ind]
                improved         = round_best_score[0] > best_score[0]
            else:
                best_ind         = np.argmin(round_scores)
                round_best_score = np.min(round_scores)
                improved         = round_best_score <= best_score

            # Scores of every candidate set tried in this round, keyed by the feature list.
            round_record = {str(self.selected_features + [candidate]): score
                            for candidate, score in zip(round_features, round_scores)}

            if not improved:
                break

            best_score = round_best_score
            self.selected_features.append(round_features[best_ind])
            self.all_feature_scores.append(round_record)

        self.best_score = best_score
        return best_score, self.selected_features, model.get_params()


In [62]:
# Load the train/test split used by the cells below (project helper from src.load_dataset).
X_train, X_test, y_train, y_test = load_dataset()

######Data Distribution:#########
Training {0: 50, 16: 50, 8: 47}
Testing {0: 34, 8: 31, 16: 34}
#################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


In [63]:
# Model key: passed to ModelSelectionEnsemble (which calls select_model) and used
# to look up the hyper-parameter grid in param_grids during tuned selection.
model_name    = 'RF'

# Shuffled 5-fold cross-validation with a fixed seed, shared by both selection runs.
kf            = KFold(n_splits=5, shuffle=True, random_state=42)
# Forward selection with a grid search per candidate feature set, scored by percent error.
selectModel_tuning   = ModelSelectionEnsemble(model_name, X_train, y_train)
output_tuning = selectModel_tuning.find_best_features_forward(kf, is_r2_score=False)

# Forward selection without grid search (estimator keeps its current parameters),
# scored by percent error.
selectModel   = ModelSelectionEnsemble(model_name, X_train, y_train)
output        = selectModel.find_best_features(kf, r2_score=False)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 135 candidates, totalling 675 fits


Process LokyProcess-38:
Process LokyProcess-45:
Process LokyProcess-46:
Process LokyProcess-37:
Process LokyProcess-40:
Process LokyProcess-42:
Process LokyProcess-39:
Process LokyProcess-47:
Process LokyProcess-36:
Process LokyProcess-41:
Process LokyProcess-44:
Process LokyProcess-43:
Traceback (most recent call last):
  File "/Users/sangam/miniconda3/envs/vgramreg/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
  File "/Users/sangam/miniconda3/envs/vgramreg/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/Users/sangam/miniconda3/envs/vgramreg/lib/python3.9/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
  File "/Users/sangam/miniconda3/envs/vgramreg/lib/python3.9/site-packages/joblib/parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
  File "/Users/s

KeyboardInterrupt: 

In [55]:
# Display both selection results; each is a (best score, selected features, parameters) tuple.
output_tuning, output

((18.724020594198382,
  ['univariate, max(dS/dV) - min(dS/dV)',
   'univariate, area(S)',
   'univariate, V_max(dS/dV)',
   'univariate, V_max(S)',
   'vcenter',
   'univariate, area(dS/dV)',
   'univariate, std(S)',
   'univariate, max(S)',
   'univariate, mean(S)'],
  {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}),
 (18.724020594198382,
  ['univariate, max(dS/dV) - min(dS/dV)',
   'univariate, area(S)',
   'univariate, V_max(dS/dV)',
   'univariate, V_max(S)',
   'vcenter',
   'univariate, area(dS/dV)',
   'univariate, std(S)',
   'univariate, max(S)',
   'univariate, mean(S)'],
  {'C': 100,
   'cache_size': 200,
   'coef0': 0.0,
   'degree': 3,
   'epsilon': 0.1,
   'gamma': 0.01,
   'kernel': 'rbf',
   'max_iter': -1,
   'shrinking': True,
   'tol': 0.001,
   'verbose': False}))

In [56]:
# Evaluate the tuned selection (output_tuning) on the held-out test set.
train_five_fold_score, features, params = output_tuning

print(features)
# Fit a fresh clone of the tuned instance's estimator: calling set_params/fit on
# selectModel.model would write the tuned hyper-parameters into the untuned
# baseline object that the following cell evaluates.
tuned_model = clone(selectModel_tuning.model).set_params(**(params or {}))
tuned_model.fit(X_train[features], y_train)

y_pred = tuned_model.predict(X_test[features])

percent_error  = per_error(y_test, y_pred, y_LOD=selectModel_tuning.y_LOD)
r2             = r2_score(y_test, y_pred)

print("% error", percent_error)
print("R2 Score",r2)


['univariate, max(dS/dV) - min(dS/dV)', 'univariate, area(S)', 'univariate, V_max(dS/dV)', 'univariate, V_max(S)', 'vcenter', 'univariate, area(dS/dV)', 'univariate, std(S)', 'univariate, max(S)', 'univariate, mean(S)']
% error 24.33908303010931
R2 Score 0.8465498852094955


In [57]:


# Evaluate the selection obtained without grid search (output) on the held-out test set.
train_five_fold_score, features, params = output
print(features)
# NOTE(review): set_params is disabled below, so the model is fitted with whatever
# parameters selectModel.model currently holds -- confirm no earlier cell changed them.
# selectModel.model.set_params(**params)
selectModel.model.fit(X_train[features], y_train)

y_pred = selectModel.model.predict(X_test[features])

# Test-set metrics: percent error (scaled by the instance's y_LOD) and R2.
percent_error  = per_error(y_test, y_pred, y_LOD=selectModel.y_LOD)
r2             = r2_score(y_test, y_pred)

print("% error", percent_error)
print("R2 Score",r2)


['univariate, max(dS/dV) - min(dS/dV)', 'univariate, area(S)', 'univariate, V_max(dS/dV)', 'univariate, V_max(S)', 'vcenter', 'univariate, area(dS/dV)', 'univariate, std(S)', 'univariate, max(S)', 'univariate, mean(S)']
% error 24.33908303010931
R2 Score 0.8465498852094955
