In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.gaussian_process.kernels import Matern, RBF

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from typing import Tuple

from src.load_models import select_model
from src.load_dataset import load_dataset
from src.config import models_features_r2, models_features_per, model_name_conversion
from src.graph_visualization import visualize_highest_score_feature_selection, feature_selection_tabularize, visualization_testing_dataset

from src.utils import find_adj_score, calculate_y_LOD, per_error

In [2]:
param_grids = {'SVM':{
                    'C': [1, 10, 15, 20, 100],
                    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01],
                    'kernel': ['rbf']},
              
              'RF': {
                    'n_estimators': [80, 90, 100, 110, 120],
                    'max_depth': [None, 5, 8, 10, 12, 15],
                    'min_samples_split': [6, 8, 10, 12, 14,16,20],
                    'min_samples_leaf': [2,3,4,5,6] 
                     },
              
              'KNN': {
                        'n_neighbors': [3, 5, 6, 7, 8, 9],
                        'weights': ['uniform', 'distance'],
                        'metric': ['euclidean', 'manhattan']
                    },
              
              'GP': {'kernel': [1.0 * RBF(length_scale=1.0), 
                                1.0 * RBF(length_scale=0.5), 
                                1.0 * RBF(length_scale=1.5), 
                                1.0 * RBF(length_scale=2.0),
                                1.0 * RBF(length_scale=2.5),
                                1.0 * Matern(length_scale=1.0, nu=1.5)],
                     'alpha': [0.001, 0.01, 0.1, 1, 1.5, 2, 2.5]},
               'Ridge': {'alpha': [0.0001, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.1, 1.0]},
               'Lasso': {'alpha': [0.00001, 0.00003, 0.00005, 0.0001, 0.0003, 0.0005, 0.001]}
              }

In [3]:
class ModelSelectionEnsemble():
    def __init__(self, model_name:str, X_train:pd.DataFrame, y_train:pd.Series):
        self.X_train, self.y_train = X_train, y_train
        self.model = select_model(model_name)
        self.selectd_features = []
    
        self.y_LOD = calculate_y_LOD(self.X_train, self.y_train) 

    def save(self, path:str) -> None:
        with open(path, 'wb') as f:
            pickle.dump(self.model, path) 

    def find_score(self, kf:KFold, features:list) -> np.ndarray:
        return np.array(self.calculate_r2_score(self.model, self.X_train[features], self.y_train, kf))
    
    def find_per_diff(self, kf:KFold, features:list) -> np.ndarray:
        return np.array(self.calculate_per_diff(self.model, self.X_train[features], self.y_train, kf))
    
    def calculate_per_diff(self, y_pred, y_test) -> np.ndarray:
        
        mask           = (y_test != 0)    # Non Zero Concentration
        zero_mask      = ~(mask)          # Zero Concentration

        y_pred         = np.maximum(y_pred, 0.0)

        # Only for non zero concentration
        non_zero_per_error = np.abs(y_test[mask] - y_pred[mask])/(0.5*(y_test[mask] + y_pred[mask]))
       
        # zero concentration
        zero_per_error     = np.abs(y_test[zero_mask] - y_pred[zero_mask]) / self.y_LOD

        assert not(np.isnan(zero_per_error).any())
        assert not(np.isnan(non_zero_per_error).any())

        per_error         = np.concatenate((non_zero_per_error, zero_per_error))
        per_error         = np.mean(per_error) * 100

        assert not(np.isnan(per_error)) # To check if any output is invalid or nan
        
        return per_error
    

    def calculate_r2_score(self, y_pred, y_test) -> np.ndarray:
        
        y_pred         = np.maximum(y_pred, 0.0)

        score          = r2_score(y_test, y_pred)
        adj_score      = find_adj_score(len(y_pred), X_train.shape[1], score) # N, P, R2 score

        return score, adj_score
    
    def fit(self, features:list) -> None:
        self.model.fit(self.X_train[features], self.y_train)

    def find_best_features_backward(self, data:tuple, is_r2_score:bool) -> list:

        X_train, X_test, y_train, y_test = data

        
        all_features            = self.X_train.columns.values
        self.selected_features  = all_features.copy().tolist()
        all_feature_scores      = []

        model = clone(self.model)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)

        best_score              = self.calculate_r2_score(y_pred, y_test) if is_r2_score else \
                                  self.calculate_per_diff(y_pred, y_test)

        best_param              = None
        
        flag                    = False

        
        scorer = make_scorer(r2_score , greater_is_better=True) if is_r2_score \
                 else make_scorer(per_error , y_LOD=self.y_LOD, greater_is_better=False)

        while len(self.selected_features) != 0:
            one_line_score    = []
            one_line_features = []
            best_parameters   = []
            
            for feature in all_features:
                if feature in self.selected_features: 
                    testing_feature = [i for i in self.selected_features if i != feature] # Remove the feature from the set
                    
                    estimator = clone(self.model)

                    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grids[model_name], cv=5, verbose=1, n_jobs=-1, scoring=scorer)
    
                    # Fit the grid search to the data
                    grid_search.fit(X_train[testing_feature], y_train)

                    model  = grid_search.best_estimator_
                    y_pred = model.predict(X_test[testing_feature])
                    
                    if is_r2_score:
                        score = self.calculate_r2_score(y_pred, y_test)
                    else:  
                        score = self.calculate_per_diff(y_pred, y_test)
                    
                    one_line_score.append(score)
                    one_line_features.append(feature)
                    best_parameters.append(grid_search.best_params_)
           
            one_line_score = np.array(one_line_score) if is_r2_score else one_line_score
            
            if is_r2_score==True:
                best_socre_ind      = np.argmax(one_line_score[:,1])
                one_line_best_score = one_line_score[best_socre_ind]
                

            else:
                best_socre_ind, one_line_best_score = np.argmin(one_line_score), np.min(one_line_score)

            sel_one_line_feature    = one_line_features[best_socre_ind] 

            temp = {}
            for key, score in zip(one_line_features, one_line_score):
                # key = [i for i in self.selected_features if i != key]
                temp[str(key)] = score

            if is_r2_score:
                if one_line_best_score[0] > best_score[0]:
                    best_score = one_line_best_score
                    best_param = best_parameters[best_socre_ind] 
                    self.selected_features.remove(sel_one_line_feature)
                    all_feature_scores.append(temp)
                    flag = False

                else: flag = True
                        
            else:
                if one_line_best_score <= best_score:
                    best_score = one_line_best_score
                    self.selected_features.remove(sel_one_line_feature)
                    best_param = best_parameters[best_socre_ind] 
                    all_feature_scores.append(temp)
                    flag = False

                else: flag = True

            if flag: break
        
        return best_score, self.selected_features, best_param

    def find_best_features_forward(self, data:tuple, is_r2_score:bool) -> list:
        X_train, X_test, y_train, y_test = data

        all_features            = self.X_train.columns.values
        self.selected_features  = []
        self.all_feature_scores = []
        best_parameters         = []

        best_score        = [0, 0] if is_r2_score else 100.0
        best_param        = None
        flag              = False

        scorer = make_scorer(r2_score , greater_is_better=True) if is_r2_score \
                 else make_scorer(per_error , y_LOD=self.y_LOD, greater_is_better=False)

        while len(self.selected_features) != len(all_features):
            one_line_score    = []
            one_line_features = []
            
            for feature in all_features:
                
                if feature not in self.selected_features:
                    testing_feature = self.selected_features + [feature]
                    
                    estimator = clone(self.model)

                    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grids[model_name], cv=5, verbose=1, n_jobs=-1, scoring=scorer)
    
                    # Fit the grid search to the data
                    grid_search.fit(X_train[testing_feature], y_train)

                    model  = grid_search.best_estimator_
                    
                    y_pred = model.predict(X_test[testing_feature])
                    
                    if is_r2_score:
                        score = self.calculate_r2_score(y_pred, y_test)
                    else:  
                        score = self.calculate_per_diff(y_pred, y_test)
                    
                    one_line_score.append(score)
                    one_line_features.append(feature)
                    best_parameters.append(grid_search.best_params_)
           
            one_line_score = np.array(one_line_score) if is_r2_score else one_line_score
            
            if is_r2_score==True:
                best_socre_ind      = np.argmax(one_line_score[:,0])
                one_line_best_score = one_line_score[best_socre_ind]

            else:
                best_socre_ind, one_line_best_score = np.argmin(one_line_score), np.min(one_line_score)

            sel_one_line_feature    = one_line_features[best_socre_ind] 

            temp = {}
            for key, score in zip(one_line_features, one_line_score):
                key = self.selected_features + [key]
                temp[str(key)] = score
                
            if is_r2_score:
                if one_line_best_score[0] > best_score[0]:
                    best_score = one_line_best_score
                    best_param = best_parameters[best_socre_ind]
                    self.selected_features.append(sel_one_line_feature)
                    self.all_feature_scores.append(temp)
                    flag = False

                else: flag = True
                        
            else:
                if one_line_best_score <= best_score:
                    best_score = one_line_best_score
                    best_param = best_parameters[best_socre_ind]
                    self.selected_features.append(sel_one_line_feature)
                    self.all_feature_scores.append(temp)
                    flag = False

                else: flag = True

            if flag: break
        
        return best_score, self.selected_features, best_param

    def find_K_fold_ensemble(self, X, y, kf, is_r2_score):
        for (train_index, test_index) in kf.split(X):
            
            # Split the data into training and testing sets
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.to_numpy()[train_index], y.to_numpy()[test_index]

            data = (X_train, X_test, y_train, y_test)
            
            best_score, selected_features, best_param = self.find_best_features_forward(data, is_r2_score)
            self.selectd_features.append((best_score, selected_features, best_param))

        return self.selectd_features


In [4]:
X_train, X_test, y_train, y_test = load_dataset()

######Data Distribution:#########
Training {0: 50, 16: 50, 8: 47}
Testing {0: 34, 8: 31, 16: 34}
#################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


In [6]:
model_name  = 'KNN'
kf          = KFold(n_splits=5, shuffle=True, random_state=42)
selectModel = ModelSelectionEnsemble(model_name, X_train, y_train)
output      = selectModel.find_K_fold_ensemble(X_train, y_train, kf, is_r2_score=False)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 

In [7]:
r2_score_all = []
for i in output:
    r2_score_all.append(i[0]) # Take only the R2 value
    
print(np.mean(r2_score_all))

7.1350615956946415


In [8]:
output

[(9.64167148045592,
  ['univariate, max(dS/dV) - min(dS/dV)',
   'univariate, min(dS/dV)',
   'univariate, std(S)',
   'univariate, mean(S)'],
  {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'}),
 (8.275613275613276,
  ['univariate, max(S)',
   'univariate, V_min(dS/dV)',
   'univariate, std(S)',
   'univariate, V_max(S)',
   'univariate, mean(S)'],
  {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'}),
 (6.781369882875605,
  ['univariate, max(dS/dV)',
   'univariate, V_min(dS/dV)',
   'univariate, V_max(dS/dV)',
   'univariate, max(S)',
   'univariate, V_max(S)',
   'univariate, std(S)'],
  {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}),
 (4.492103406702614,
  ['univariate, mean(S)', 'univariate, V_max(dS/dV)', 'univariate, std(S)'],
  {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}),
 (6.484549932825794,
  ['univariate, max(dS/dV) - min(dS/dV)',
   'univariate, max(S)',
   'univariate, area(S)',
   'univariate, V_max(S)',


In [9]:
test_score_per, test_score_r2 = [], []

for _, features, params in output:
    # print(params)
    
    selectModel.model.set_params(**params)
    selectModel.model.fit(X_train[features], y_train)
    y_pred = selectModel.model.predict(X_test[features])

    percent_error  = per_error(y_test, y_pred, y_LOD=selectModel.y_LOD)
    r2             = r2_score(y_test, y_pred)
    
    test_score_per.append(percent_error)
    test_score_r2.append(r2)
    
    print("% error", percent_error)
    print("R2 Score",r2)

    print("##########")

print("Avg % Error", np.mean(test_score_per), "R2 Score", np.mean(test_score_r2))

% error 16.589108590718897
R2 Score 0.8517156862745098
##########
% error 19.56925774225195
R2 Score 0.8533496732026145
##########
% error 16.044251548308605
R2 Score 0.8518518518518519
##########
% error 18.57780224795684
R2 Score 0.8670588235294118
##########
% error 17.203559629588113
R2 Score 0.8138569093106478
##########
Avg % Error 17.596795951764882 R2 Score 0.8475665888338071
