# Imports and functions

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from numpy.random import RandomState
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn import feature_selection as fs
from sklearn import metrics as me
from sklearn.metrics import classification_report,auc,r2_score,matthews_corrcoef,roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2,f_classif,f_regression
from catboost import CatBoostRegressor,CatBoostClassifier
from catboost.utils import get_roc_curve
from catboost import Pool, cv
import shap
from scipy import stats
from scipy.optimize import curve_fit
import copy
from tabulate import tabulate
from tqdm import tqdm
from BorutaShap import BorutaShap
from powershap  import PowerShap
import shapicant

In [None]:
def scores_calc_print(Y,Y_pred,print_bool):
    """Compute regression scores (R², RMSE, MAE, ME) for a set of predictions.

    Parameters
    ----------
    Y : array-like
        Ground-truth target values.
    Y_pred : array-like
        Predicted values, aligned position-by-position with ``Y``.
    print_bool : bool
        When true the scores are printed as a grid table and nothing is
        returned; otherwise the scores are returned as a dictionary.

    Returns
    -------
    dict or None
        ``{"R2", "RMSE", "MAE", "ME"}`` when ``print_bool`` is false,
        ``None`` when the table is printed instead.
    """
    # Work on plain arrays so the error is computed positionally. The previous
    # element-wise loop used ``diff[i]``, which is a *label* lookup when ``Y``
    # is a pandas Series and fails on any non-default index.
    y_true = np.asarray(Y)
    y_hat = np.asarray(Y_pred)

    # R² needs at least two samples; -1 is kept as the "not available" sentinel.
    R2_total = me.r2_score(y_true,y_hat) if len(y_hat) > 1 else -1
    RMSE_total = sqrt(me.mean_squared_error(y_true,y_hat))
    MAE_total = me.mean_absolute_error(y_true,y_hat)
    # Mean signed error: positive values mean over-prediction on average.
    ME_total = np.mean(y_hat - y_true)

    if print_bool:
        print(tabulate([[RMSE_total, MAE_total,R2_total,ME_total]], ["RMSE","MAE","R²","ME"], tablefmt="grid"))
    else:
        return {"R2":R2_total,"RMSE":RMSE_total,"MAE":MAE_total,"ME":ME_total}


In [None]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score,classification_report,auc,r2_score,matthews_corrcoef

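## Grouped cross-validation: folds are built over the unique values of index_col, so all rows sharing an ID stay in the same fold (avoids leakage between train and test)
# Model = estimator with fit/predict/predict_proba, refitted on every fold
# Inp_db:pandas_DataFrame = input database (contains labels and features), including the ID column named by index_col
# index_col:str = name of the ID column used to group rows
# Features:List = features to use
# Folds:int = amount of folds
# RS:int = Random state
# target_col:str = name of the label column
# Output:tuple = (train scores, test scores), each a dictionary with the results of each fold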
def benchmark_classification_cross_validation(Model,Inp_db,index_col,folds,RS,features,target_col,disable_tqdm_output=False):
    """Run grouped K-fold cross-validation for a binary classifier.

    The unique values of ``index_col`` are split into folds, so every row
    sharing an ID lands on the same side of a split.

    Parameters
    ----------
    Model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refitted in place on every fold.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the feature columns and the target column.
    index_col : str
        Name of the ID column the folds are built on.
    folds : int
        Number of folds.
    RS : int
        Random state used to shuffle the IDs.
    features : list-like of str
        Feature columns passed to the model.
    target_col : str
        Name of the label column.
    disable_tqdm_output : bool, default False
        Hide the progress bar when true.

    Returns
    -------
    tuple of dict
        ``(scores_cv_train, scores_cv_test)``. Each maps "AUC", "MCC",
        "ACCURACY", "RECALL", "F1" and "PRECISION" to a NumPy array with one
        value per fold. RECALL, F1 and PRECISION are the 'weighted avg'
        entries of ``classification_report``.
    """
    kf = KFold(n_splits=folds,shuffle=True,random_state=RS)

    metric_names = ("AUC","MCC","ACCURACY","RECALL","F1","PRECISION")
    scores_cv_train = {name: [] for name in metric_names}
    scores_cv_test = {name: [] for name in metric_names}

    # The set of IDs does not change between folds, so compute it once.
    unique_ids = Inp_db[index_col].unique()

    for CV_train_idx, CV_test_idx in tqdm(kf.split(unique_ids),total=folds,disable=disable_tqdm_output):
        ## Split per ID (to avoid data leakage)
        X_CV_train = Inp_db[Inp_db[index_col].isin(unique_ids[CV_train_idx])]
        X_CV_test = Inp_db[Inp_db[index_col].isin(unique_ids[CV_test_idx])]

        Y_CV_train = X_CV_train[target_col].values
        Y_CV_test = X_CV_test[target_col].values

        ## Extract the required features
        X_CV_train_feat = X_CV_train[features]
        X_CV_test_feat = X_CV_test[features]

        # NOTE(review): the held-out fold doubles as eval_set, so a model using
        # early stopping / best-iteration selection is tuned on the data it is
        # scored on. Kept as in the original; confirm this is intended.
        try:
            Model.fit(X_CV_train_feat,Y_CV_train,eval_set=(X_CV_test_feat,Y_CV_test))
        except Exception:
            # Estimators whose fit() rejects eval_set fall back to a plain fit.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            Model.fit(X_CV_train_feat,Y_CV_train)

        # Predict once per split and reuse the result for every metric.
        Y_CV_predict_train = Model.predict(X_CV_train_feat)
        Y_CV_predict_test = Model.predict(X_CV_test_feat)
        y_cv_train_pred = Model.predict_proba(X_CV_train_feat)[:,1]
        y_cv_test_pred = Model.predict_proba(X_CV_test_feat)[:,1]

        train_results = classification_report(Y_CV_train,Y_CV_predict_train,output_dict=True)
        test_results = classification_report(Y_CV_test,Y_CV_predict_test,output_dict=True)

        for scores, y_true, y_label, y_score, report in (
            (scores_cv_train,Y_CV_train,Y_CV_predict_train,y_cv_train_pred,train_results),
            (scores_cv_test,Y_CV_test,Y_CV_predict_test,y_cv_test_pred,test_results),
        ):
            scores["AUC"].append(roc_auc_score(y_true,y_score))
            scores["MCC"].append(matthews_corrcoef(y_true,y_label))
            scores["ACCURACY"].append(report["accuracy"])
            scores["RECALL"].append(report['weighted avg']['recall'])
            scores["F1"].append(report['weighted avg']['f1-score'])
            scores["PRECISION"].append(report['weighted avg']['precision'])

    # Callers expect NumPy arrays (one entry per fold) under each metric key.
    scores_cv_train = {name: np.array(values) for name, values in scores_cv_train.items()}
    scores_cv_test = {name: np.array(values) for name, values in scores_cv_test.items()}

    return scores_cv_train,scores_cv_test

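## Grouped cross-validation for regression: folds are built over the unique values of index_col, so all rows sharing an ID stay in the same fold (avoids leakage between train and test)
# Model = estimator with fit/predict, refitted on every fold
# Inp_db:pandas_DataFrame = input database (contains labels and features), including the ID column named by index_col
# index_col:str = name of the ID column used to group rows
# Features:List = features to use
# Folds:int = amount of folds
# RS:int = Random state
# target_col:str = name of the target column
# Output:tuple = (train scores, test scores), each a dictionary with the results of each fold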
def benchmark_regression_cross_validation(Model,Inp_db,index_col,folds,RS,features,target_col,disable_tqdm_output=False):
    """Run grouped K-fold cross-validation for a regressor.

    The unique values of ``index_col`` are split into folds, so every row
    sharing an ID lands on the same side of a split.

    Parameters
    ----------
    Model : estimator
        Object exposing ``fit`` and ``predict``. It is refitted in place on
        every fold.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the feature columns and the target column.
    index_col : str
        Name of the ID column the folds are built on.
    folds : int
        Number of folds.
    RS : int
        Random state used to shuffle the IDs.
    features : list-like of str
        Feature columns passed to the model.
    target_col : str
        Name of the target column.
    disable_tqdm_output : bool, default False
        Hide the progress bar when true.

    Returns
    -------
    tuple of dict
        ``(scores_cv_train, scores_cv_test)``. Each maps "R2", "RMSE", "MAE"
        and "ME" to a NumPy array with one value per fold, as computed by
        ``scores_calc_print``.
    """
    kf = KFold(n_splits=folds,shuffle=True,random_state=RS)

    metric_names = ("R2","RMSE","MAE","ME")
    scores_cv_train = {name: [] for name in metric_names}
    scores_cv_test = {name: [] for name in metric_names}

    # The set of IDs does not change between folds, so compute it once.
    unique_ids = Inp_db[index_col].unique()

    for CV_train_idx, CV_test_idx in tqdm(kf.split(unique_ids),total=folds,disable=disable_tqdm_output):
        ## Split per ID (to avoid data leakage)
        X_CV_train = Inp_db[Inp_db[index_col].isin(unique_ids[CV_train_idx])]
        X_CV_test = Inp_db[Inp_db[index_col].isin(unique_ids[CV_test_idx])]

        Y_CV_train = X_CV_train[target_col].values
        Y_CV_test = X_CV_test[target_col].values

        ## Extract the required features
        X_CV_train_feat = X_CV_train[features]
        X_CV_test_feat = X_CV_test[features]

        # NOTE(review): the held-out fold doubles as eval_set, so a model using
        # early stopping / best-iteration selection is tuned on the data it is
        # scored on. Kept as in the original; confirm this is intended.
        try:
            Model.fit(X_CV_train_feat,Y_CV_train,eval_set=(X_CV_test_feat,Y_CV_test))
        except Exception:
            # Estimators whose fit() rejects eval_set fall back to a plain fit.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            Model.fit(X_CV_train_feat,Y_CV_train)

        # Predict once per split and reuse the result.
        Y_CV_predict_train = Model.predict(X_CV_train_feat)
        Y_CV_predict_test = Model.predict(X_CV_test_feat)

        train_results = scores_calc_print(Y_CV_train,Y_CV_predict_train,print_bool=False)
        test_results = scores_calc_print(Y_CV_test,Y_CV_predict_test,print_bool=False)

        for name in metric_names:
            scores_cv_train[name].append(train_results[name])
            scores_cv_test[name].append(test_results[name])

    # Callers expect NumPy arrays (one entry per fold) under each metric key.
    scores_cv_train = {name: np.array(values) for name, values in scores_cv_train.items()}
    scores_cv_test = {name: np.array(values) for name, values in scores_cv_test.items()}

    return scores_cv_train,scores_cv_test

In [None]:
from tqdm import tqdm
import plotly.graph_objects as go
import time
from sklearn.base import clone



def classification_forward_feature_selection(Model,Inp_db,index_col,folds,RS,features,target_col,metric):
    """Greedy forward feature selection driven by cross-validated test scores.

    Starting from an empty set, every pass evaluates each not-yet-selected
    feature added to the current selection with
    ``benchmark_classification_cross_validation`` and keeps the single
    addition that yields the highest mean test ``metric``. The search stops
    when no candidate beats the best score so far.

    Parameters
    ----------
    Model : estimator
        Template classifier; a fresh copy is fitted for every candidate.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the features and the target.
    index_col : str
        ID column used to group rows into folds.
    folds : int
        Number of cross-validation folds.
    RS : int
        Random state of the fold split.
    features : list-like of str
        Candidate feature names.
    target_col : str
        Name of the label column.
    metric : str
        One of "AUC", "MCC", "ACCURACY", "RECALL", "F1", "PRECISION"
        (higher is better).

    Returns
    -------
    tuple
        ``(selected_features, Metrics_best)``: the selected feature list and
        the mean test metrics of that selection.
    """
    start_time = time.time()

    # Baseline a candidate has to beat before anything is selected.
    Metrics_best = {"AUC":0.5,"MCC":0,"ACCURACY":0,"RECALL":0,"F1":0,"PRECISION":0}
    Metric_changed = True

    CV_features_all = features
    CV_features_current_best = []
    CV_features_arr_final = []

    #Forward feature selection method
    while Metric_changed:
        print(60*"=")
        print("Iteration to select the "+str(len(CV_features_arr_final)+1)+"th feature.")

        #To keep the feature selection going
        Metric_changed = False

        for cv_feature in tqdm(CV_features_all,ascii=True):
            if cv_feature in CV_features_arr_final:
                continue
            try:
                CV_features_current = CV_features_arr_final+[cv_feature]

                # CatBoost models offer .copy(); other estimators go through
                # sklearn's clone().
                try:
                    CB_model_cv = Model.copy()
                except Exception:
                    CB_model_cv = clone(Model)

                # Evaluate the copy (the original code built it but then
                # passed `Model`), leaving the caller's estimator untouched.
                train_sc,test_sc = benchmark_classification_cross_validation(CB_model_cv,Inp_db,
                                                                             index_col,folds,RS,
                                                                             CV_features_current,target_col,True)

                temp_mean_sc = {key: np.mean(test_sc[key]) for key in test_sc}

                if temp_mean_sc[metric]>Metrics_best[metric]:
                    Metrics_best.update(temp_mean_sc)
                    CV_features_current_best = CV_features_current
                    Metric_changed = True
            except Exception as e:
                # Best effort: a feature that cannot be evaluated is skipped.
                print(e)
                print("Skipping this feature")

        CV_features_arr_final = CV_features_current_best
        print("Current features: "+str(CV_features_arr_final))
        print("Status update: The current best metrics are: "+str(Metrics_best))

    print(60*"=")
    print("The best metrics are: "+str(Metrics_best)+" with the features: "+str(CV_features_arr_final))
    print("--- %s seconds ---" % (time.time() - start_time))
    return CV_features_arr_final,Metrics_best


def regression_forward_feature_selection(Model,Inp_db,index_col,folds,RS,features,target_col,metric):
    """Greedy forward feature selection for regression models.

    Starting from an empty set, every pass evaluates each not-yet-selected
    feature added to the current selection with
    ``benchmark_regression_cross_validation`` and keeps the single addition
    with the best mean test ``metric``. The search stops when no candidate
    improves on the best score so far.

    Parameters
    ----------
    Model : estimator
        Template regressor; a fresh copy is fitted for every candidate.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the features and the target.
    index_col : str
        ID column used to group rows into folds.
    folds : int
        Number of cross-validation folds.
    RS : int
        Random state of the fold split.
    features : list-like of str
        Candidate feature names.
    target_col : str
        Name of the target column.
    metric : str
        "R2" (higher is better), "RMSE" or "MAE" (lower is better), or "ME"
        (closer to zero is better).

    Returns
    -------
    tuple
        ``(selected_features, Metrics_best)``: the selected feature list and
        the mean test metrics of that selection.
    """
    start_time = time.time()

    # Deliberately bad starting values so that the first candidate wins.
    Metrics_best = {"R2":-10,"RMSE":9999,"MAE":9999,"ME":9999}
    Metric_changed = True

    CV_features_all = features
    CV_features_current_best = []
    CV_features_arr_final = []

    #Forward feature selection method
    while Metric_changed:
        print(60*"=")
        print("Iteration to select the "+str(len(CV_features_arr_final)+1)+"th feature.")

        #To keep the feature selection going
        Metric_changed = False

        for cv_feature in tqdm(CV_features_all,ascii=True):
            if cv_feature in CV_features_arr_final:
                continue
            try:
                CV_features_current = CV_features_arr_final+[cv_feature]

                # CatBoost models offer .copy(); other estimators go through
                # sklearn's clone().
                try:
                    CB_model_cv = Model.copy()
                except Exception:
                    CB_model_cv = clone(Model)

                # Evaluate the copy (the original code built it but then
                # passed `Model`), leaving the caller's estimator untouched.
                train_sc,test_sc = benchmark_regression_cross_validation(CB_model_cv,Inp_db,index_col,folds,RS,CV_features_current,target_col,True)

                temp_mean_sc = {key: np.mean(test_sc[key]) for key in test_sc}

                candidate = temp_mean_sc[metric]
                best = Metrics_best[metric]
                if metric == "R2":
                    improved = candidate > best
                elif metric == "ME":
                    # ME is a signed bias: the best value is 0, not the most
                    # negative one, so compare magnitudes.
                    improved = abs(candidate) < abs(best)
                else:
                    improved = candidate < best

                if improved:
                    Metrics_best.update(temp_mean_sc)
                    CV_features_current_best = CV_features_current
                    Metric_changed = True
            except Exception as e:
                # Best effort: a feature that cannot be evaluated is skipped.
                print(e)
                print("Skipping this feature")

        CV_features_arr_final = CV_features_current_best
        print("Current features: "+str(CV_features_arr_final))
        print("Status update: The current best metrics are: "+str(Metrics_best))

    print(60*"=")
    print("The best metrics are: "+str(Metrics_best)+" with the features: "+str(CV_features_arr_final))
    print("--- %s seconds ---" % (time.time() - start_time))
    return CV_features_arr_final,Metrics_best


In [None]:
from tqdm import tqdm
import plotly.graph_objects as go


def classification_backwards_feature_selection(Model,Inp_db,index_col,folds,RS,features,target_col,metric):
    """Greedy backward feature elimination driven by cross-validated test scores.

    The full feature set is scored first with
    ``benchmark_classification_cross_validation``. Every pass then evaluates
    the current selection with one feature left out and drops the feature
    whose removal gives the highest mean test ``metric``, as long as that
    beats the best score so far. The search stops when no removal helps or a
    single feature is left.

    Parameters
    ----------
    Model : estimator
        Template classifier; a fresh copy is fitted for every candidate.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the features and the target.
    index_col : str
        ID column used to group rows into folds.
    folds : int
        Number of cross-validation folds.
    RS : int
        Random state of the fold split.
    features : list-like of str
        Initial feature names.
    target_col : str
        Name of the label column.
    metric : str
        One of "AUC", "MCC", "ACCURACY", "RECALL", "F1", "PRECISION"
        (higher is better).

    Returns
    -------
    tuple
        ``(selected_features, Metrics_best)``: the remaining feature list and
        the mean test metrics of that selection.
    """
    start_time = time.time()
    Metrics_best = {"AUC":0.5,"MCC":0,"ACCURACY":0,"RECALL":0,"F1":0,"PRECISION":0}
    Metric_changed = True

    CV_features_all = list(features)
    CV_features_arr_final = list(features)
    # Start from the full set: if no removal ever improves the score the full
    # set must be returned (the original returned an empty list here).
    CV_features_current_best = list(features)

    print("Getting the best metrics")
    # Baseline: score of the complete feature set.
    train_sc,test_sc = benchmark_classification_cross_validation(Model,Inp_db,
                                         index_col,folds,RS,
                                         CV_features_all,target_col,True)
    for key in test_sc:
        Metrics_best[key]=np.mean(test_sc[key])
    print("Status update: The current best metrics are: "+str(Metrics_best))

    #Backwards feature selection method
    # Stop at one feature: a model cannot be fitted on an empty feature list.
    while Metric_changed and len(CV_features_arr_final) > 1:
        print(60*"=")
        removed_so_far = len(CV_features_all)-len(CV_features_arr_final)
        print("Iteration to delete the "+str(removed_so_far+1)+"th feature.")

        #To keep the feature selection going
        Metric_changed = False

        for cv_feature in tqdm(list(CV_features_arr_final),ascii=True):
            # Drop the candidate from the CURRENT selection so removals
            # accumulate (the original always restarted from the full list).
            CV_features_current = list(CV_features_arr_final)
            CV_features_current.remove(cv_feature)

            # CatBoost models offer .copy(); other estimators go through
            # sklearn's clone().
            try:
                CB_model_cv = Model.copy()
            except Exception:
                CB_model_cv = clone(Model)

            train_sc,test_sc = benchmark_classification_cross_validation(CB_model_cv,Inp_db,
                                                                         index_col,folds,RS,
                                                                         CV_features_current,target_col,True)

            temp_mean_sc = {key: np.mean(test_sc[key]) for key in test_sc}

            if temp_mean_sc[metric]>Metrics_best[metric]:
                Metrics_best.update(temp_mean_sc)
                CV_features_current_best = CV_features_current
                Metric_changed = True

        CV_features_arr_final = CV_features_current_best
        print("Current features: "+str(CV_features_arr_final))
        print("Status update: The current best metrics are: "+str(Metrics_best))

    print(60*"=")
    print("The best metrics are: "+str(Metrics_best)+" with the features: "+str(CV_features_arr_final))
    print("--- %s seconds ---" % (time.time() - start_time))
    return CV_features_arr_final,Metrics_best


def regression_backwards_feature_selection(Model,Inp_db,index_col,folds,RS,features,target_col,metric):
    """Greedy backward feature elimination for regression models.

    The full feature set is scored first with
    ``benchmark_regression_cross_validation``. Every pass then evaluates the
    current selection with one feature left out and drops the feature whose
    removal gives the best mean test ``metric``, as long as that improves on
    the best score so far. The search stops when no removal helps or a single
    feature is left.

    Parameters
    ----------
    Model : estimator
        Template regressor; a fresh copy is fitted for every candidate.
    Inp_db : pandas.DataFrame
        Data holding the ID column, the features and the target.
    index_col : str
        ID column used to group rows into folds.
    folds : int
        Number of cross-validation folds.
    RS : int
        Random state of the fold split.
    features : list-like of str
        Initial feature names.
    target_col : str
        Name of the target column.
    metric : str
        "R2" (higher is better), "RMSE" or "MAE" (lower is better), or "ME"
        (closer to zero is better).

    Returns
    -------
    tuple
        ``(selected_features, Metrics_best)``: the remaining feature list and
        the mean test metrics of that selection.
    """
    start_time = time.time()
    Metrics_best = {"R2":-10,"RMSE":9999,"MAE":9999,"ME":9999}
    Metric_changed = True

    CV_features_all = list(features)
    CV_features_arr_final = list(features)
    # Start from the full set: if no removal ever improves the score the full
    # set must be returned (the original returned an empty list here).
    CV_features_current_best = list(features)

    # Baseline: score of the complete feature set.
    print("Getting the best metrics")
    train_sc,test_sc = benchmark_regression_cross_validation(Model,Inp_db,
                                         index_col,folds,RS,
                                         CV_features_all,target_col,True)
    for key in test_sc:
        Metrics_best[key]=np.mean(test_sc[key])
    print("Status update: The current best metrics are: "+str(Metrics_best))

    #Backwards feature selection method
    # Stop at one feature: a model cannot be fitted on an empty feature list.
    while Metric_changed and len(CV_features_arr_final) > 1:
        print(60*"=")
        removed_so_far = len(CV_features_all)-len(CV_features_arr_final)
        print("Iteration to delete the "+str(removed_so_far+1)+"th feature.")

        #To keep the feature selection going
        Metric_changed = False

        for cv_feature in tqdm(list(CV_features_arr_final),ascii=True):
            # Drop the candidate from the CURRENT selection so removals
            # accumulate (the original always restarted from the full list).
            CV_features_current = list(CV_features_arr_final)
            CV_features_current.remove(cv_feature)

            # CatBoost models offer .copy(); other estimators go through
            # sklearn's clone().
            try:
                CB_model_cv = Model.copy()
            except Exception:
                CB_model_cv = clone(Model)

            train_sc,test_sc = benchmark_regression_cross_validation(CB_model_cv,Inp_db,
                                                                     index_col,folds,RS,
                                                                     CV_features_current,target_col,True)

            temp_mean_sc = {key: np.mean(test_sc[key]) for key in test_sc}

            # The original compared with ">" for every metric, which selects
            # the WORST candidate for error metrics.
            candidate = temp_mean_sc[metric]
            best = Metrics_best[metric]
            if metric == "R2":
                improved = candidate > best
            elif metric == "ME":
                # ME is a signed bias: the best value is 0.
                improved = abs(candidate) < abs(best)
            else:
                improved = candidate < best

            if improved:
                Metrics_best.update(temp_mean_sc)
                CV_features_current_best = CV_features_current
                Metric_changed = True

        CV_features_arr_final = CV_features_current_best
        print("Current features: "+str(CV_features_arr_final))
        print("Status update: The current best metrics are: "+str(Metrics_best))

    print(60*"=")
    print("The best metrics are: "+str(Metrics_best)+" with the features: "+str(CV_features_arr_final))
    print("--- %s seconds ---" % (time.time() - start_time))
    return CV_features_arr_final,Metrics_best


In [None]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score,classification_report,auc,r2_score,matthews_corrcoef

## Bootstrap evaluation of an already fitted classifier: the test set is resampled with replacement bootstrap_its times (66% of its size per draw) and the model is scored on every draw
# Model = fitted estimator with predict/predict_proba
# test_df:pandas_DataFrame = held-out test data (contains labels and features)
# bootstrap_its:int = amount of bootstrap iterations
# RS:int = Random state
# Features:List = features to use
# target_col:str = name of the label column
# Output:dict = dictionary with the results of each bootstrap iteration
def test_bootstrap_eval_class(Model,test_df,bootstrap_its,RS,features,target_col,disable_tqdm_output=False):
    """Score a fitted binary classifier on bootstrap resamples of a test set.

    Every iteration draws ``int(0.66*len(test_df))`` rows with replacement and
    computes the classification metrics on that draw.

    Parameters
    ----------
    Model : estimator
        Already fitted object exposing ``predict`` and ``predict_proba``.
    test_df : pandas.DataFrame
        Held-out data holding the feature columns and the target column.
    bootstrap_its : int
        Number of bootstrap draws.
    RS : int or None
        Base random state; draw ``i`` is seeded with ``RS + i``. ``None`` is
        treated as 0. (The argument used to be silently ignored.)
    features : list-like of str
        Feature columns passed to the model.
    target_col : str
        Name of the label column.
    disable_tqdm_output : bool, default False
        Hide the progress bar when true. (Used to be silently ignored.)

    Returns
    -------
    dict
        Maps "AUC", "MCC", "ACCURACY", "RECALL", "F1" and "PRECISION" to a
        NumPy array with one value per draw. RECALL, F1 and PRECISION are the
        'weighted avg' entries of ``classification_report``.
    """
    metric_names = ("AUC","MCC","ACCURACY","RECALL","F1","PRECISION")
    scores_cv_test = {name: [] for name in metric_names}

    seed_offset = 0 if RS is None else RS
    draw_size = int(0.66*len(test_df))

    for i in tqdm(range(bootstrap_its),disable=disable_tqdm_output):
        # A distinct but reproducible seed per draw.
        sampled_df = test_df.sample(n=draw_size,replace=True,random_state=seed_offset+i)

        Y_CV_test = sampled_df[target_col].values
        ## Extract the required features
        X_CV_test_feat = sampled_df[features].values

        Y_CV_predict_test = Model.predict(X_CV_test_feat)
        y_cv_test_pred = Model.predict_proba(X_CV_test_feat)[:,1]

        test_results = classification_report(Y_CV_test,Y_CV_predict_test,output_dict=True)

        scores_cv_test["AUC"].append(roc_auc_score(Y_CV_test,y_cv_test_pred))
        scores_cv_test["MCC"].append(matthews_corrcoef(Y_CV_test,Y_CV_predict_test))
        scores_cv_test["ACCURACY"].append(test_results["accuracy"])
        scores_cv_test["RECALL"].append(test_results['weighted avg']['recall'])
        scores_cv_test["F1"].append(test_results['weighted avg']['f1-score'])
        scores_cv_test["PRECISION"].append(test_results['weighted avg']['precision'])

    # Callers expect NumPy arrays (one entry per draw) under each metric key.
    return {name: np.array(values) for name, values in scores_cv_test.items()}

## Bootstrap evaluation of an already fitted regressor: the test set is resampled with replacement bootstrap_its times (66% of its size per draw) and the model is scored on every draw
# Model = fitted estimator with predict
# test_df:pandas_DataFrame = held-out test data (contains labels and features)
# bootstrap_its:int = amount of bootstrap iterations
# RS:int = Random state
# Features:List = features to use
# target_col:str = name of the target column
# Output:dict = dictionary with the results of each bootstrap iteration
def test_bootstrap_eval_regres(Model,test_df,bootstrap_its,RS,features,target_col,disable_tqdm_output=False):
    """Score a fitted regressor on bootstrap resamples of a test set.

    Every iteration draws ``int(0.66*len(test_df))`` rows with replacement and
    computes the regression metrics on that draw with ``scores_calc_print``.

    Parameters
    ----------
    Model : estimator
        Already fitted object exposing ``predict``.
    test_df : pandas.DataFrame
        Held-out data holding the feature columns and the target column.
    bootstrap_its : int
        Number of bootstrap draws.
    RS : int or None
        Base random state; draw ``i`` is seeded with ``RS + i``. ``None`` is
        treated as 0. (The argument used to be silently ignored.)
    features : list-like of str
        Feature columns passed to the model.
    target_col : str
        Name of the target column.
    disable_tqdm_output : bool, default False
        Hide the progress bar when true. (Used to be silently ignored.)

    Returns
    -------
    dict
        Maps "R2", "RMSE", "MAE" and "ME" to a NumPy array with one value per
        draw.
    """
    metric_names = ("R2","RMSE","MAE","ME")
    scores_cv_test = {name: [] for name in metric_names}

    seed_offset = 0 if RS is None else RS
    draw_size = int(0.66*len(test_df))

    for i in tqdm(range(bootstrap_its),disable=disable_tqdm_output):
        # Row-level resampling of the test set; a distinct but reproducible
        # seed per draw.
        sampled_df = test_df.sample(n=draw_size,replace=True,random_state=seed_offset+i)

        Y_CV_test = sampled_df[target_col].values
        ## Extract the required features
        X_CV_test_feat = sampled_df[features]

        # Predict once and reuse the result.
        Y_CV_predict_test = Model.predict(X_CV_test_feat)
        test_results = scores_calc_print(Y_CV_test,Y_CV_predict_test,print_bool=False)

        for name in metric_names:
            scores_cv_test[name].append(test_results[name])

    # Callers expect NumPy arrays (one entry per draw) under each metric key.
    return {name: np.array(values) for name, values in scores_cv_test.items()}

# MADELON dataset

### Read dataset

In [None]:
# Column names used by every MADELON cell below.
target_col = "Class"
Index_col = "index"

# Load MADELON; reset_index() exposes the row number as an explicit "index"
# column that serves as the per-row ID.
current_df = pd.read_csv("data/madelon.csv").reset_index()

# Recode class 0 as -1 (labels become -1 / 1).
current_df.loc[current_df[target_col] == 0, target_col] = -1

# 75 % / 25 % split on the row IDs.
train_idx, val_idx = train_test_split(
    current_df[Index_col],
    test_size=0.25,
    random_state=1,
)
current_db_train = current_df[current_df[Index_col].isin(train_idx)]
current_db_test = current_df[current_df[Index_col].isin(val_idx)]

In [None]:
# Notebook display cell: show the held-out test split for a visual sanity check.
current_db_test

### Powershap

In [None]:
# PowerShap feature selection on the MADELON training split, with a
# 250-tree CatBoost classifier as the underlying model.
selector = PowerShap(
    model=CatBoostClassifier(verbose=0, n_estimators=250, use_best_model=True),
    power_iterations=10,
    automatic=True,
    limit_automatic=10,
    verbose=True,
    target_col=target_col,
    index_col=Index_col,
)

# Fit on the feature columns only (ID and label columns removed).
selector.fit(
    current_db_train.drop(columns=[Index_col, target_col]),
    current_db_train[target_col],
)

# Per-feature results table of the fitted selector.
t = selector._processed_shaps_df
# Uncomment to persist the table; the cross-validation and bootstrap cells
# read this file when model == "powershap".
#t.reset_index().to_csv("results/madelon_PowerSHAP_catboost_results_automatic_mode.csv",index=False)


### Borutashap

In [None]:
# BorutaShap feature selection on the MADELON training split.
model = CatBoostClassifier(verbose=False, iterations=250)  # ,use_best_model=True)

# classification=True selects the classification mode
# (False would treat the task as regression).
Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True,
)

# columns[1:-1] skips the first and the last column
# (assumes the ID column is first and the label column is last — TODO confirm).
Feature_Selector.fit(
    X=current_db_train[list(current_db_train.columns.values[1:-1])],
    y=current_db_train[target_col],
    sample=False,
    train_or_test='test',
    normalize=True,
    verbose=True,
)
subset = Feature_Selector.Subset()

In [None]:
# Notebook display cell: list the columns of the subset returned by Feature_Selector.Subset().
subset.columns

### Shapicant

In [None]:
# Shapicant feature selection on the MADELON training split.
Inp_db = current_db_train.copy(deep=True)

# Inner 80 % / 20 % split of the training rows, done on the row IDs.
train_idx, val_idx = train_test_split(
    Inp_db[Index_col],
    test_size=0.2,
    random_state=0,
)

X_train = Inp_db.loc[Inp_db[Index_col].isin(train_idx)].copy(deep=True).drop(columns=[Index_col, target_col])
X_val = Inp_db.loc[Inp_db[Index_col].isin(val_idx)].copy(deep=True).drop(columns=[Index_col, target_col])
Y_train = Inp_db.loc[Inp_db[Index_col].isin(train_idx), target_col]

# CatBoost classifier with 250 iterations; best-iteration selection is off.
model = CatBoostClassifier(verbose=False, iterations=250, use_best_model=False)

# The explainer is passed as a class (not an instance): SHAP's TreeExplainer.
explainer_type = shap.TreeExplainer

# n_iter is not passed, so the library default applies
# (presumably 100 iterations — confirm against the shapicant documentation).
selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

# Run the feature selection.
# With a validation set the SHAP values are computed on it, otherwise on the training set.
# Extra arguments for the estimator's fit method can be passed through estimator_params.
selector.fit(X_train, Y_train, X_validation=X_val)  # , estimator_params={"categorical_feature": None})

# Selected feature names.
selected_features = selector.get_features()

# Per-feature p-values as a pandas Series.
p_values = selector.p_values_

# Notebook display: the selected features as an array.
np.array(selected_features)

### Forward Feature Selection

In [None]:
# Forward feature selection on the MADELON training split:
# 5 folds, random state 0, optimising the mean test AUC.
shap_model = CatBoostClassifier(verbose=False, iterations=250, use_best_model=True)
CV_features_arr_final, Metrics_best = classification_forward_feature_selection(
    Model=shap_model,
    Inp_db=current_db_train,
    index_col=Index_col,
    folds=5,
    RS=0,
    features=current_db_train.drop(columns=[target_col, Index_col]).columns.values,
    target_col=target_col,
    metric="AUC",
)

### 10 fold cross validation

In [None]:
# Name of the feature-selection method whose feature set is benchmarked.
model = "chi"

# Candidate feature columns: everything except the first and the last column
# (assumes the ID column is first and the label column is last — TODO confirm).
candidate_cols = current_db_train.columns.values[1:-1]

if model == "forward":
    selected_features = ['V339', 'V242', 'V379', 'V29', 'V456', 'V282', 'V494', 'V129'] #forward feature selection on AUC

elif model == "borutashap":
    selected_features = ['V434', 'V242', 'V379', 'V5', 'V476', 'V49', 'V282', 'V286', 'V339','V29'] #BorutaShap

elif model == "powershap":
    # Requires the results file written by the PowerShap cell.
    processed_shaps_df = pd.read_csv("results/madelon_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = list(processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values)

elif model == "shapicant":
    selected_features = ['V5', 'V29', 'V49', 'V65', 'V106', 'V129', 'V154', 'V198', 'V205',
           'V242', 'V249', 'V282', 'V283', 'V286', 'V305', 'V307', 'V319',
           'V337', 'V339', 'V379', 'V425', 'V434', 'V443', 'V452', 'V454',
           'V456', 'V472', 'V473', 'V476', 'V494']

elif model == "chi":
    #chi squared p value = 0.01
    chi2_p_values = chi2(current_db_train[candidate_cols],current_db_train[target_col])[1]
    selected_features = list(candidate_cols[chi2_p_values<0.01])

elif model == "f_test":
    #f_classif p value = 0.01
    f_test_p_values = f_classif(current_db_train[candidate_cols],current_db_train[target_col])[1]
    selected_features = list(candidate_cols[f_test_p_values<0.01])

elif model == "default":
    # All features. Taken from the active training frame (not `current_df`)
    # so the cell stays correct after another dataset has been loaded.
    selected_features = list(candidate_cols)

else:
    # Without this guard a typo silently reused `selected_features` left
    # over from a previously executed cell.
    raise ValueError("Unknown feature-selection method: "+str(model))

print(len(selected_features))

CB_model = CatBoostClassifier(verbose=False,iterations=250,random_seed=2,use_best_model=True)

scores_cv_train,scores_cv_test = benchmark_classification_cross_validation(
    Model=CB_model,
    Inp_db=current_db_train.copy(deep=True),
    index_col=Index_col,
    folds=10,
    RS=1,
    features=selected_features,
    target_col=target_col,
)

# Report mean (standard deviation) over the folds for every metric.
print(model)
print("TRAIN")
for key in scores_cv_train:
    print(f"{key}: {np.round(np.mean(scores_cv_train[key]),3)} ({np.round(np.std(scores_cv_train[key]),3)})")
print(50*"=")
print("TEST")
for key in scores_cv_test:
    print(f"{key}: {np.round(np.mean(scores_cv_test[key]),3)} ({np.round(np.std(scores_cv_test[key]),3)})")

### Bootstrap testing

In [None]:
# Name of the feature-selection method whose feature set is evaluated.
model = "powershap"

# Candidate feature columns: everything except the first and the last column
# (assumes the ID column is first and the label column is last — TODO confirm).
candidate_cols = current_db_train.columns.values[1:-1]

if model == "forward":
    selected_features = ['V339', 'V242', 'V379', 'V29', 'V456', 'V282', 'V494', 'V129'] #forward feature selection on AUC

elif model == "borutashap":
    selected_features = ['V434', 'V242', 'V379', 'V5', 'V476', 'V49', 'V282', 'V286', 'V339','V29'] #BorutaShap

elif model == "powershap":
    # Requires the results file written by the PowerShap cell.
    processed_shaps_df = pd.read_csv("results/madelon_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = list(processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values)

elif model == "shapicant":
    selected_features = ['V5', 'V29', 'V49', 'V65', 'V106', 'V129', 'V154', 'V198', 'V205',
           'V242', 'V249', 'V282', 'V283', 'V286', 'V305', 'V307', 'V319',
           'V337', 'V339', 'V379', 'V425', 'V434', 'V443', 'V452', 'V454',
           'V456', 'V472', 'V473', 'V476', 'V494']

elif model == "chi":
    #chi squared p value = 0.01
    chi2_p_values = chi2(current_db_train[candidate_cols],current_db_train[target_col])[1]
    selected_features = list(candidate_cols[chi2_p_values<0.01])

elif model == "f_test":
    #f_classif p value = 0.01
    f_test_p_values = f_classif(current_db_train[candidate_cols],current_db_train[target_col])[1]
    selected_features = list(candidate_cols[f_test_p_values<0.01])

elif model == "default":
    # All features. Taken from the active training frame (not `current_df`)
    # so the cell stays correct after another dataset has been loaded.
    selected_features = list(candidate_cols)

else:
    # Without this guard a typo silently reused `selected_features` left
    # over from a previously executed cell.
    raise ValueError("Unknown feature-selection method: "+str(model))

X_train = current_db_train[selected_features]
Y_train = current_db_train[target_col]

X_test = current_db_test[selected_features]
Y_test = current_db_test[target_col]

# Fit once on the full training split, then score on bootstrap draws of the
# held-out test split.
CB_model = CatBoostClassifier(verbose=False,iterations=250,random_seed=2)#,per_float_feature_quantization=['1:border_count=1024'])
CB_model.fit(X_train,Y_train)

scores_cv_test = test_bootstrap_eval_class(
    Model=CB_model,
    test_df=current_db_test.copy(deep=True),
    bootstrap_its=1000,
    RS=1,
    features=selected_features,
    target_col=target_col,
)

# Report mean (standard deviation) over the bootstrap draws for every metric.
print(model)
print("TEST")
for key in scores_cv_test:
    print(f"{key}: {np.round(np.mean(scores_cv_test[key]),3)} ({np.round(np.std(scores_cv_test[key]),3)})")

# GINA PRIOR dataset

### Read dataset

In [None]:
# Column names used by every GINA cell below.
target_col = "label"
Index_col = "index"

# Load GINA prior; reset_index() exposes the row number as an explicit
# "index" column that serves as the per-row ID.
gina_prior_df = pd.read_csv("data/gina_prior.csv").reset_index()

# Recode label -1 as 0 (labels become 0 / 1).
gina_prior_df.loc[gina_prior_df[target_col] == -1, target_col] = 0

# 75 % / 25 % split on the row IDs.
train_idx, val_idx = train_test_split(
    gina_prior_df[Index_col],
    test_size=0.25,
    random_state=1,
)
current_db_train = gina_prior_df[gina_prior_df[Index_col].isin(train_idx)]
current_db_test = gina_prior_df[gina_prior_df[Index_col].isin(val_idx)]

### Powershap

In [None]:
# PowerShap feature selection on the GINA training split, with a
# 250-tree CatBoost classifier as the underlying model.
selector = PowerShap(
    model=CatBoostClassifier(verbose=0, n_estimators=250, use_best_model=True),
    power_iterations=10,
    automatic=True,
    limit_automatic=10,
    verbose=True,
    target_col=target_col,
    index_col=Index_col,
)

# Fit on the feature columns only (ID and label columns removed).
selector.fit(
    current_db_train.drop(columns=[Index_col, target_col]),
    current_db_train[target_col],
)

# Per-feature results table of the fitted selector.
t = selector._processed_shaps_df
# Uncomment to persist the table for later evaluation cells.
#t.reset_index().to_csv("results/gina_prior_PowerSHAP_catboost_results_automatic_mode.csv",index=False)


### Borutashap

In [None]:
# BorutaShap wrapped around a 250-iteration CatBoost classifier.
model = CatBoostClassifier(verbose=False, iterations=250)  # , use_best_model=True)

Feature_Selector = BorutaShap(
    model=model,
    importance_measure="shap",
    classification=True,  # False would make this a regression problem
)

# Candidate features: every column between the first and the last one.
boruta_columns = list(current_db_train.columns.values[1:-1])
Feature_Selector.fit(
    X=current_db_train[boruta_columns],
    y=current_db_train[target_col],
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)
subset = Feature_Selector.Subset()

### Shapicant

In [None]:
# Work on a private copy of the training partition and split off 20% of its
# rows as a validation set. This rebinds train_idx / val_idx.
Inp_db = current_db_train.copy(deep=True)
train_idx, val_idx = train_test_split(Inp_db[Index_col], test_size=0.2, random_state=0)

in_train = Inp_db[Index_col].isin(train_idx)
in_val = Inp_db[Index_col].isin(val_idx)
non_feature_cols = [Index_col, target_col]

X_train = Inp_db[in_train].copy(deep=True).drop(columns=non_feature_cols)
X_val = Inp_db[in_val].copy(deep=True).drop(columns=non_feature_cols)
Y_train = Inp_db[in_train][target_col]

# Underlying estimator: CatBoost classifier, 250 iterations.
model = CatBoostClassifier(verbose=False, iterations=250, use_best_model=False)

# shapicant takes the explainer class itself, not an instance of it.
explainer_type = shap.TreeExplainer

selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

# A validation set is supplied, so SHAP values are computed on it rather
# than on the training rows.
selector.fit(X_train, Y_train, X_validation=X_val)  # , estimator_params={"categorical_feature": None})

# Selected feature names, plus the per-feature p-values as a pandas Series.
selected_features = selector.get_features()
p_values = selector.p_values_

# Last expression of the cell: shows the selection as the cell output.
np.array(selected_features)

### Forward feature selection

In [None]:
# Forward feature selection scored on "AUC" with a 250-iteration CatBoost
# classifier. Candidates are all columns except the target and row identifier.
# NOTE(review): the positional 5 and 0 are presumably the fold count and the
# random seed; confirm against classification_forward_feature_selection.
shap_model = CatBoostClassifier(verbose=False, iterations=250, use_best_model=True)
candidate_features = current_db_train.drop(columns=[target_col, Index_col]).columns.values
CV_features_arr_final, Metrics_best = classification_forward_feature_selection(
    shap_model,
    current_db_train,
    Index_col,
    5,
    0,
    candidate_features,
    target_col,
    "AUC",
)

### 10 fold cross validation

In [None]:
# Name of the feature-selection method to evaluate (not an estimator).
# Each branch below sets `selected_features` for that method: either a
# hard-coded list of column names, or a selection recomputed on the training
# partition.
model = "powershap"

if model == "forward":
    selected_features = ['pixel514', 'pixel324', 'pixel455', 'pixel240', 'pixel544',
               'pixel626', 'pixel460', 'pixel154', 'pixel211', 'pixel266',
               'pixel457', 'pixel436', 'pixel376', 'pixel383', 'pixel490', 'pixel540', 'pixel242', 'pixel636',
               'pixel550', 'pixel630', 'pixel301', 'pixel158', 'pixel627', 'pixel267', 'pixel458', 'pixel223'] #forward feature selection on AUC

elif model == "borutashap":
    selected_features = ['pixel543', 'pixel296', 'pixel157', 'pixel455', 'pixel515', 'pixel490', 'pixel352', 'pixel548', 'pixel488',
                         'pixel324', 'pixel211', 'pixel351', 'pixel239', 'pixel713', 'pixel269', 'pixel489', 'pixel516', 'pixel460',
                         'pixel212', 'pixel513', 'pixel463', 'pixel457', 'pixel514',
                         'pixel240', 'pixel241', 'pixel267', 'pixel573', 'pixel487', 'pixel486', 'pixel484', 'pixel268', 'pixel511',
                         'pixel485', 'pixel544', 'pixel456', 'pixel213', 'pixel323'] #BorutaSHAP

elif model == "powershap":
    # Read the saved PowerShap result table and keep features with p < 0.01.
    processed_shaps_df = pd.read_csv("results/gina_prior_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

elif model == "shapicant":
    #shapicant
    selected_features = ['pixel103', 'pixel104', 'pixel137', 'pixel152', 'pixel153',
       'pixel154', 'pixel157', 'pixel158', 'pixel184', 'pixel211',
       'pixel212', 'pixel213', 'pixel238', 'pixel239', 'pixel240',
       'pixel241', 'pixel242', 'pixel243', 'pixel249', 'pixel251',
       'pixel267', 'pixel268', 'pixel269', 'pixel295', 'pixel296',
       'pixel297', 'pixel319', 'pixel323', 'pixel324', 'pixel347',
       'pixel348', 'pixel351', 'pixel352', 'pixel358', 'pixel359',
       'pixel376', 'pixel377', 'pixel383', 'pixel387', 'pixel403',
       'pixel404', 'pixel405', 'pixel410', 'pixel414', 'pixel416',
       'pixel427', 'pixel428', 'pixel429', 'pixel432', 'pixel438',
       'pixel454', 'pixel455', 'pixel456', 'pixel458', 'pixel459',
       'pixel460', 'pixel461', 'pixel462', 'pixel463', 'pixel465',
       'pixel483', 'pixel484', 'pixel485', 'pixel487', 'pixel488',
       'pixel489', 'pixel490', 'pixel498', 'pixel500', 'pixel511',
       'pixel513', 'pixel514', 'pixel515', 'pixel516', 'pixel517',
       'pixel518', 'pixel528', 'pixel540', 'pixel541', 'pixel543',
       'pixel544', 'pixel545', 'pixel546', 'pixel548', 'pixel569',
       'pixel572', 'pixel573', 'pixel576', 'pixel579', 'pixel581',
       'pixel584', 'pixel585', 'pixel604', 'pixel607', 'pixel611',
       'pixel626', 'pixel627', 'pixel630', 'pixel635', 'pixel680',
       'pixel709', 'pixel713', 'pixel714', 'pixel716', 'pixel718',
       'pixel719']

elif model == "chi":
    # Chi-squared test on the training partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[1:-1]
    p_vals = chi2(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "f_test":
    # ANOVA F-test (f_classif) on the training partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[1:-1]
    p_vals = f_classif(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "default":
    # Baseline: no selection, use every candidate feature column.
    selected_features = list(current_db_train.columns.values[1:-1])

else:
    # Fail fast: without this guard an unrecognised name would silently reuse
    # whatever `selected_features` an earlier cell left in the kernel.
    raise ValueError(f"Unknown feature-selection method: {model!r}")

# Number of features the chosen method kept.
print(len(selected_features))

CB_model = CatBoostClassifier(
    verbose=False, iterations=250, random_seed=2, use_best_model=True
)

# 10-fold cross-validation on the training partition, restricted to the
# selected features.
scores_cv_train, scores_cv_test = benchmark_classification_cross_validation(
    Model=CB_model,
    Inp_db=current_db_train.copy(deep=True),
    index_col=Index_col,
    folds=10,
    RS=0,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "name: mean (std)", rounded to three decimals.
print(model)
print("TRAIN")
for key in scores_cv_train:
    fold_scores = scores_cv_train[key]
    print(f"{key}: {np.round(np.mean(fold_scores), 3)} ({np.round(np.std(fold_scores), 3)})")
print("=" * 50)
print("TEST")
for key in scores_cv_test:
    fold_scores = scores_cv_test[key]
    print(f"{key}: {np.round(np.mean(fold_scores), 3)} ({np.round(np.std(fold_scores), 3)})")

### bootstrap testing

In [None]:
# Name of the feature-selection method to evaluate (not an estimator).
# Each branch below sets `selected_features` for that method: either a
# hard-coded list of column names, or a selection recomputed on the training
# partition.
model = "powershap"

if model == "forward":
    selected_features = ['pixel514', 'pixel324', 'pixel455', 'pixel240', 'pixel544',
               'pixel626', 'pixel460', 'pixel154', 'pixel211', 'pixel266',
               'pixel457', 'pixel436', 'pixel376', 'pixel383', 'pixel490', 'pixel540', 'pixel242', 'pixel636',
               'pixel550', 'pixel630', 'pixel301', 'pixel158', 'pixel627', 'pixel267', 'pixel458', 'pixel223'] #forward feature selection on AUC

elif model == "borutashap":
    selected_features = ['pixel543', 'pixel296', 'pixel157', 'pixel455', 'pixel515', 'pixel490', 'pixel352', 'pixel548', 'pixel488',
                         'pixel324', 'pixel211', 'pixel351', 'pixel239', 'pixel713', 'pixel269', 'pixel489', 'pixel516', 'pixel460',
                         'pixel212', 'pixel513', 'pixel463', 'pixel457', 'pixel514',
                         'pixel240', 'pixel241', 'pixel267', 'pixel573', 'pixel487', 'pixel486', 'pixel484', 'pixel268', 'pixel511',
                         'pixel485', 'pixel544', 'pixel456', 'pixel213', 'pixel323'] #BorutaSHAP

elif model == "powershap":
    # Read the saved PowerShap result table and keep features with p < 0.01.
    processed_shaps_df = pd.read_csv("results/gina_prior_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

elif model == "shapicant":
    #shapicant
    selected_features = ['pixel103', 'pixel104', 'pixel137', 'pixel152', 'pixel153',
       'pixel154', 'pixel157', 'pixel158', 'pixel184', 'pixel211',
       'pixel212', 'pixel213', 'pixel238', 'pixel239', 'pixel240',
       'pixel241', 'pixel242', 'pixel243', 'pixel249', 'pixel251',
       'pixel267', 'pixel268', 'pixel269', 'pixel295', 'pixel296',
       'pixel297', 'pixel319', 'pixel323', 'pixel324', 'pixel347',
       'pixel348', 'pixel351', 'pixel352', 'pixel358', 'pixel359',
       'pixel376', 'pixel377', 'pixel383', 'pixel387', 'pixel403',
       'pixel404', 'pixel405', 'pixel410', 'pixel414', 'pixel416',
       'pixel427', 'pixel428', 'pixel429', 'pixel432', 'pixel438',
       'pixel454', 'pixel455', 'pixel456', 'pixel458', 'pixel459',
       'pixel460', 'pixel461', 'pixel462', 'pixel463', 'pixel465',
       'pixel483', 'pixel484', 'pixel485', 'pixel487', 'pixel488',
       'pixel489', 'pixel490', 'pixel498', 'pixel500', 'pixel511',
       'pixel513', 'pixel514', 'pixel515', 'pixel516', 'pixel517',
       'pixel518', 'pixel528', 'pixel540', 'pixel541', 'pixel543',
       'pixel544', 'pixel545', 'pixel546', 'pixel548', 'pixel569',
       'pixel572', 'pixel573', 'pixel576', 'pixel579', 'pixel581',
       'pixel584', 'pixel585', 'pixel604', 'pixel607', 'pixel611',
       'pixel626', 'pixel627', 'pixel630', 'pixel635', 'pixel680',
       'pixel709', 'pixel713', 'pixel714', 'pixel716', 'pixel718',
       'pixel719']

elif model == "chi":
    # Chi-squared test on the training partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[1:-1]
    p_vals = chi2(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "f_test":
    # ANOVA F-test (f_classif) on the training partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[1:-1]
    p_vals = f_classif(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "default":
    # Baseline: no selection, use every candidate feature column.
    selected_features = list(current_db_train.columns.values[1:-1])

else:
    # Fail fast: without this guard an unrecognised name would silently reuse
    # whatever `selected_features` an earlier cell left in the kernel.
    raise ValueError(f"Unknown feature-selection method: {model!r}")
    

# Targets and selected-feature matrices for the train / held-out partitions.
Y_train = current_db_train[target_col]
Y_test = current_db_test[target_col]
X_train = current_db_train[selected_features]
X_test = current_db_test[selected_features]

# Fit one CatBoost classifier on the full training partition.
# (Alternatives left disabled by the author: LogisticRegression(),
# RandomForestClassifier(), per_float_feature_quantization=['1:border_count=1024'].)
CB_model = CatBoostClassifier(verbose=False, iterations=250, random_seed=2)
CB_model.fit(X_train, Y_train)

# Bootstrap evaluation on the held-out partition (1000 iterations).
scores_cv_test = test_bootstrap_eval_class(
    Model=CB_model,
    test_df=current_db_test.copy(deep=True),
    bootstrap_its=1000,
    RS=1,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "name: mean (std)", rounded to three decimals.
print(model)
print("TEST")
for key in scores_cv_test:
    boot_scores = scores_cv_test[key]
    print(f"{key}: {np.round(np.mean(boot_scores), 3)} ({np.round(np.std(boot_scores), 3)})")


# Single Class SCENE dataset

### Read dataset

In [None]:
# Column names shared by every SCENE cell below.
Index_col = "index"
target_col = "Urban"

# Load the data; reset_index() exposes the row number as an "index" column.
current_db = pd.read_csv("data/scene.csv").reset_index()

# Store the target as 32-bit integers.
current_db[target_col] = current_db[target_col].astype(np.int32)

# Stratified 75/25 split on the target, fixed seed.
train_idx, val_idx = train_test_split(
    current_db[Index_col],
    test_size=0.25,
    random_state=1,
    stratify=current_db[target_col],
)
current_db_train = current_db[current_db[Index_col].isin(train_idx)]
current_db_test = current_db[current_db[Index_col].isin(val_idx)]


### Powershap

In [None]:
# Share of class-0 rows in the training partition. Class 0 is weighted by
# (1 - share) and class 1 by the share itself.
frac_class0 = len(current_db_train[current_db_train[target_col] == 0]) / len(current_db_train)

# PowerShap in automatic mode with stratified splitting.
selector = PowerShap(
    model=CatBoostClassifier(
        verbose=0,
        n_estimators=250,
        use_best_model=True,
        class_weights=[1 - frac_class0, frac_class0],
    ),
    power_iterations=10,
    automatic=True,
    limit_automatic=10,
    verbose=True,
    stratify=True,  # force_convergence = True,
)

# Features: every column except the first one and the last six.
scene_feature_cols = list(current_db_train.columns.values[1:-6])
selector.fit(current_db_train[scene_feature_cols], current_db_train[target_col])

# Per-feature result table; uncomment the export to persist it for the
# evaluation cells, which read it back from this path.
t = selector._processed_shaps_df
# t.reset_index().to_csv("results/scene_PowerSHAP_catboost_results_automatic_mode.csv", index=False)


### Borutashap

In [None]:
# BorutaShap wrapped around a class-weighted, 250-iteration CatBoost
# classifier. Class 0 is weighted by (1 - share of class-0 rows) and class 1
# by the share of class-0 rows in the training partition.
model = CatBoostClassifier(verbose=False,iterations=250,class_weights=[1-len(current_db_train[current_db_train[target_col] == 0])/len(current_db_train),len(current_db_train[current_db_train[target_col] == 0])/len(current_db_train)])

# if classification is False it is a Regression problem
Feature_Selector = BorutaShap(model=model,
                              importance_measure='shap',
                              classification=True)

# Features: every column except the first one and the last six.
# (Fixed: the original read `l/ist(...)`, which raises NameError.)
Feature_Selector.fit(X=current_db_train[list(current_db_train.columns.values[1:-6])], y=current_db_train[target_col], sample=False,
                        train_or_test = 'test', normalize=True,verbose=True)
subset = Feature_Selector.Subset()

### Shapicant

In [None]:
# Work on a private copy of the training partition and split off 20% of its
# rows as a validation set. This rebinds train_idx / val_idx.
Inp_db = current_db_train.copy(deep=True)
train_idx, val_idx = train_test_split(Inp_db[Index_col], test_size=0.2, random_state=0)

in_train = Inp_db[Index_col].isin(train_idx)
in_val = Inp_db[Index_col].isin(val_idx)

# Features: every column except the first one and the last six.
scene_feature_cols = list(current_db_train.columns.values[1:-6])

X_train = Inp_db[in_train].copy(deep=True)[scene_feature_cols]
X_val = Inp_db[in_val].copy(deep=True)[scene_feature_cols]
Y_train = Inp_db[in_train][target_col]

# Underlying estimator: class-weighted CatBoost classifier, 250 iterations.
# Class 0 gets (1 - share of class-0 rows), class 1 gets that share.
frac_class0 = len(current_db_train[current_db_train[target_col] == 0]) / len(current_db_train)
model = CatBoostClassifier(
    verbose=False,
    iterations=250,
    use_best_model=False,
    class_weights=[1 - frac_class0, frac_class0],
)

# shapicant takes the explainer class itself, not an instance of it.
explainer_type = shap.TreeExplainer

selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

# A validation set is supplied, so SHAP values are computed on it rather
# than on the training rows.
selector.fit(X_train, Y_train, X_validation=X_val)  # , estimator_params={"categorical_feature": None})

# Selected feature names, plus the per-feature p-values as a pandas Series.
selected_features = selector.get_features()
p_values = selector.p_values_

# Last expression of the cell: shows the selection as the cell output.
np.array(selected_features)

### Forward feature selection

In [None]:
# Forward feature selection scored on "AUC" with a class-weighted CatBoost
# classifier. Class 0 gets (1 - share of class-0 rows), class 1 gets that share.
frac_class0 = len(current_db_train[current_db_train[target_col] == 0]) / len(current_db_train)
shap_model = CatBoostClassifier(
    verbose=False,
    iterations=250,
    class_weights=[1 - frac_class0, frac_class0],
    use_best_model=True,
)

# Candidates: every column except the first one and the last six.
# NOTE(review): the positional 5 and 0 are presumably the fold count and the
# random seed; confirm against classification_forward_feature_selection.
CV_features_arr_final, Metrics_best = classification_forward_feature_selection(
    shap_model,
    current_db_train,
    Index_col,
    5,
    0,
    list(current_db_train.columns.values[1:-6]),
    target_col,
    "AUC",
)

### 10 fold cross validation

In [None]:
# For each feature-selection method in the list: pick its feature set, then
# run 10-fold cross-validation on the training partition and print the scores.
# `model` is the method name, not an estimator.
for model in ["powershap"]:#["forward","borutashap","powershap","shapicant","chi","f_test","default"]:

    if model == "forward":
        selected_features = ['Att240', 'Att200', 'Att88', 'Att46', 'Att214',
                   'Att53', 'Att118', 'Att80', 'Att225', 'Att22', 'Att32', 'Att191', 'Att58', 'Att65', 'Att245'] #forward feature selection on AUC

    elif model == "borutashap":
        selected_features = ['Att83', 'Att240', 'Att226', 'Att223', 'Att45', 'Att98', 'Att89', 'Att241', 'Att222', 'Att245', 'Att82', 'Att72', 'Att22', 'Att91'] #BorutaSHAP

    elif model == "powershap":
        # Read the saved PowerShap result table and keep features with p < 0.01.
        processed_shaps_df = pd.read_csv("results/scene_PowerSHAP_catboost_results_automatic_mode.csv")
        selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

    elif model == "shapicant":
        #shapicant
        selected_features = ['Att15', 'Att17', 'Att18', 'Att20', 'Att22', 'Att23', 'Att27',
           'Att40', 'Att44', 'Att45', 'Att47', 'Att48', 'Att49', 'Att53',
           'Att68', 'Att72', 'Att78', 'Att80', 'Att82', 'Att83', 'Att86',
           'Att87', 'Att89', 'Att91', 'Att98', 'Att103', 'Att106', 'Att108',
           'Att118', 'Att132', 'Att133', 'Att141', 'Att171', 'Att185',
           'Att195', 'Att200', 'Att201', 'Att204', 'Att205', 'Att207',
           'Att208', 'Att209', 'Att212', 'Att222', 'Att223', 'Att226',
           'Att228', 'Att229', 'Att237', 'Att239', 'Att240', 'Att241',
           'Att242', 'Att245', 'Att253', 'Att269']

    elif model == "chi":
        # Chi-squared test on the training partition; keep features with p < 0.01.
        candidate_cols = current_db_train.columns.values[1:-6]
        p_vals = chi2(current_db_train[candidate_cols], current_db_train[target_col])[1]
        selected_features = list(candidate_cols[p_vals < 0.01])

    elif model == "f_test":
        # ANOVA F-test (f_classif) on the training partition; keep features with p < 0.01.
        candidate_cols = current_db_train.columns.values[1:-6]
        p_vals = f_classif(current_db_train[candidate_cols], current_db_train[target_col])[1]
        selected_features = list(candidate_cols[p_vals < 0.01])

    elif model == "default":
        # Baseline: no selection, use every candidate feature column.
        selected_features = list(current_db_train.columns.values[1:-6])

    else:
        # Fail fast: without this guard an unrecognised name would silently
        # reuse the `selected_features` of a previous iteration or cell.
        raise ValueError(f"Unknown feature-selection method: {model!r}")

    print(len(selected_features))

    # Class 0 is weighted by (1 - share of class-0 rows), class 1 by that share.
    frac_class0 = len(current_db_train[current_db_train[target_col] == 0])/len(current_db_train)
    CB_model = CatBoostClassifier(verbose=False,iterations=250,random_seed=2,use_best_model=True,
                                  class_weights=[1-frac_class0,frac_class0])

    scores_cv_train,scores_cv_test = benchmark_classification_cross_validation(Model = CB_model,Inp_db = current_db_train.copy(deep=True),index_col=Index_col,folds=10,RS=0,features = selected_features,target_col = target_col)

    # Report every metric as "name: mean (std)", rounded to three decimals.
    print(model)
    print("TRAIN")
    for key in scores_cv_train:
        print(str(key)+": "+str(np.round(np.mean(scores_cv_train[key]),3))+" ("+str(np.round(np.std(scores_cv_train[key]),3))+")")
    print(50*"=")
    print("TEST")
    for key in scores_cv_test:
        print(str(key)+": "+str(np.round(np.mean(scores_cv_test[key]),3))+" ("+str(np.round(np.std(scores_cv_test[key]),3))+")")
    print(100*"-")

### bootstrap testing

In [None]:
# For each feature-selection method in the list: pick its feature set, fit one
# CatBoost classifier on the training partition and bootstrap-evaluate it on
# the held-out partition. `model` is the method name, not an estimator.
for model in ["powershap"]:#"forward","borutashap","powershap","shapicant","chi","f_test","default"]:

    if model == "forward":
        selected_features = ['Att240', 'Att200', 'Att88', 'Att46', 'Att214',
                   'Att53', 'Att118', 'Att80', 'Att225', 'Att22', 'Att32', 'Att191', 'Att58', 'Att65', 'Att245'] #forward feature selection on AUC

    elif model == "borutashap":
        selected_features = ['Att83', 'Att240', 'Att226', 'Att223', 'Att45', 'Att98', 'Att89', 'Att241', 'Att222', 'Att245', 'Att82', 'Att72', 'Att22', 'Att91'] #BorutaSHAP

    elif model == "powershap":
        # Read the saved PowerShap result table and keep features with p < 0.01.
        processed_shaps_df = pd.read_csv("results/scene_PowerSHAP_catboost_results_automatic_mode.csv")
        selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

    elif model == "shapicant":
        #shapicant
        selected_features = ['Att15', 'Att17', 'Att18', 'Att20', 'Att22', 'Att23', 'Att27',
           'Att40', 'Att44', 'Att45', 'Att47', 'Att48', 'Att49', 'Att53',
           'Att68', 'Att72', 'Att78', 'Att80', 'Att82', 'Att83', 'Att86',
           'Att87', 'Att89', 'Att91', 'Att98', 'Att103', 'Att106', 'Att108',
           'Att118', 'Att132', 'Att133', 'Att141', 'Att171', 'Att185',
           'Att195', 'Att200', 'Att201', 'Att204', 'Att205', 'Att207',
           'Att208', 'Att209', 'Att212', 'Att222', 'Att223', 'Att226',
           'Att228', 'Att229', 'Att237', 'Att239', 'Att240', 'Att241',
           'Att242', 'Att245', 'Att253', 'Att269']

    elif model == "chi":
        # Chi-squared test on the training partition; keep features with p < 0.01.
        candidate_cols = current_db_train.columns.values[1:-6]
        p_vals = chi2(current_db_train[candidate_cols], current_db_train[target_col])[1]
        selected_features = list(candidate_cols[p_vals < 0.01])

    elif model == "f_test":
        # ANOVA F-test (f_classif) on the training partition; keep features with p < 0.01.
        candidate_cols = current_db_train.columns.values[1:-6]
        p_vals = f_classif(current_db_train[candidate_cols], current_db_train[target_col])[1]
        selected_features = list(candidate_cols[p_vals < 0.01])

    elif model == "default":
        # Baseline: no selection, use every candidate feature column.
        selected_features = list(current_db_train.columns.values[1:-6])

    else:
        # Fail fast: without this guard an unrecognised name would silently
        # reuse the `selected_features` of a previous iteration or cell.
        raise ValueError(f"Unknown feature-selection method: {model!r}")

    print(len(selected_features))

    X_train = current_db_train[selected_features]
    Y_train = current_db_train[target_col]

    X_test = current_db_test[selected_features]
    Y_test = current_db_test[target_col]

    # Class 0 is weighted by (1 - share of class-0 rows), class 1 by that share.
    frac_class0 = len(current_db_train[current_db_train[target_col] == 0])/len(current_db_train)
    CB_model = CatBoostClassifier(verbose=False,iterations=250,random_seed=2,
                                  class_weights=[1-frac_class0,frac_class0])
    CB_model.fit(X_train,Y_train)

    # Bootstrap evaluation on the held-out partition (1000 iterations).
    scores_cv_test = test_bootstrap_eval_class(Model = CB_model,test_df = current_db_test.copy(deep=True),bootstrap_its=1000,RS=1,features = selected_features,target_col = target_col)

    # Report every metric as "name: mean (std)", rounded to three decimals.
    print(model)
    print("TEST")
    for key in scores_cv_test:
        print(str(key)+": "+str(np.round(np.mean(scores_cv_test[key]),3))+" ("+str(np.round(np.std(scores_cv_test[key]),3))+")")

    print(100*"-")

# CT Location Dataset

### Read csv

In [None]:
# Column names shared by every CT-location cell below.
Index_col = "patientId"
target_col = "reference"

# Load the data; reset_index() exposes the row number as an "index" column.
current_db = pd.read_csv("data/slice_localization_data.csv").reset_index()

# Split on the distinct patientId values, so all rows of one patient fall in
# the same partition (75% / 25% of the patients, fixed seed).
patient_ids = current_db[Index_col].unique()
train_idx, val_idx = train_test_split(patient_ids, test_size=0.25, random_state=1)
current_db_train = current_db[current_db[Index_col].isin(train_idx)]
current_db_test = current_db[current_db[Index_col].isin(val_idx)]

In [None]:
# Display the loaded CT-location dataframe as the cell output.
current_db

### powershap

In [None]:
# PowerShap in automatic mode, driven by a 250-tree CatBoost regressor.
selector = PowerShap(
    model=CatBoostRegressor(verbose=0, n_estimators=250, use_best_model=True),
    power_iterations=10,
    automatic=True,
    limit_automatic=10,
    verbose=True,
    target_col=target_col,
    index_col=Index_col,
)

# Features: every column except the first two and the last one.
ct_feature_cols = list(current_db_train.columns.values[2:-1])
selector.fit(current_db_train[ct_feature_cols], current_db_train[target_col])

# Per-feature result table; uncomment the export to persist it for the
# evaluation cells, which read it back from this path.
t = selector._processed_shaps_df
# t.reset_index().to_csv("results/CT_location_PowerSHAP_catboost_results_automatic_mode.csv", index=False)


### Borutashap

In [None]:
# BorutaShap wrapped around a 250-iteration CatBoost regressor.
model = CatBoostRegressor(verbose=False, iterations=250)

Feature_Selector = BorutaShap(
    model=model,
    importance_measure="shap",
    classification=False,  # False: this is a regression problem
)

# Features: every column except the first two and the last one.
boruta_columns = list(current_db_train.columns.values[2:-1])
Feature_Selector.fit(
    X=current_db_train[boruta_columns],
    y=current_db_train[target_col],
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)
subset = Feature_Selector.Subset()

### shapicant

In [None]:
# shapicant feature selection on the CT-location training partition.
Inp_db = current_db_train.copy(deep=True)

# Split on the DISTINCT patient ids. Index_col is "patientId", which repeats
# across rows; splitting the per-row column and then filtering with isin()
# puts nearly every patient in both train_idx and val_idx, so the training
# and validation sets overlap almost completely. This rebinds train_idx /
# val_idx from the read cell.
train_idx,val_idx = train_test_split(Inp_db[Index_col].unique(),test_size=0.2,random_state = 0)

# Features: every column except the first two and the last one.
X_train = Inp_db[Inp_db[Index_col].isin(train_idx)].copy(deep=True)[list(current_db_train.columns.values[2:-1])]
X_val = Inp_db[Inp_db[Index_col].isin(val_idx)].copy(deep=True)[list(current_db_train.columns.values[2:-1])]
Y_train = Inp_db[Inp_db[Index_col].isin(train_idx)][target_col]

# Underlying estimator: CatBoost regressor, 250 iterations.
model = CatBoostRegressor(verbose=False,iterations=250,use_best_model=False)

# This is the class (not its instance) of SHAP's TreeExplainer
explainer_type = shap.TreeExplainer

selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

# Run the feature selection
# If we provide a validation set, SHAP values are computed on it, otherwise they are computed on the training set
# We can also provide additional parameters to the underlying estimator's fit method through estimator_params
selector.fit(X_train, Y_train, X_validation=X_val)#, estimator_params={"categorical_feature": None})

# Just get the features list
selected_features = selector.get_features()

# We can also get the p-values as pandas Series
p_values = selector.p_values_

# Last expression of the cell: shows the selection as the cell output.
np.array(selected_features)

### forward feature selection

In [None]:
# Forward feature selection scored on "R2" with a 250-iteration CatBoost regressor.
shap_model = CatBoostRegressor(verbose=False,iterations=250,use_best_model=True)

# Candidates: every column except the first two and the last one, the same
# feature set the other CT-location cells use. Dropping only the target and
# patientId would leave the "index" column created by reset_index() (a plain
# row number) among the candidate features.
# NOTE(review): the positional 5 and 0 are presumably the fold count and the
# random seed; confirm against regression_forward_feature_selection.
CV_features_arr_final,Metrics_best = regression_forward_feature_selection(shap_model,current_db_train,Index_col,5,0,current_db_train.columns.values[2:-1],target_col,"R2")

### 10 fold cross validation

In [None]:
# Name of the feature-selection method to evaluate (not an estimator).
# Each branch below sets `selected_features` for that method: either a
# hard-coded list of column names, or a selection recomputed on the training
# partition. There is no "chi" branch for this dataset.
model = "powershap"

if model == "forward":
    selected_features = ['value237', 'value378', 'value114', 'value273', 'value172', 'value170',
               'value3', 'value116', 'value291', 'value18', 'value226', 'value238', 'value53', 'value142', 'value194', 'value370',
               'value299', 'value120', 'value35', 'value10', 'value264', 'value200', 'value316', 'value135', 'value13']

elif model == "borutashap":
    selected_features = ['value53', 'value25', 'value145', 'value116', 'value94', 'value190', 'value21', 'value23', 'value83',
                         'value210', 'value118', 'value273', 'value282', 'value122', 'value207', 'value2', 'value226', 'value242',
                         'value338', 'value52', 'value132', 'value320', 'value150', 'value140', 'value126', 'value146', 'value124',
                         'value236', 'value220', 'value131', 'value4', 'value105', 'value138', 'value248', 'value35', 'value258',
                         'value120', 'value100', 'value231', 'value265', 'value134', 'value378', 'value26', 'value215', 'value237',
                         'value90', 'value72', 'value117', 'value222', 'value223', 'value160', 'value181', 'value339', 'value110',
                         'value81', 'value307', 'value152', 'value201', 'value291', 'value33', 'value141', 'value211', 'value84',
                         'value6', 'value216', 'value252', 'value173', 'value112', 'value114', 'value234', 'value280', 'value61',
                         'value221', 'value171', 'value130', 'value241', 'value63', 'value95', 'value91', 'value13', 'value143',
                         'value218', 'value266', 'value133', 'value256', 'value306', 'value135', 'value123', 'value212', 'value281',
                         'value142', 'value151', 'value228', 'value232', 'value14', 'value96', 'value106', 'value235', 'value5',
                         'value113', 'value85', 'value300', 'value318', 'value213', 'value292', 'value7', 'value172', 'value191',
                         'value272', 'value224', 'value276', 'value125', 'value238', 'value264', 'value251', 'value275', 'value18',
                         'value182', 'value298', 'value362', 'value111', 'value230', 'value246', 'value30', 'value200', 'value101',
                         'value127', 'value136', 'value305', 'value283', 'value108', 'value8', 'value0', 'value183', 'value22',
                         'value314', 'value115', 'value377', 'value170', 'value60', 'value382', 'value227', 'value16', 'value104',
                         'value64', 'value299', 'value121', 'value3', 'value214', 'value308', 'value322', 'value174', 'value331',
                         'value54', 'value233', 'value274', 'value44', 'value370', 'value180', 'value225', 'value102', 'value192']

elif model == "powershap":
    # Read the saved PowerShap result table and keep features with p < 0.01.
    processed_shaps_df = pd.read_csv("results/CT_location_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

elif model == "shapicant":
    #shapicant
    selected_features = ['value0', 'value2', 'value3', 'value4', 'value5', 'value8',
       'value18', 'value28', 'value29', 'value38', 'value47', 'value53',
       'value55', 'value60', 'value63', 'value88', 'value106', 'value108',
       'value110', 'value114', 'value115', 'value116', 'value118',
       'value132', 'value135', 'value136', 'value137', 'value138',
       'value145', 'value150', 'value167', 'value170', 'value172',
       'value182', 'value183', 'value197', 'value200', 'value209',
       'value210', 'value212', 'value213', 'value215', 'value225',
       'value226', 'value227', 'value228', 'value230', 'value233',
       'value237', 'value238', 'value241', 'value251', 'value264',
       'value269', 'value270', 'value272', 'value273', 'value280',
       'value291', 'value295', 'value300', 'value310', 'value311',
       'value318', 'value319', 'value322', 'value334', 'value338',
       'value341', 'value367', 'value370', 'value377', 'value378',
       'value382']

elif model == "f_test":
    # Univariate F-test for regression (f_regression) on the training
    # partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[2:-1]
    p_vals = f_regression(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "default":
    # Baseline: no selection, use every candidate feature column.
    selected_features = list(current_db_train.columns.values[2:-1])

else:
    # Fail fast: without this guard an unrecognised name would silently reuse
    # whatever `selected_features` an earlier cell left in the kernel.
    raise ValueError(f"Unknown feature-selection method: {model!r}")

# Number of features the chosen method kept.
print(len(selected_features))

CB_model = CatBoostRegressor(
    verbose=False, iterations=250, random_seed=2, use_best_model=True
)

# 10-fold cross-validation on the training partition, restricted to the
# selected features.
scores_cv_train, scores_cv_test = benchmark_regression_cross_validation(
    Model=CB_model,
    Inp_db=current_db_train.copy(deep=True),
    index_col=Index_col,
    folds=10,
    RS=0,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "name: mean (std)", rounded to three decimals.
print(model)
print("TRAIN")
for key in scores_cv_train:
    fold_scores = scores_cv_train[key]
    print(f"{key}: {np.round(np.mean(fold_scores), 3)} ({np.round(np.std(fold_scores), 3)})")
print("=" * 50)
print("TEST")
for key in scores_cv_test:
    fold_scores = scores_cv_test[key]
    print(f"{key}: {np.round(np.mean(fold_scores), 3)} ({np.round(np.std(fold_scores), 3)})")

### bootstrap testing

In [None]:
# Name of the feature-selection method to evaluate (not an estimator).
# Each branch below sets `selected_features` for that method: either a
# hard-coded list of column names, or a selection recomputed on the training
# partition. There is no "chi" branch for this dataset.
model = "powershap"

if model == "forward":
    selected_features = ['value237', 'value378', 'value114', 'value273', 'value172', 'value170',
               'value3', 'value116', 'value291', 'value18', 'value226', 'value238', 'value53', 'value142', 'value194', 'value370',
               'value299', 'value120', 'value35', 'value10', 'value264', 'value200', 'value316', 'value135', 'value13']

elif model == "borutashap":
    # NOTE(review): this hard-coded list is not identical to the BorutaShap
    # list in the 10-fold cross-validation cell (e.g. 'value32' appears only
    # here); confirm which BorutaShap run each list came from.
    selected_features = ['value2', 'value171', 'value32', 'value141', 'value252', 'value192', 'value133', 'value8', 'value155',
                         'value265', 'value135', 'value223', 'value292', 'value290', 'value151', 'value112', 'value377', 'value233',
                         'value26', 'value220', 'value30', 'value140', 'value230', 'value248', 'value211', 'value85', 'value172',
                         'value221', 'value22', 'value145', 'value331', 'value251', 'value3', 'value222', 'value131', 'value13',
                         'value160', 'value370', 'value114', 'value228', 'value276', 'value16', 'value0', 'value111', 'value117',
                         'value280', 'value104', 'value154', 'value273', 'value134', 'value237', 'value224', 'value212', 'value5',
                         'value83', 'value116', 'value184', 'value120', 'value182', 'value136', 'value242', 'value235', 'value267',
                         'value190', 'value215', 'value339', 'value84', 'value371', 'value14', 'value241', 'value35', 'value214',
                         'value298', 'value61', 'value299', 'value275', 'value300', 'value110', 'value281', 'value291', 'value161',
                         'value274', 'value362', 'value201', 'value308', 'value91', 'value4', 'value53', 'value81', 'value34',
                         'value103', 'value183', 'value207', 'value174', 'value283', 'value226', 'value52', 'value122', 'value258',
                         'value146', 'value150', 'value127', 'value288', 'value92', 'value105', 'value232', 'value236', 'value101',
                         'value225', 'value6', 'value227', 'value170', 'value216', 'value118', 'value64', 'value191', 'value180',
                         'value7', 'value256', 'value113', 'value213', 'value259', 'value63', 'value132', 'value123', 'value312',
                         'value181', 'value138', 'value378', 'value200', 'value210', 'value125', 'value369', 'value106', 'value264',
                         'value90', 'value282', 'value307', 'value130', 'value219', 'value152', 'value126', 'value244',
                         'value173', 'value142', 'value143', 'value124', 'value60', 'value257', 'value234',
                         'value272', 'value115', 'value108', 'value266', 'value18', 'value44', 'value33', 'value314', 'value100', 'value42', 'value231', 'value260', 'value320']

elif model == "powershap":
    # Read the saved PowerShap result table and keep features with p < 0.01.
    processed_shaps_df = pd.read_csv("results/CT_location_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df[(processed_shaps_df.p_value<0.01)]["index"].values

elif model == "shapicant":
    #shapicant
    selected_features = ['value0', 'value2', 'value3', 'value4', 'value5', 'value8',
       'value18', 'value28', 'value29', 'value38', 'value47', 'value53',
       'value55', 'value60', 'value63', 'value88', 'value106', 'value108',
       'value110', 'value114', 'value115', 'value116', 'value118',
       'value132', 'value135', 'value136', 'value137', 'value138',
       'value145', 'value150', 'value167', 'value170', 'value172',
       'value182', 'value183', 'value197', 'value200', 'value209',
       'value210', 'value212', 'value213', 'value215', 'value225',
       'value226', 'value227', 'value228', 'value230', 'value233',
       'value237', 'value238', 'value241', 'value251', 'value264',
       'value269', 'value270', 'value272', 'value273', 'value280',
       'value291', 'value295', 'value300', 'value310', 'value311',
       'value318', 'value319', 'value322', 'value334', 'value338',
       'value341', 'value367', 'value370', 'value377', 'value378',
       'value382']

elif model == "f_test":
    # Univariate F-test for regression (f_regression) on the training
    # partition; keep features with p < 0.01.
    candidate_cols = current_db_train.columns.values[2:-1]
    p_vals = f_regression(current_db_train[candidate_cols], current_db_train[target_col])[1]
    selected_features = list(candidate_cols[p_vals < 0.01])

elif model == "default":
    # Baseline: no selection, use every candidate feature column.
    selected_features = list(current_db_train.columns.values[2:-1])

else:
    # Fail fast: without this guard an unrecognised name would silently reuse
    # whatever `selected_features` an earlier cell left in the kernel.
    raise ValueError(f"Unknown feature-selection method: {model!r}")

# Targets and selected-feature matrices for the train / held-out partitions.
Y_train = current_db_train[target_col]
Y_test = current_db_test[target_col]
X_train = current_db_train[selected_features]
X_test = current_db_test[selected_features]

# Fit one CatBoost regressor on the full training partition (verbose=100).
# (Option left disabled by the author:
# per_float_feature_quantization=['1:border_count=1024'].)
CB_model = CatBoostRegressor(verbose=100, iterations=250, random_seed=2)
CB_model.fit(X_train, Y_train)

# Bootstrap evaluation on the held-out partition (100 iterations).
scores_cv_test = test_bootstrap_eval_regres(
    Model=CB_model,
    test_df=current_db_test.copy(deep=True),
    bootstrap_its=100,
    RS=1,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "name: mean (std)", rounded to three decimals.
print(model)
print("TEST")
for key in scores_cv_test:
    boot_scores = scores_cv_test[key]
    print(f"{key}: {np.round(np.mean(boot_scores), 3)} ({np.round(np.std(boot_scores), 3)})")


# Appliances Energy Prediction Data Set

### Read dataset

In [None]:
# Column names shared by every Appliances cell below.
Index_col = "index"
target_col = "Appliances"

# Load the data; reset_index() exposes the row number as an "index" column.
current_db = pd.read_csv("data/energydata_complete.csv").reset_index()

# Hold out 25% of the rows (fixed seed) as the test partition.
train_idx, val_idx = train_test_split(
    current_db[Index_col], test_size=0.25, random_state=1
)
current_db_train = current_db[current_db[Index_col].isin(train_idx)]
current_db_test = current_db[current_db[Index_col].isin(val_idx)]

In [None]:
# Display the loaded Appliances dataframe as the cell output.
current_db

### powershap

In [None]:
# PowerShap in automatic mode, driven by a 250-tree CatBoost regressor.
selector = PowerShap(
    model=CatBoostRegressor(verbose=0, n_estimators=250, use_best_model=True),
    power_iterations=10,
    automatic=True,
    limit_automatic=10,
    verbose=True,
    target_col=target_col,
    index_col=Index_col,
)

# Features: every column from the fourth one onwards.
appliance_feature_cols = list(current_db_train.columns.values[3:])
selector.fit(current_db_train[appliance_feature_cols], current_db_train[target_col])

# Per-feature result table; uncomment the export to persist it.
t = selector._processed_shaps_df
# t.reset_index().to_csv("results/appliances_PowerSHAP_catboost_results_automatic_mode.csv", index=False)


### borutashap

In [None]:
# Base estimator handed to BorutaShap.
model = CatBoostRegressor(verbose=False, iterations=250)

# classification=False tells BorutaShap this is a regression problem.
Feature_Selector = BorutaShap(
    model=model,
    importance_measure="shap",
    classification=False,
)

# Run the selection on every column from position 3 onward.
Feature_Selector.fit(
    X=current_db_train[current_db_train.columns[3:].tolist()],
    y=current_db_train[target_col],
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)
subset = Feature_Selector.Subset()

### shapicant

In [None]:
# Work on a private copy of the training frame and carve an 80/20
# train/validation split out of it.
# NOTE: this rebinds train_idx / val_idx from the dataset-loading cell.
Inp_db = current_db_train.copy(deep=True)
train_idx, val_idx = train_test_split(Inp_db[Index_col], test_size=0.2, random_state=0)

# Candidate features are every column from position 3 onward.
candidate_features = list(current_db_train.columns.values[3:])
in_train = Inp_db[Index_col].isin(train_idx)
in_val = Inp_db[Index_col].isin(val_idx)

X_train = Inp_db.loc[in_train, candidate_features].copy(deep=True)
X_val = Inp_db.loc[in_val, candidate_features].copy(deep=True)
Y_train = Inp_db.loc[in_train, target_col]

# CatBoost regressor used as the underlying estimator.
model = CatBoostRegressor(verbose=False, iterations=250, use_best_model=False)

# shapicant expects the explainer *class*, not an instance.
explainer_type = shap.TreeExplainer

# n_iter is not passed, so the selector runs with the library's default.
selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

# Run the feature selection. Only a validation feature frame is supplied;
# per the original note, SHAP values are then computed on it rather than on
# the training set.
selector.fit(X_train, Y_train, X_validation=X_val)

# Names of the selected features, and the p-values exposed by the selector.
selected_features = selector.get_features()
p_values = selector.p_values_

# Bare expression: shows the selected features as the cell output.
np.array(selected_features)

### forward feature selection

In [None]:
# CatBoost regressor handed to the forward-selection helper.
shap_model = CatBoostRegressor(verbose=False, iterations=250, use_best_model=True)

# Arguments stay positional and in their original order. The meaning of the
# literals 5 and 0 is defined by regression_forward_feature_selection
# (presumably fold count and random seed -- TODO confirm against the helper).
CV_features_arr_final, Metrics_best = regression_forward_feature_selection(
    shap_model,
    current_db_train,
    Index_col,
    5,
    0,
    current_db_train.columns.values[3:],
    target_col,
    "R2",
)

### 10-fold cross-validation

In [None]:
# Name of the feature-selection method whose feature subset is evaluated here.
model = "powershap"

if model == "forward":
    selected_features = ['lights', 'T9', 'Press_mm_hg', 'T_out', 'RH_2', 'T4', 'T8', 'RH_8', 'RH_5', 'RH_4', 'T7', 'Tdewpoint', 'T6']

elif model == "borutashap":
    selected_features = ['RH_3', 'T5', 'RH_8', 'T4', 'Tdewpoint', 'T3', 'lights', 'RH_1',
                         'T6', 'T2', 'RH_2', 'RH_4', 'T9', 'RH_5', 'RH_7', 'T8', 'RH_9', 'T7', 'RH_6', 'T_out', 'T1', 'Press_mm_hg', 'RH_out', 'Windspeed']

elif model == "powershap":
    # Features with p_value < 0.01 in the saved PowerSHAP results table.
    # Converted to a plain list so every branch yields the same type.
    processed_shaps_df = pd.read_csv("results/appliances_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df.loc[processed_shaps_df.p_value < 0.01, "index"].tolist()

elif model == "shapicant":
    selected_features = ['lights', 'T1', 'RH_1', 'RH_2', 'T3', 'RH_3', 'RH_6', 'RH_8',
                         'Press_mm_hg', 'Windspeed']

elif model == "f_test":
    # Univariate F-test (f_regression): keep features with p-value < 0.01.
    candidate_features = current_db_train.columns.values[3:]
    _, f_test_p_values = f_regression(current_db_train[candidate_features], current_db_train[target_col])
    selected_features = list(candidate_features[np.where(f_test_p_values < 0.01)[0]])

elif model == "default":
    # Baseline: every column from position 3 onward.
    selected_features = list(current_db_train.columns.values[3:])

else:
    # Fail loudly: without this, a mistyped method name would silently reuse
    # whatever `selected_features` an earlier cell left behind.
    raise ValueError(f"Unknown feature-selection method: {model!r}")

print(len(selected_features))

CB_model = CatBoostRegressor(verbose=False, iterations=250, random_seed=2)

# 10-fold evaluation on a copy of the training frame, restricted to the
# selected features; the helper returns per-metric train and test scores.
scores_cv_train, scores_cv_test = benchmark_regression_cross_validation(
    Model=CB_model,
    Inp_db=current_db_train.copy(deep=True),
    index_col=Index_col,
    folds=10,
    RS=0,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "mean (std)", rounded to 3 decimals.
print(model)
print("TRAIN")
for key in scores_cv_train:
    print(f"{key}: {np.round(np.mean(scores_cv_train[key]), 3)} ({np.round(np.std(scores_cv_train[key]), 3)})")
print(50*"=")
print("TEST")
for key in scores_cv_test:
    print(f"{key}: {np.round(np.mean(scores_cv_test[key]), 3)} ({np.round(np.std(scores_cv_test[key]), 3)})")

### bootstrap testing

In [None]:
# Name of the feature-selection method whose feature subset is evaluated here.
model = "powershap"

if model == "forward":
    selected_features = ['lights', 'T9', 'Press_mm_hg', 'T_out', 'RH_2', 'T4', 'T8', 'RH_8', 'RH_5', 'RH_4', 'T7', 'Tdewpoint', 'T6']

elif model == "borutashap":
    selected_features = ['RH_3', 'T5', 'RH_8', 'T4', 'Tdewpoint', 'T3', 'lights', 'RH_1',
                         'T6', 'T2', 'RH_2', 'RH_4', 'T9', 'RH_5', 'RH_7', 'T8', 'RH_9', 'T7', 'RH_6', 'T_out', 'T1', 'Press_mm_hg', 'RH_out', 'Windspeed']

elif model == "powershap":
    # Features with p_value < 0.01 in the saved PowerSHAP results table.
    # Converted to a plain list so every branch yields the same type.
    processed_shaps_df = pd.read_csv("results/appliances_PowerSHAP_catboost_results_automatic_mode.csv")
    selected_features = processed_shaps_df.loc[processed_shaps_df.p_value < 0.01, "index"].tolist()

elif model == "shapicant":
    selected_features = ['lights', 'T1', 'RH_1', 'RH_2', 'T3', 'RH_3', 'RH_6', 'RH_8',
                         'Press_mm_hg', 'Windspeed']

elif model == "f_test":
    # Univariate F-test (f_regression): keep features with p-value < 0.01.
    candidate_features = current_db_train.columns.values[3:]
    _, f_test_p_values = f_regression(current_db_train[candidate_features], current_db_train[target_col])
    selected_features = list(candidate_features[np.where(f_test_p_values < 0.01)[0]])

elif model == "default":
    # Baseline: every column from position 3 onward.
    selected_features = list(current_db_train.columns.values[3:])

else:
    # Fail loudly: without this, a mistyped method name would silently reuse
    # whatever `selected_features` an earlier cell left behind.
    raise ValueError(f"Unknown feature-selection method: {model!r}")

print(len(selected_features))

# Restrict the train/test frames to the selected feature subset.
X_train = current_db_train[selected_features]
Y_train = current_db_train[target_col]

X_test = current_db_test[selected_features]
Y_test = current_db_test[target_col]

# Fit a fixed-seed CatBoost regressor (250 iterations) on the training split.
CB_model = CatBoostRegressor(verbose=False, iterations=250, random_seed=2)
CB_model.fit(X_train, Y_train)

# Score the fitted model on a copy of the test frame with
# test_bootstrap_eval_regres, requesting 1000 bootstrap iterations.
scores_cv_test = test_bootstrap_eval_regres(
    Model=CB_model,
    test_df=current_db_test.copy(deep=True),
    bootstrap_its=1000,
    RS=1,
    features=selected_features,
    target_col=target_col,
)

# Report every metric as "mean (std)", rounded to 3 decimals.
print(model)
print("TEST")
for key in scores_cv_test:
    print(f"{key}: {np.round(np.mean(scores_cv_test[key]), 3)} ({np.round(np.std(scores_cv_test[key]), 3)})")
