# BASE 0 - CA

Information:

* Database: base 0
* Predicted variable: Corrupción Amplia
* Predictor variables: Only SIAF and Canon
* Type of prediction: Classification
* Period of training: 2016-2020

## 1. Cargar librerías, módulos y datos

In [49]:
import warnings
# Suppress all warnings for the rest of the kernel session.
warnings.filterwarnings( 'ignore' )

In [50]:
import pandas as  pd
import numpy as np
import pickle
import joblib
from importlib.machinery import SourceFileLoader

In [51]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [52]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold

In [53]:
from econml.grf import RegressionForest
from lightgbm import LGBMClassifier

In [54]:
import importlib.util
import sys
from pathlib import Path

# Folder holding the project's helper modules, relative to this notebook.
# Built from path parts so the separator matches the operating system
# (the previous backslash literals only resolved on Windows).
modules_dir = Path( '..', '..', '..', 'code', 'modules' )


def load_local_module( name, path ):
    '''
    Import a Python source file as a module and return it.

    Input:
        - name: Module name; the module is also registered under this name in
                sys.modules so that a later `import <name>` finds it.
        - path: Location of the .py file (str or pathlib.Path).

    Output:
        - The executed module object.

    Raises:
        - ImportError if no import spec can be built for the file.
        - Whatever the module itself raises while executing (for example
          FileNotFoundError when the file does not exist).
    '''
    spec = importlib.util.spec_from_file_location( name, str( path ) )
    if spec is None or spec.loader is None:
        raise ImportError( f'Cannot load module { name } from { path }' )

    module = importlib.util.module_from_spec( spec )

    # Register before executing so the module is importable by name afterwards;
    # undo the registration if execution fails so no half-built module lingers.
    sys.modules[ name ] = module
    try:
        spec.loader.exec_module( module )
    except BaseException:
        sys.modules.pop( name, None )
        raise

    return module


fun = load_local_module( 'funciones', modules_dir / 'funciones.py' )
vn  = load_local_module( 'variables_nombres', modules_dir / 'variables_nombres.py' )

In [55]:
from sklearn.datasets import make_classification
import pandas as pd

# Synthetic binary-classification data: 100 rows, 20 features, fixed seed.
X, y = make_classification( n_samples = 100, n_features = 20, n_classes = 2, random_state = 42 )

# Wrap the arrays in DataFrames. Predictors are named feature_1 ... feature_N
# (N taken from the array itself) and the label column is called 'target'.
feature_names = [ f'feature_{i}' for i in range( 1, X.shape[ 1 ] + 1 ) ]
x_train = pd.DataFrame( X, columns = feature_names )
y_train = pd.DataFrame( y, columns = [ 'target' ] )

## 2. Realizar la partición en conjunto de entrenamiento y prueba

In [56]:
# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
# NOTE: x_train / y_train are rebound to the training portion here, so running
# this cell again without first re-running the data cell splits already-split data.
x_train, x_test, y_train, y_test = train_test_split( x_train,
                                                     y_train,
                                                     test_size    = 0.3,
                                                     random_state = 2023 )

## 3. Implementar métodos de muestreo

Se implementan los métodos SMOTE, SMOTE Tomek-Links y Naive Random Oversampling

In [57]:
# Build the resampled versions of the training set.
# NOTE(review): fun.resampling lives in funciones.py (not shown here); the mapping
# _s = SMOTE, _st = SMOTE Tomek-Links, _nro = Naive Random Oversampling is inferred
# from the section text and the variable names — confirm against the helper.
x_train_s, x_train_st, x_train_nro, y_train_s, y_train_st, y_train_nro = fun.resampling( x_train, y_train )

In [58]:
# Attach a label to each training frame; test_regression_forest reads `.name`
# to derive the suffix used in its output file names and result rows.
for frame, label in ( ( x_train,     'x_train_o'   ),
                      ( x_train_s,   'x_train_s'   ),
                      ( x_train_st,  'x_train_st'  ),
                      ( x_train_nro, 'x_train_nro' ) ):
    frame.name = label

In [59]:
# Summary table: one row per training set with its size and class balance.
colnames         = [ 'observaciones', 'variables', 'Nro. No', 'Nro. Si' ]

# Row labels, in the same order as `training_sets` below.
rownames         = [ 'Original',
                     'SMOTE',
                     'SMOTE Tomek',
                     'NRS' ]

training_sets    = [ ( x_train,     y_train     ),
                     ( x_train_s,   y_train_s   ),
                     ( x_train_st,  y_train_st  ),
                     ( x_train_nro, y_train_nro ) ]

check_data       = np.zeros( ( len( training_sets ), len( colnames ) ) )

for row, ( x_set, y_set ) in enumerate( training_sets ):
    class_counts = y_set.value_counts()

    # .get() reports 0 instead of raising KeyError when a class is absent.
    check_data[ row ] = [ x_set.shape[ 0 ], x_set.shape[ 1 ],
                          class_counts.get( 0, 0 ), class_counts.get( 1, 0 ) ]

table_check_data = pd.DataFrame( check_data, columns = colnames, index = rownames )
table_check_data

Unnamed: 0,observaciones,variables,Nro. No,Nro. Si
Original,70.0,20.0,35.0,35.0
SMOTE,70.0,20.0,35.0,35.0
SOMTE Tomek,66.0,20.0,33.0,33.0
NRS,70.0,20.0,35.0,35.0


## 4. Implementar los modelos de clasificación

In [60]:
# 20%, 30% and 40% of the number of predictor columns.
n_predictors = x_train.shape[ 1 ]
o_20, o_30, o_40 = ( pct * n_predictors / 100 for pct in ( 20, 30, 40 ) )

print( o_20, o_30, o_40, sep = '\n' )

4.0
6.0
8.0


In [61]:
# models = {

#     'Logistic Regression'      : { 'model'      : LogisticRegression( random_state = 2023, n_jobs = -1 ) },

#     'Logistic Lasso'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l1', solver = 'saga', n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },

#     'Logistic Ridge'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l2', solver = 'saga', n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },

#     'Logistic Elastic Net'     : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'elasticnet', solver = 'saga', l1_ratios = [ 0.5 ], n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },

#     'Random Forest Classifier' : { 'model'      : RandomForestClassifier( random_state = 2023, n_jobs = -1 ),
#                                    'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 10, 20, 30 ], 'max_features': [ 2864, 4295, 5727 ] } },

#     'XGboost Classifier'       : { 'model'      : XGBClassifier( random_state = 2023, use_label_encoder = False, objective = 'binary:logistic', verbosity = 0, learning_rate = 0.1, n_jobs = -1 ),
#                                    'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ], 'max_features': [ 2864, 4295, 5727 ] } },


#     'LGBMClassifier'           : { 'model'      : LGBMClassifier( random_state = 2023, n_jobs = -1 ),
#                                    'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ] } },


#         }

# Model registry passed to test_regression_forest: each entry maps a display name
# to the estimator ('model') and the grid-search space to explore ('grid_params').
# NOTE(review): n_estimators = 252 is presumably chosen to be a multiple of the
# forest's sub-forest size — confirm against the econml RegressionForest docs.
models_regression_forest = {
    'Regression Forest' : { 'model'      : RegressionForest( random_state = 2023, n_jobs = -1 ),
                            'grid_params': { 'n_estimators': [ 252 ], 'max_depth': [ 10, 15 ] } }
        }

In [62]:
from pathlib import Path

# Root folder for this execution's outputs, relative to the notebook. Built from
# path parts so the separator matches the operating system (the previous
# backslash literals only resolved on Windows), and defined once so the
# execution folder is changed in a single place.
output_dir   = Path( '..', '..', '..', 'output', 'ejecucion_11' )

# Kept as plain strings because downstream code formats them into file names.
models_path  = str( output_dir / 'models' )
results_path = str( output_dir / 'results' )
vars_path    = str( output_dir / 'vars' )
gs_path      = str( output_dir / 'gridsearch_results' )

In [63]:
# Training sets in the order Original, SMOTE, SMOTE Tomek-Links, Naive Random
# Oversampling. The two lists are zipped together downstream, so they must stay
# aligned position by position.
x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]
# Output folders in the order test_regression_forest expects them:
# [0] trained models, [1] performance metrics, [2] variable lists, [3] grid-search results.
path_list    = [ models_path, results_path, vars_path, gs_path ]

In [64]:
# Tag embedded in the file names written by test_regression_forest.
# Presumably 'ca' stands for the predicted variable "Corrupción Amplia" — confirm.
sufix   = 'ca'

In [65]:
import pandas as pd
import numpy as np
from glob import glob
import os
from importlib.machinery import SourceFileLoader
import xlsxwriter

# import variables_nombres as vn
# vn  = SourceFileLoader( 'variables_nombres', 'variables_nombres.py' ).load_module()

import variables_nombres as vn


from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef, classification_report
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold

import pickle
import joblib
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter


In [66]:
def extract_suffix(name):
    '''
    Return the suffix encoded in a training-set name.

    The name is cut at its first two underscores; whatever follows the second
    one is the suffix (inner underscores are kept). A name with fewer than two
    underscores has no suffix and yields 'original'.

    Example: 'x_train_st' -> 'st'
    '''
    pieces = name.split('_', 2)
    return pieces[2] if len(pieces) == 3 else 'original'

# def test_regression_forest(models, x_train_list, y_train_list, x_test, y_test, path_list, sufix):

#     '''
#     Objetivo:

#         - Implementar el modelo Regression Forest adaptado para una clasificación binaria

#     Input:

#         - models      : Diccionario que especifica el modelo de Regressión Forest y los parámetros de grid search.
#         - x_train_list: Lista de conjuntos de entrenamiento con las variables predictoras. La lista debe
#                         seguir el siguiente orden: Original, SMOTE, SMOTE Tomek-Links y Naive Random 
#                         Oversampling. Ejemplo: x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
#         - y_train_list: Lista de conjuntos de entrenamiento con la variable predicha. La lista debe
#                         seguir el siguiente orden: Original, SMOTE, SMOTE Tomek-Links y Naive Random 
#                         Oversampling. Ejemplo: y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]
#         - x_test      : Conjunto de prueba con las variables predictoras
#         - y_test      : Cnjunto de prueba con la variable predicha
#         - path_list   : Lista de paths donde se guardarán los archivo output. Se asume que la lista de paths
#                         tendrá cuatro elementos, los cuales son: path en el que se guardan los modelos preentrenados
#                         (primero), path en el que se guardan las métricas de desempeño (segundo), path en el que se
#                         guardan las  listas de variables con importancia/coeficientes (tercero) y path en el que se 
#                         guardan los resultados de grid search (cuarto). Se asume que se sigue el orden mencionado en 
#                         paréntesis.

#     Output:

#         - resultados               : Pandas dataframe con las métricas de los distintos modelos implementados.
#         - modelos entrenados       : todos los modelos entrenados se guardan en formato joblib en el path 
#                                      especificado
#         - lista de variables       : Listas de variables que muestra la importancia (en el caso de los métodos 
#                                      basados en árboles) o coeficientes (en el caso de los métodos lineares) de 
#                                      las variables predictoras. Se muestran en formato de tabla. 
#         - resultados de grid search: Detalles sobre el ajuste del algoritmo de Grid Search. Se muestran en 
#                                      formato de tabla.
                                     
#     '''

#     threshold_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

#     for path in path_list:
#         if not os.path.exists(path):
#             os.makedirs(path)
    
#     # Inicializar el ExcelWriter fuera del loop de thresholds
#     excel_path = f'{path_list[1]}/results_{sufix}_reg_forest_all_thresholds.xlsx'
#     with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
#         for threshold in threshold_range:
#             results = {
                
#                 'Model'             : [],
                
#                 'accuracy_train'    : [],
#                 'log_loss_train'    : [],
#                 'roc_auc_train'     : [],
#                 'f1_train'          : [],
#                 'f1_train_si'       : [],
#                 'f1_train_no'       : [],        
#                 'MCC_train'         : [],               
                
#                 'accuracy_test'     : [],
#                 'log_loss_test'     : [],
#                 'roc_auc_test'      : [],
#                 'f1_test'           : [],
#                 'f1_test_si'        : [],
#                 'f1_test_no'        : [],        
#                 'MCC_test'          : [],
                
#                 'Grid_Search_Params': []
                
#             }

#             for model_name, model_params in models.items():
#                 if 'model' not in model_params:
#                     raise ValueError(f'Model is not defined for {model_name}')

#                 model = model_params['model']
#                 grid_params = model_params.get('grid_params', None)

#                 for index, (x_train, y_train) in enumerate(zip(x_train_list, y_train_list)):
#                     train_suffix = extract_suffix( x_train.name ) 
#                     pred_vars      = x_train.columns.to_list()

#                     cv = KFold(n_splits=5, shuffle=True, random_state=2023)
#                     grid_search = GridSearchCV(model, grid_params, cv=cv, scoring='roc_auc')
#                     grid_search.fit(x_train, y_train)
#                     best_model = grid_search.best_estimator_
#                     best_params = grid_search.best_params_

#                     # Calcular predicciones y métricas para el threshold actual
#                     y_pred_train_class = (best_model.predict(x_train) >= threshold).astype(int)
#                     y_pred_test_class = (best_model.predict(x_test) >= threshold).astype(int)
#                     y_pred_train_proba = best_model.predict(x_train)
#                     y_pred_test_proba = best_model.predict(x_test)

#                     # Export models
#                     joblib.dump( best_model, f'{ path_list[ 0 ] }/model_{ sufix }_{ model_name }_{ train_suffix }.joblib' )

#                     # Export features importance
#                     feature_importances = best_model.feature_importances_
#                     vars_df             = pd.DataFrame( {'Var': pred_vars, 'Importance Score': feature_importances } )
#                     vars_df             = vars_df.reindex( vars_df[ 'Importance Score' ].abs().sort_values( ascending = False ).index )
#                     vars_df.to_excel( f'{ path_list[ 2 ] }/varlist_{ sufix }_{ model_name }_{ train_suffix }.xlsx' )

#                     # Agregar resultados al diccionario
#                     report_train = classification_report(y_train, y_pred_train_class, output_dict=True)
#                     report_test = classification_report(y_test, y_pred_test_class, output_dict=True)
                    
#                     # Calcular métricas para el conjunto de entrenamiento
#                     accuracy_train = accuracy_score(y_train, y_pred_train_class)
#                     log_loss_train = log_loss(y_train, y_pred_train_proba)  # Asegúrate de usar probabilidades aquí
#                     roc_auc_train = roc_auc_score(y_train, y_pred_train_proba)
#                     f1_score_train = f1_score(y_train, y_pred_train_class, average='macro')
#                     f1_score_train_si = report_train['1']['f1-score']  # Asumiendo que '1' representa la clase 'si'
#                     f1_score_train_no = report_train['0']['f1-score']  # Asumiendo que '0' representa la clase 'no'
#                     mcc_score_train = matthews_corrcoef(y_train, y_pred_train_class)

#                     # Calcular métricas para el conjunto de pruebas
#                     accuracy_test = accuracy_score(y_test, y_pred_test_class)
#                     log_loss_test = log_loss(y_test, y_pred_test_proba) 
#                     roc_auc_test = roc_auc_score(y_test, y_pred_test_proba)
#                     f1_score_test = f1_score(y_test, y_pred_test_class, average='macro')
#                     f1_score_test_si = report_test['1']['f1-score']  # Asumiendo que '1' representa la clase 'si'
#                     f1_score_test_no = report_test['0']['f1-score']  # Asumiendo que '0' representa la clase 'no'
#                     mcc_score_test = matthews_corrcoef(y_test, y_pred_test_class)

#                     # Actualizar el diccionario de resultados
#                     results[ 'Model' ].append( f'{ model_name }_{ train_suffix }' )
                    
#                     results[ 'accuracy_train' ].append( round( accuracy_train, 3 ) )            
#                     results[ 'log_loss_train' ].append( round( log_loss_train, 3 ) )
#                     results[ 'roc_auc_train' ].append( round( roc_auc_train, 3 ) )
#                     results[ 'f1_train' ].append( round( f1_score_train, 3 ) )
#                     results[ 'f1_train_si' ].append( round( f1_score_train_si, 3 ) )
#                     results[ 'f1_train_no' ].append( round( f1_score_train_no, 3 ) )
#                     results[ 'MCC_train' ].append( round( mcc_score_train, 3 ) )               
                    
#                     results[ 'accuracy_test' ].append( round( accuracy_test, 3 ) )
#                     results[ 'log_loss_test' ].append( round( log_loss_test, 3 ) )
#                     results[ 'roc_auc_test' ].append( round( roc_auc_test, 3 ) )
#                     results[ 'f1_test' ].append( round( f1_score_test, 3 ) )
#                     results[ 'f1_test_si' ].append( round( f1_score_test_si, 3 ) )
#                     results[ 'f1_test_no' ].append( round( f1_score_test_no, 3 ) )   
#                     results[ 'MCC_test' ].append( round( mcc_score_test, 3 ) )   
                    
#                     results[ 'Grid_Search_Params' ].append( best_params ) 

#             # Convertir el diccionario de resultados a DataFrame
#             results_df_total = pd.DataFrame(results)
#             print(results_df_total)
#             results_df_total = results_df_total.sort_values( by = 'f1_test', ascending = False )
#             results_df_total.to_excel(writer, sheet_name=f'Threshold_{threshold}')

In [67]:
def test_regression_forest(models, x_train_list, y_train_list, x_test, y_test, path_list, sufix):

    '''
    Objetivo:

        - Implementar el modelo Regression Forest adaptado para una clasificación binaria

    Input:

        - models      : Diccionario que especifica el modelo de Regressión Forest y los parámetros de grid search.
        - x_train_list: Lista de conjuntos de entrenamiento con las variables predictoras. La lista debe
                        seguir el siguiente orden: Original, SMOTE, SMOTE Tomek-Links y Naive Random 
                        Oversampling. Ejemplo: x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
        - y_train_list: Lista de conjuntos de entrenamiento con la variable predicha. La lista debe
                        seguir el siguiente orden: Original, SMOTE, SMOTE Tomek-Links y Naive Random 
                        Oversampling. Ejemplo: y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]
        - x_test      : Conjunto de prueba con las variables predictoras
        - y_test      : Cnjunto de prueba con la variable predicha
        - path_list   : Lista de paths donde se guardarán los archivo output. Se asume que la lista de paths
                        tendrá cuatro elementos, los cuales son: path en el que se guardan los modelos preentrenados
                        (primero), path en el que se guardan las métricas de desempeño (segundo), path en el que se
                        guardan las  listas de variables con importancia/coeficientes (tercero) y path en el que se 
                        guardan los resultados de grid search (cuarto). Se asume que se sigue el orden mencionado en 
                        paréntesis.

    Output:

        - resultados               : Pandas dataframe con las métricas de los distintos modelos implementados.
        - modelos entrenados       : todos los modelos entrenados se guardan en formato joblib en el path 
                                     especificado
        - lista de variables       : Listas de variables que muestra la importancia (en el caso de los métodos 
                                     basados en árboles) o coeficientes (en el caso de los métodos lineares) de 
                                     las variables predictoras. Se muestran en formato de tabla. 
        - resultados de grid search: Detalles sobre el ajuste del algoritmo de Grid Search. Se muestran en 
                                     formato de tabla.
                                     
    '''

    columns   = [ 'no', 'si' ]
    threshold_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    for path in path_list:
        if not os.path.exists(path):
            os.makedirs(path)
    
    results = {
        'Model'             : [],
        
        'accuracy_train'    : [],
        'log_loss_train'    : [],
        'roc_auc_train'     : [],
        'f1_train'          : [],
        'f1_train_si'       : [],
        'f1_train_no'       : [],        
        'MCC_train'         : [],               
        
        'accuracy_test'     : [],
        'log_loss_test'     : [],
        'roc_auc_test'      : [],
        'f1_test'           : [],
        'f1_test_si'        : [],
        'f1_test_no'        : [],        
        'MCC_test'          : [],
        
        'Grid_Search_Params': []
        
    }

    for model_name, model_params in models.items():
        if 'model' not in model_params:
            raise ValueError(f'Model is not defined for {model_name}')

        model = model_params['model']
        grid_params = model_params.get('grid_params', None)

        for index, (x_train, y_train) in enumerate(zip(x_train_list, y_train_list)):
            train_suffix = extract_suffix( x_train.name ) 
            pred_vars      = x_train.columns.to_list()

            cv = KFold(n_splits=5, shuffle=True, random_state=2023)
            grid_search = GridSearchCV(model, grid_params, cv=cv, scoring='r2')
            grid_search.fit(x_train, y_train)
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

            # Export models
            joblib.dump( best_model, f'{ path_list[ 0 ] }/model_{ sufix }_{ model_name }_{ train_suffix }.joblib' )

            # Export Grid Search Results
            results_gs  = pd.DataFrame( grid_search.cv_results_ )
            results_gs.to_excel( f'{ path_list[ 3 ] }/gs_{ sufix }_{ model_name }_{ train_suffix }.xlsx' )

            # Export features importance
            feature_importances = best_model.feature_importances_
            vars_df             = pd.DataFrame( {'Var': pred_vars, 'Importance Score': feature_importances } )
            vars_df             = vars_df.reindex( vars_df[ 'Importance Score' ].abs().sort_values( ascending = False ).index )
            vars_df.to_excel( f'{ path_list[ 2 ] }/varlist_{ sufix }_{ model_name }_{ train_suffix }.xlsx' )

            y_pred_train_proba = best_model.predict(x_train)
            y_pred_test_proba = best_model.predict(x_test)

            for threshold in threshold_range:

                # Calcular predicciones y métricas para el threshold actual
                y_pred_train_class = (best_model.predict(x_train) >= threshold).astype(int)
                y_pred_test_class = (best_model.predict(x_test) >= threshold).astype(int)

                # Agregar resultados al diccionario
                report_train = classification_report( y_train, y_pred_train_class, target_names = columns, output_dict = True )
                report_test = classification_report( y_test, y_pred_test_class, target_names = columns, output_dict = True )  
                
                # Calcular métricas para el conjunto de entrenamiento
                accuracy_train = accuracy_score(y_train, y_pred_train_class)
                log_loss_train = log_loss(y_train, y_pred_train_proba)
                roc_auc_train = roc_auc_score(y_train, y_pred_train_proba)
                f1_score_train = f1_score(y_train, y_pred_train_class, average='macro')
                f1_score_train_si = report_train['si']['f1-score']
                f1_score_train_no = report_train['no']['f1-score']
                mcc_score_train = matthews_corrcoef(y_train, y_pred_train_class)

                # Calcular métricas para el conjunto de pruebas
                accuracy_test = accuracy_score(y_test, y_pred_test_class)
                log_loss_test = log_loss(y_test, y_pred_test_proba) 
                roc_auc_test = roc_auc_score(y_test, y_pred_test_proba)
                f1_score_test = f1_score(y_test, y_pred_test_class, average='macro')
                f1_score_test_si = report_test['si']['f1-score']
                f1_score_test_no = report_test['no']['f1-score'] 
                mcc_score_test = matthews_corrcoef(y_test, y_pred_test_class)

                # Actualizar el diccionario de resultados
                results[ 'Model' ].append( f'{threshold}_{ model_name }_{ train_suffix }' )
                
                results[ 'accuracy_train' ].append( round( accuracy_train, 3 ) )            
                results[ 'log_loss_train' ].append( round( log_loss_train, 3 ) )
                results[ 'roc_auc_train' ].append( round( roc_auc_train, 3 ) )
                results[ 'f1_train' ].append( round( f1_score_train, 3 ) )
                results[ 'f1_train_si' ].append( round( f1_score_train_si, 3 ) )
                results[ 'f1_train_no' ].append( round( f1_score_train_no, 3 ) )
                results[ 'MCC_train' ].append( round( mcc_score_train, 3 ) )               
                
                results[ 'accuracy_test' ].append( round( accuracy_test, 3 ) )
                results[ 'log_loss_test' ].append( round( log_loss_test, 3 ) )
                results[ 'roc_auc_test' ].append( round( roc_auc_test, 3 ) )
                results[ 'f1_test' ].append( round( f1_score_test, 3 ) )
                results[ 'f1_test_si' ].append( round( f1_score_test_si, 3 ) )
                results[ 'f1_test_no' ].append( round( f1_score_test_no, 3 ) )   
                results[ 'MCC_test' ].append( round( mcc_score_test, 3 ) )   
                
                results[ 'Grid_Search_Params' ].append( best_params )  

        # Convertir el diccionario de resultados a DataFrame
        results_df_total = pd.DataFrame(results)
        results_df_total = results_df_total.sort_values( by = 'f1_test', ascending = False )

    return results_df_total

In [68]:
# Regression Forest Model

resultados_rf = test_regression_forest( models_regression_forest, x_train_list, y_train_list, x_test, y_test, path_list, sufix )

# Write the metrics into the results folder defined above instead of repeating
# the folder literal, so changing the execution folder keeps all outputs together.
resultados_rf.to_excel( os.path.join( results_path, 'base0_ca_regression_forest.xlsx' ) )