# 1. Load data

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [30]:
import pandas as  pd
import numpy as np
import pickle
import joblib
import os

In [3]:
import functions as fn

In [112]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [6]:
from sklearn.datasets import make_classification

X, y = make_classification(
    # n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=2,
    random_state=999,
    weights = [0.25, 0.75],
)

In [7]:
import pandas as pd

# Create DataFrame with features as columns
dataset = pd.DataFrame(X)
# give custom names to the features
dataset.columns = ['X1', 'X2', 'X3', 'X4', 'X5']
# Now add the label as a column
dataset['y'] = y

dataset

Unnamed: 0,X1,X2,X3,X4,X5,y
0,0.124407,-2.516167,-0.642083,-0.546166,2.020140,1
1,-0.676661,1.561602,0.939160,0.174470,-0.867781,1
2,-0.986671,-0.138833,0.917341,0.974462,1.004820,1
3,0.094244,-1.883036,-0.492748,-0.660872,1.460955,1
4,-0.111260,0.768110,0.225981,-0.667605,-0.731197,1
...,...,...,...,...,...,...
95,0.190516,-1.427720,-0.499187,-0.878646,0.949568,0
96,-0.725680,1.271393,0.934481,0.369618,-0.539265,1
97,-1.338530,-1.958133,0.786235,-1.014356,2.457490,0
98,-1.615453,0.247027,1.482545,-0.983854,0.715300,0


In [93]:
dataset[ 'y' ].value_counts()

1    75
0    25
Name: y, dtype: int64

In [8]:
pred_vars  = [ col for col in dataset.columns if col != 'y' ]

x_train, x_test, y_train, y_test = train_test_split( dataset[ pred_vars ], 
                                                     dataset[ 'y' ], 
                                                     test_size    = 0.25,
                                                     random_state = 2023 )

In [9]:
x_train_s, x_train_st, x_train_nro, y_train_s, y_train_st, y_train_nro = fn.resampling( x_train, y_train )

# 2. Models implementation

### 2.1. Regresión

En este caso, se predice la variable 'craest_3c'

In [87]:
models = {
    
    'Logistic Regression'      : { 'model'      : LogisticRegression( random_state = 2023 ) },
    
    'Logistic Lasso'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l1', solver = 'saga' ),
                                   'grid_params': { 'Cs': [ 0.001, 0.01, 0.1, 1, 10, 100 ] } },
    
#     'Logistic Ridge'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l2', solver = 'saga' ),
#                                    'grid_params': { 'Cs': [ 0.001, 0.01, 0.1, 1, 10, 100 ] } },
    
#     'Random Forest Classifier' : { 'model'      : RandomForestClassifier( random_state = 2023 ),
#                                    'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 10, 20, 30 ] } },
    
    'XGboost Classifier'       : { 'model'      : XGBClassifier( random_state = 2023 ),
                                   'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ] } }
    
        }

In [117]:
path_models = r'..\..\..\output\ejecucion_5\models\base0'
path_vars   = r'..\..\..\output\ejecucion_5\results\base0'
path_gs     = r'..\..\..\output\ejecucion_5\gridsearch_results\base0'

In [118]:
x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]

In [119]:
path_list = [ path_models, path_vars, path_gs ]

In [45]:
for index, ( x_train, y_train ) in enumerate( zip( x_train_list, y_train_list ) ):
    print( index, x_train.shape )

0 (108, 5)
1 (108, 5)
2 (104, 5)
3 (108, 5)


In [46]:
y_train_st.value_counts()

1    52
0    52
Name: y, dtype: int64

Parámetros adicionales:

* Path para exportar modelos preentrenados
* Path para exportar listas de variables
* Path para exportar grid_search results


In [124]:
def test_models_classification( models, x_train_list, y_train_list, x_test, y_test, path_list ):
                               
    
    results = {
        
        'Model'             : [],
        'accuracy_train'    : [],
        'accuracy_test'     : [],
        'log_loss_train'    : [],
        'log_loss_test'     : [],
        'roc_auc_train'     : [],
        'roc_auc_test'      : [],
        'f1_train'          : [],
        'f1_test'           : [],
        'MCC_train'         : [],
        'MCC_test'          : [],
        'Grid_Search_Params': []
        
    }
    
    for path in path_list:
        if not os.path.exists( path ):
            os.makedirs( path )
    
    for model_name, model_params in models.items():
        
        if 'model' in model_params:
            model = model_params[ 'model' ]
        else:
            raise ValueError( f'Model is not defined for { model_name }' )
        
        if 'grid_params' in model_params:
            grid_params = model_params[ 'grid_params' ]
        else:
            grid_params = None
            
            
        for index, ( x_train, y_train ) in enumerate( zip( x_train_list, y_train_list ) ):
            
            pred_vars      = x_train.columns.to_list()
            variables_dict = {}

            if grid_params is not None:

                cv          = KFold( n_splits = 5, shuffle = True, random_state = 2023 )
                grid_search = GridSearchCV( model, grid_params, cv = cv )

                grid_search.fit( x_train, y_train )
                
                results_gs  = pd.DataFrame( grid_search.cv_results_ )
                results_gs.to_excel( f'{ path_list[ 2 ] }/gs_{ model_name }_{ index }.xlsx' )

                best_model  = grid_search.best_estimator_
                best_params = grid_search.best_params_

                y_pred_train_class = best_model.predict( x_train )
                y_pred_train_proba = best_model.predict_proba( x_train )[ :, 1 ]            

                y_pred_test_class  = best_model.predict( x_test )
                y_pred_test_proba  = best_model.predict_proba( x_test )[ :, 1 ]
                
                joblib.dump( best_model, f'{ path_list[ 0 ] }/model_{ model_name }_{ index }.joblib' )

                if hasattr( best_model, 'feature_importances_' ):
                    
                    feature_importances = best_model.feature_importances_
                    vars_df             = pd.DataFrame( {'Var': pred_vars, 'Importance Score': feature_importances } )
                    vars_df             = vars_df.reindex( vars_df[ 'Importance Score' ].abs().sort_values( ascending = False ).index )
                    vars_df.to_excel( f'{ path_list[ 1 ] }/varlist_{ model_name }_{ index }.xlsx' )

                elif hasattr( best_model, 'coef_' ):
                    
                    coefficients = best_model.coef_[ 0 ]
                    vars_df      = pd.DataFrame( {'Var': best_model.feature_names_in_, 'Coefficient': coefficients } )
                    vars_df      = vars_df.reindex( vars_df[ 'Coefficient' ].abs().sort_values( ascending = False ).index )
                    vars_df.to_excel( f'{ path_list[ 1 ] }/varlist_{ model_name }_{ index }.xlsx' )

            else:
                model.fit( x_train, y_train )

                best_params  = 'No grid search'

                y_pred_train_class = model.predict( x_train )
                y_pred_train_proba = model.predict_proba( x_train )[ :, 1 ]            

                y_pred_test_class  = model.predict( x_test )
                y_pred_test_proba  = model.predict_proba( x_test )[ :, 1 ]
                
                joblib.dump( model, f'{ path_models }/{ model_name }_{ index }.joblib' )

                coefficients = model.coef_[ 0 ]
                vars_df      = pd.DataFrame( {'Var': model.feature_names_in_, 'Coefficient': coefficients } )
                vars_df      = vars_df.reindex( vars_df[ 'Coefficient' ].abs().sort_values( ascending = False ).index )
                vars_df.to_excel( f'{ path_list[ 1 ] }/varlist_{ model_name }_{ index }.xlsx' )

            accuracy_train  = accuracy_score( y_train, y_pred_train_class )
            log_loss_train  = log_loss( y_train, y_pred_train_class )
            roc_auc_train   = roc_auc_score( y_train, y_pred_train_proba )
            f1_score_train  = f1_score( y_train, y_pred_train_class, average = 'macro' )
            mcc_score_train = matthews_corrcoef( y_train, y_pred_train_class )

            accuracy_test   = accuracy_score( y_test, y_pred_test_class )
            log_loss_test   = log_loss( y_test, y_pred_test_class )
            roc_auc_test    = roc_auc_score( y_test, y_pred_test_proba )
            f1_score_test   = f1_score( y_test, y_pred_test_class, average = 'macro' )
            mcc_score_test  = matthews_corrcoef( y_test, y_pred_test_class )

            results[ 'Model' ].append( f'{ model_name }_{ index }' )
            results[ 'accuracy_train' ].append( round( accuracy_train, 3 ) )
            results[ 'accuracy_test' ].append( round( accuracy_test, 3 ) )
            results[ 'log_loss_train' ].append( round( log_loss_train, 3 ) )
            results[ 'log_loss_test' ].append( round( log_loss_test, 3 ) )
            results[ 'roc_auc_train' ].append( round( roc_auc_train, 3 ) )
            results[ 'roc_auc_test' ].append( round( roc_auc_test, 3 ) )
            results[ 'f1_train' ].append( round( f1_score_train, 3 ) )
            results[ 'f1_test' ].append( round( f1_score_test, 3 ) )
            results[ 'MCC_train' ].append( round( mcc_score_train, 3 ) )
            results[ 'MCC_test' ].append( round( mcc_score_test, 3 ) )          
            results[ 'Grid_Search_Params' ].append( best_params )
        
    results_df = pd.DataFrame( results )
    results_df = results_df.sort_values( by = 'f1_test', ascending = False )

    return results_df

In [125]:
%%time

results_reg = test_models_classification( models, x_train_list, y_train_list, x_test, y_test, path_list )

CPU times: total: 1min 54s
Wall time: 25.6 s


In [126]:
results_reg

Unnamed: 0,Model,accuracy_train,accuracy_test,log_loss_train,log_loss_test,roc_auc_train,roc_auc_test,f1_train,f1_test,MCC_train,MCC_test,Grid_Search_Params
8,XGboost Classifier_0,1.0,0.92,0.0,2.883,1.0,0.857,1.0,0.851,1.0,0.702,"{'max_depth': 2, 'n_estimators': 250}"
10,XGboost Classifier_2,1.0,0.92,0.0,2.883,1.0,0.845,1.0,0.851,1.0,0.702,"{'max_depth': 2, 'n_estimators': 250}"
11,XGboost Classifier_3,1.0,0.92,0.0,2.883,1.0,0.857,1.0,0.851,1.0,0.702,"{'max_depth': 2, 'n_estimators': 250}"
9,XGboost Classifier_1,1.0,0.88,0.0,4.325,1.0,0.917,1.0,0.751,1.0,0.51,"{'max_depth': 2, 'n_estimators': 500}"
1,Logistic Regression_1,0.407,0.6,21.359,14.417,0.47,0.56,0.404,0.504,-0.187,0.089,No grid search
2,Logistic Regression_2,0.404,0.6,21.488,14.417,0.476,0.56,0.398,0.504,-0.196,0.089,No grid search
0,Logistic Regression_0,0.639,0.68,13.016,11.534,0.636,0.452,0.637,0.5,0.28,0.01,No grid search
3,Logistic Regression_3,0.639,0.68,13.016,11.534,0.636,0.452,0.637,0.5,0.28,0.01,No grid search
4,Logistic Lasso_0,0.63,0.68,13.35,11.534,0.638,0.452,0.628,0.5,0.262,0.01,{'Cs': 100}
7,Logistic Lasso_3,0.63,0.68,13.35,11.534,0.638,0.452,0.628,0.5,0.262,0.01,{'Cs': 100}


In [123]:
results_reg.to_excel( 'results.xlsx' )

In [67]:
model1 = RandomForestClassifier( random_state = 2023 )
model1.fit( x_train, y_train )
vars_dict = { var: score for var, score in zip( pred_vars, model1.feature_importances_ ) }

In [70]:
vars_dict = { var: score for var, score in zip( pred_vars, model1.feature_importances_ ) }
vars_df   = pd.DataFrame( vars_dict, index = pred_vars )

In [71]:
vars_dict

{'X1': 0.30447100486630746,
 'X2': 0.14130582154778112,
 'X3': 0.19052687487653105,
 'X4': 0.21725349580220096,
 'X5': 0.14644280290717948}

In [72]:
vars_df

Unnamed: 0,X1,X2,X3,X4,X5
X1,0.304471,0.141306,0.190527,0.217253,0.146443
X2,0.304471,0.141306,0.190527,0.217253,0.146443
X3,0.304471,0.141306,0.190527,0.217253,0.146443
X4,0.304471,0.141306,0.190527,0.217253,0.146443
X5,0.304471,0.141306,0.190527,0.217253,0.146443


In [74]:
importances = model1.feature_importances_

# Crear un DataFrame a partir de las importancias y variables
vars_df = pd.DataFrame({'Variable': pred_vars, 'Score': importances})

In [75]:
vars_df

Unnamed: 0,Variable,Score
0,X1,0.304471
1,X2,0.141306
2,X3,0.190527
3,X4,0.217253
4,X5,0.146443


In [83]:
model2 = LogisticRegression( random_state = 2023 )
model2.fit( x_train, y_train )
vars_dict = { var: score for var, score in zip( model2.feature_names_in_, model2.coef_[ 0 ] ) }

In [84]:
vars_dict

{'X1': 0.12059142810644019,
 'X2': 0.13756248895016432,
 'X3': -0.07442007135896472,
 'X4': 0.18600183273230658,
 'X5': -0.1684039377415832}

In [85]:
importances = model2.coef_[ 0 ]

# Crear un DataFrame a partir de las importancias y variables
vars_df = pd.DataFrame({'Variable': model2.feature_names_in_, 'Score': importances})

In [86]:
vars_df

Unnamed: 0,Variable,Score
0,X1,0.120591
1,X2,0.137562
2,X3,-0.07442
3,X4,0.186002
4,X5,-0.168404


In [81]:
importances[ 0 ]

array([ 0.12059143,  0.13756249, -0.07442007,  0.18600183, -0.16840394])