# BASE 0 - CI

Information:

* Database: base 0
* Predicted variable: Corrupción Intensa
* Predictor variables: Only SIAF and Canon
* Type of prediction: Clasification
* Periodo: 2016-2020

## 1. Cargar librerías, módulos y datos

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [2]:
import pandas as  pd
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
from importlib.machinery import SourceFileLoader

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold

In [5]:
fun = SourceFileLoader( 'funciones', r'..\..\..\code\modules\funciones.py' ).load_module()
vn  = SourceFileLoader( 'variables_nombres', r'..\..\..\code\modules\variables_nombres.py' ).load_module()

In [6]:
path = r'..\..\..\input\preprocessed_data\base0_siaf.csv'
data = pd.read_csv( path )

## 2. Realizar la partición en conjunto de entrenamiento y prueba

In [13]:
dep_var    = [ 'corrup_intensa' ]
other_vars = [ 'monto_examinado', 'monto_auditado', 'monto_objeto_servicio', 
               'monto_corrup1', 'monto_corrup2', 'tipo_control', 'corrup_amplia',
               'per_corrup1', 'per_corrup2', '_monto', 'monto_', 'year', 'ubigeo' ]

pred_vars  = [ col for col in data.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], 
                                                     data[ 'corrup_intensa' ], 
                                                     test_size    = 0.3,
                                                     random_state = 2023 )

In [14]:
x_train_columns = x_train.columns.to_list()
x_train_columns = pd.DataFrame( x_train_columns, columns = [ 'colname' ] )
x_train_columns.to_excel( r'..\..\..\code\prediction\colnames_b0_ci_siaf.xlsx' )

## 3. Implementar métodos de muestreo

Se implementan los métodos SMOTE, SMOTE Tomek-Links y Naive Random Oversampling

In [15]:
x_train_s, x_train_st, x_train_nro, y_train_s, y_train_st, y_train_nro = fun.resampling( x_train, y_train )

## 4. Implementar los modelos de clasificación

In [16]:
o_20 = 20*x_train.shape[ 1 ]/100
o_30 = 30*x_train.shape[ 1 ]/100
o_40 = 40*x_train.shape[ 1 ]/100

print( o_20, o_30, o_40, sep = '\n' )

2946.4
4419.6
5892.8


In [17]:
models = {
    
    'Logistic Regression'      : { 'model'      : LogisticRegression( random_state = 2023, n_jobs = -1 ) },
    
#     'Logistic Lasso'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l1', solver = 'saga', n_jobs = -1 ),
#                                    'grid_params': { 'Cs': [ 0.001, 0.01, 0.1, 1, 10, 100 ] } },
    
#     'Logistic Ridge'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l2', solver = 'saga', n_jobs = -1 ),
#                                    'grid_params': { 'Cs': [ 0.001, 0.01, 0.1, 1, 10, 100 ] } },
    
#     'Logistic Elastic Net'     : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'elasticnet', solver = 'saga', l1_ratios = [ 0.5 ], n_jobs = -1 ),
#                                    'grid_params': { 'Cs': [ 0.001, 0.01, 0.1, 1, 10, 100 ] } },
    
    'Random Forest Classifier' : { 'model'      : RandomForestClassifier( random_state = 2023, n_jobs = -1 ),
                                   'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 10, 20, 30 ], 'max_features': [ 2946, 4420, 5893 ] } },
    
    'XGboost Classifier'       : { 'model'      : XGBClassifier( random_state = 2023, use_label_encoder = False, objective = 'binary:logistic', verbosity = 0, learning_rate = 0.1, n_jobs = -1 ),
                                   'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ], 'max_features': [ 2946, 4420, 5893 ] } }
    
        }

In [18]:
models_path = r'..\..\..\output\ejecucion_6\models\base0'
vars_path   = r'..\..\..\output\ejecucion_6\results\base0'
gs_path     = r'..\..\..\output\ejecucion_6\gridsearch_results\base0'

In [19]:
x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]
path_list    = [ models_path, vars_path, gs_path ]

In [20]:
sufix = 'ci'

In [21]:
resultados = fun.test_models_classification( models, x_train_list, y_train_list, x_test, y_test, path_list, sufix )
resultados.to_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci.xlsx' )

In [22]:
# resultados.to_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci.xlsx' )

In [25]:
# base0_ci_tb  = pd.read_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci_tb.xlsx' )
# base0_ci_reg = pd.read_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci_reg.xlsx' )

In [27]:
# base0_ci = pd.concat( [ base0_ci_tb, base0_ci_reg ], axis = 0 )

Unnamed: 0.1,Unnamed: 0,Model,accuracy_train,accuracy_test,log_loss_train,log_loss_test,roc_auc_train,roc_auc_test,f1_train,f1_test,MCC_train,MCC_test,Grid_Search_Params
0,9,XGboost Classifier_1,0.976,0.662,0.818,11.662,0.999,0.642,0.976,0.625,0.953,0.25,"{'max_depth': 2, 'max_features': 2946, 'n_esti..."
1,5,Random Forest Classifier_1,0.978,0.648,0.744,12.164,0.999,0.654,0.978,0.615,0.957,0.23,"{'max_depth': 30, 'max_features': 2946, 'n_est..."
2,6,Random Forest Classifier_2,0.979,0.645,0.728,12.276,0.999,0.644,0.979,0.607,0.958,0.215,"{'max_depth': 30, 'max_features': 2946, 'n_est..."
3,11,XGboost Classifier_3,0.976,0.643,0.818,12.331,0.998,0.655,0.976,0.607,0.953,0.213,"{'max_depth': 2, 'max_features': 2946, 'n_esti..."
4,10,XGboost Classifier_2,0.977,0.645,0.804,12.276,0.999,0.645,0.977,0.602,0.954,0.205,"{'max_depth': 2, 'max_features': 2946, 'n_esti..."
5,7,Random Forest Classifier_3,0.978,0.651,0.744,12.052,0.999,0.654,0.978,0.601,0.957,0.207,"{'max_depth': 30, 'max_features': 5893, 'n_est..."
6,8,XGboost Classifier_0,0.873,0.656,4.38,11.885,0.951,0.663,0.854,0.597,0.72,0.205,"{'max_depth': 1, 'max_features': 2946, 'n_esti..."
7,4,Random Forest Classifier_0,0.942,0.656,2.011,11.885,0.994,0.659,0.935,0.589,0.873,0.194,"{'max_depth': 10, 'max_features': 2946, 'n_est..."
8,2,Logistic Regression_2,0.789,0.598,7.295,13.894,0.885,0.599,0.789,0.579,0.578,0.166,No grid search
9,0,Logistic Regression_0,0.776,0.612,7.731,13.392,0.836,0.606,0.748,0.573,0.5,0.147,No grid search


In [29]:
# base0_ci = base0_ci.sort_values( by = [ 'f1_test' ], ascending = False )
# base0_ci.to_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci.xlsx' )