# BASE 0 - CI

Information:

* Database: base 0
* Predicted variable: Corrupción Intensa
* Predictor variables: Only SIAF and Canon
* Type of prediction: Clasification
* Periodo: 2016-2020

## 1. Cargar librerías, módulos y datos

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [2]:
import pandas as  pd
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
from importlib.machinery import SourceFileLoader

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold

In [5]:
fun = SourceFileLoader( 'funciones', r'..\..\..\code\modules\funciones.py' ).load_module()
vn  = SourceFileLoader( 'variables_nombres', r'..\..\..\code\modules\variables_nombres.py' ).load_module()

In [6]:
path = r'..\..\..\input\preprocessed_data\base0_siaf.csv'
data = pd.read_csv( path )

## 2. Realizar la partición en conjunto de entrenamiento y prueba

In [7]:
dep_var    = [ 'corrup_intensa' ]
other_vars = [ 'monto_examinado', 'monto_auditado', 'monto_objeto_servicio', 
               'monto_corrup1', 'monto_corrup2', 'tipo_control', 'corrup_amplia',
               'per_corrup1', 'per_corrup2', '_monto', 'monto_', 'year', 'ubigeo' ]

pred_vars  = [ col for col in data.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], 
                                                     data[ 'corrup_intensa' ], 
                                                     test_size    = 0.3,
                                                     random_state = 2023 )

In [8]:
x_train_columns = x_train.columns.to_list()
x_train_columns = pd.DataFrame( x_train_columns, columns = [ 'colname' ] )
x_train_columns.to_excel( r'..\..\..\code\prediction\colnames_b0_ci_siaf.xlsx' )

## 3. Implementar métodos de muestreo

Se implementan los métodos SMOTE, SMOTE Tomek-Links y Naive Random Oversampling

In [9]:
x_train_s, x_train_st, x_train_nro, y_train_s, y_train_st, y_train_nro = fun.resampling( x_train, y_train )

In [10]:
check_data      = np.zeros( ( 4, 4 ) )

check_data[ 0 ] = [ x_train.shape[ 0 ], x_train.shape[ 1 ], 
                    y_train.value_counts()[ 0 ], y_train.value_counts()[ 1 ] ]

check_data[ 1 ] = [ x_train_s.shape[ 0 ], x_train_smote.shape[ 1 ], 
                    x_train_s.value_counts()[ 0 ], y_train_smote.value_counts()[ 1 ] ]

check_data[ 2 ] = [ x_train_st.shape[ 0 ], x_train_smote_tomek.shape[ 1 ],
                    x_train_st.value_counts()[ 0 ], y_train_smote_tomek.value_counts()[ 1 ] ]

check_data[ 3 ] = [ x_train_ros.shape[ 0 ], x_train_ros.shape[ 1 ],
                    y_train_ros.value_counts()[ 0 ], y_train_ros.value_counts()[ 1 ] ]

colnames        = [ 'observaciones', 'variables', 'Nro. No', 'Nro. Si' ]

rownames        = [ 'Original',
                    'SMOTE',
                    'SOMTE Tomek',
                    'NRS' ]

table_check_data = pd.DataFrame( check_data, columns = colnames )
table_check_data.index = rownames
table_check_data

KeyError: 'corrup_intensa'

In [19]:
check_data      = np.zeros( ( 4, 4 ) )

check_data[ 0 ] = [ x_train.shape[ 0 ], x_train.shape[ 1 ], 
                    y_train.value_counts()[ 0 ], y_train.value_counts()[ 1 ] ]

check_data[ 1 ] = [ x_train_s.shape[ 0 ], x_train_s.shape[ 1 ], 
                    y_train_s.value_counts()[ 0 ], y_train_s.value_counts()[ 1 ] ]

check_data[ 2 ] = [ x_train_st.shape[ 0 ], x_train_st.shape[ 1 ],
                    y_train_st.value_counts()[ 0 ], y_train_st.value_counts()[ 1 ] ]

check_data[ 3 ] = [ x_train_nro.shape[ 0 ], x_train_nro.shape[ 1 ],
                    y_train_nro.value_counts()[ 0 ], y_train_nro.value_counts()[ 1 ] ]

colnames        = [ 'observaciones', 'variables', 'Nro. No', 'Nro. Si' ]

rownames        = [ 'Original',
                    'SMOTE',
                    'SOMTE Tomek',
                    'NRS' ]

table_check_data = pd.DataFrame( check_data, columns = colnames )
table_check_data.index = rownames
table_check_data

Unnamed: 0,observaciones,variables,Nro. No,Nro. Si
Original,1443.0,14732.0,514.0,929.0
SMOTE,1858.0,14732.0,929.0,929.0
SOMTE Tomek,1808.0,14732.0,904.0,904.0
NRS,1858.0,14732.0,929.0,929.0


## 4. Implementar los modelos de clasificación

In [16]:
o_20 = 20*x_train.shape[ 1 ]/100
o_30 = 30*x_train.shape[ 1 ]/100
o_40 = 40*x_train.shape[ 1 ]/100

print( o_20, o_30, o_40, sep = '\n' )

2946.4
4419.6
5892.8


In [17]:
models = {
    
    'Logistic Regression'      : { 'model'      : LogisticRegression( random_state = 2023, n_jobs = -1 ) },
    
    'Logistic Lasso'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l1', solver = 'saga', n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },
    
    'Logistic Ridge'           : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'l2', solver = 'saga', n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },
    
    'Logistic Elastic Net'     : { 'model'      : LogisticRegressionCV( random_state = 2023, penalty = 'elasticnet', solver = 'saga', l1_ratios = [ 0.5 ], n_jobs = -1, Cs = [ 0.001, 0.01, 0.1, 1, 10, 100 ] ) },
    
    'Random Forest Classifier' : { 'model'      : RandomForestClassifier( random_state = 2023, n_jobs = -1 ),
                                   'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 10, 20, 30 ], 'max_features': [ 2946, 4420, 5893 ] } },
    
    'XGboost Classifier'       : { 'model'      : XGBClassifier( random_state = 2023, use_label_encoder = False, objective = 'binary:logistic', verbosity = 0, learning_rate = 0.1, n_jobs = -1 ),
                                   'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ], 'max_features': [ 2946, 4420, 5893 ] } }
    
        }

In [18]:
models_path = r'..\..\..\output\ejecucion_6\models\base0'
vars_path   = r'..\..\..\output\ejecucion_6\results\base0'
gs_path     = r'..\..\..\output\ejecucion_6\gridsearch_results\base0'

In [19]:
x_train_list = [ x_train, x_train_s, x_train_st, x_train_nro ]
y_train_list = [ y_train, y_train_s, y_train_st, y_train_nro ]
path_list    = [ models_path, vars_path, gs_path ]

In [20]:
sufix = 'ci'

In [21]:
resultados = fun.test_models_classification( models, x_train_list, y_train_list, x_test, y_test, path_list, sufix )
resultados.to_excel( r'..\..\..\output\ejecucion_6\results\base0\base0_ci.xlsx' )