# BASE 0 - CI

Information:

* Database: base 0
* Predicted variable: Corrupción Intensa
* Predictor variables: SIAF/Canon, RENAMU y Política
* Type of prediction: Classification
* Period of training: 2016-2020
* Methods: Linear, Regularization & Random Forest

## 1. Cargar librerías, módulos y datos

In [1]:
!pip install imblearn
!pip install xgboost
!pip install sklearn==1.0
!pip install joblib==1.1.0
!pip install scikit-learn==1.0.2
!pip install xlsxwriter
!pip install econml

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
[31mERROR: Could not find a version that satisfies the requirement sklearn==1.0 (from versions: 0.0, 0.0.post1, 0.0.post2, 0.0.post4, 0.0.post5, 0.0.post7, 0.0.post9, 0.0.post10, 0.0.post11, 0.0.post12)[0m[31m
[0m[31mERROR: No matching distribution found for sklearn==1.0[0m[31m
[0mCollecting joblib==1.1.0
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.0/307.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 1.3.2
    Uninstalling joblib-1.3.2:
      Successfully uninstalled joblib-1.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings raised
# while fitting the models further down (e.g. solver convergence warnings).
warnings.filterwarnings( 'ignore' )

In [3]:
import pandas as  pd
import numpy as np
import pickle
import joblib
from importlib.machinery import SourceFileLoader

In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold

In [6]:
from econml.grf import RegressionForest
from lightgbm import LGBMClassifier

In [7]:
import sys

import xlsxwriter
from google.colab import drive

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount( '/content/drive' )

# Put the project's module folder on the import path.
# Presumably this is where `funciones` and `variables_nombres` (imported in
# the next cell) live -- confirm.
sys.path.append( '/content/drive/MyDrive/Corruption_Paper_SIAF/code/modules' )

Mounted at /content/drive


In [8]:
import funciones as fun
import variables_nombres as vn

In [9]:
# Location of the preprocessed "base 0" dataset on the mounted Drive.
path = r'/content/drive/MyDrive/Corruption_Paper_SIAF/input/preprocessed_data/base0.csv'

# Read the whole CSV into memory with pandas' default parsing options.
data = pd.read_csv( filepath_or_buffer = path )

## 2. Realizar la partición en conjunto de entrenamiento y prueba

In [10]:
# Target column.
dep_var    = [ 'corrup_intensa' ]

# Columns that must not be used as predictors (kept out of `pred_vars`).
other_vars = [
    'monto_examinado', 'monto_auditado', 'monto_objeto_servicio',
    'monto_corrup1', 'monto_corrup2',
    'tipo_control', 'corrup_amplia',
    'per_corrup1', 'per_corrup2',
    '_monto', 'monto_',
    'year', 'ubigeo',
]

# Predictors: every remaining column, in the order it appears in `data`.
pred_vars  = data.columns[ ~data.columns.isin( dep_var + other_vars ) ].tolist()

# 70/30 split, stratified on the target so both parts keep the same class
# proportions; the seed makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[ pred_vars ],
    data[ 'corrup_intensa' ],
    stratify     = data[ 'corrup_intensa' ],
    test_size    = 0.3,
    random_state = 2023,
)

In [11]:
# x_train_columns = x_train.columns.to_list()
# x_train_columns = pd.DataFrame( x_train_columns, columns = [ 'colname' ] )
# x_train_columns.to_excel( r'..\..\..\code\prediction\ejecucion_7\colnames_b0_ci_siaf.xlsx' )

## 3. Implementar métodos de muestreo

Se implementan los métodos SMOTE, SMOTE Tomek-Links y Naive Random Oversampling

In [12]:
# Build the resampled versions of the training split (the test split is not
# touched). The summary table below labels them: `_s` = SMOTE,
# `_st` = SMOTE Tomek, `_nro` = naive random oversampling.
( x_train_s, x_train_st, x_train_nro,
  y_train_s, y_train_st, y_train_nro ) = fun.resampling( x_train, y_train )

In [13]:
# Attach a short identifier to each training frame as a plain attribute.
# NOTE(review): presumably the helpers in `fun` read `.name` to label their
# outputs -- confirm in `funciones`.
for _frame, _tag in ( ( x_train,     'x_train_o'   ),
                      ( x_train_s,   'x_train_s'   ),
                      ( x_train_st,  'x_train_st'  ),
                      ( x_train_nro, 'x_train_nro' ) ):
    _frame.name = _tag

In [14]:
# Sanity-check table: one row per training set, showing its number of
# observations, number of predictors and how many observations fall in each
# class (0 -> 'Nro. No', 1 -> 'Nro. Si').
colnames = [ 'observaciones', 'variables', 'Nro. No', 'Nro. Si' ]

rownames = [ 'Original',
             'SMOTE',
             'SMOTE Tomek',
             'NRS' ]

# Feature/target pairs, in the same order as `rownames`.
_train_sets = [ ( x_train,     y_train     ),
                ( x_train_s,   y_train_s   ),
                ( x_train_st,  y_train_st  ),
                ( x_train_nro, y_train_nro ) ]

# Counts are kept as integers so the table does not show them as floats.
# `( y == k ).sum()` gives 0 when class k is absent, where
# `value_counts()[ k ]` would raise KeyError.
check_data = np.array(
    [ [ x.shape[ 0 ], x.shape[ 1 ], int( ( y == 0 ).sum() ), int( ( y == 1 ).sum() ) ]
      for x, y in _train_sets ],
    dtype = int )

table_check_data = pd.DataFrame( check_data, columns = colnames, index = rownames )
table_check_data

Unnamed: 0,observaciones,variables,Nro. No,Nro. Si
Original,967.0,14520.0,272.0,695.0
SMOTE,1390.0,14520.0,695.0,695.0
SOMTE Tomek,1226.0,14520.0,613.0,613.0
NRS,1390.0,14520.0,695.0,695.0


## 4. Implementar los modelos de clasificación

In [15]:
# 20 %, 30 % and 40 % of the number of predictor columns.
# These are the values that appear as `max_features` candidates in the
# commented-out tree-model grids further down.
o_20, o_30, o_40 = ( pct * x_train.shape[ 1 ] / 100 for pct in ( 20, 30, 40 ) )

print( o_20, o_30, o_40, sep = '\n' )

2904.0
4356.0
5808.0


In [16]:
# Shuffled 5-fold splitter with a fixed seed, shared by the cross-validated models.
kfold   = KFold( n_splits = 5, shuffle = True, random_state = 2023 )

# 100 log-spaced values from 1e8 down to 1e-6; handed to LogisticRegressionCV
# through its `Cs` argument in the `models` dictionary.
lambdas = 10 ** np.linspace( start = 8, stop = -6, num = 100 )

# Metric used to pick the best value during cross-validation.
scoring = 'f1_macro'

In [17]:
# Settings shared by the three cross-validated logistic regressions: same seed,
# solver, parallelism, grid of `Cs`, fold splitter and selection metric.
_cv_common = dict( random_state = 2023,
                   solver       = 'saga',
                   n_jobs       = -1,
                   Cs           = lambdas,
                   cv           = kfold,
                   scoring      = scoring )

# Estimators to evaluate, keyed by display name; each entry holds the
# unfitted estimator under 'model'. Insertion order is kept as written.
models = {

    # Plain logistic regression with scikit-learn's default settings.
    'Logistic Regression'      : { 'model'      : LogisticRegression( random_state = 2023, n_jobs = -1 ) },

    # L1-penalised, strength chosen by cross-validation.
    'Logistic Lasso'           : { 'model'      : LogisticRegressionCV( penalty = 'l1', **_cv_common ) },

    # L2-penalised, strength chosen by cross-validation.
    'Logistic Ridge'           : { 'model'      : LogisticRegressionCV( penalty = 'l2', **_cv_common ) },

    # Elastic net with a fixed 50/50 mix of L1 and L2.
    'Logistic Elastic Net'     : { 'model'      : LogisticRegressionCV( penalty = 'elasticnet', l1_ratios = [ 0.5 ], **_cv_common ) },

    # Tree-based models, currently disabled:
    # 'Random Forest Classifier' : { 'model'      : RandomForestClassifier( random_state = 2023, n_jobs = -1 ),
    #                                'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 20, 30 ], 'max_features': [ 2904, 4356, 5808 ] } },

    # 'XGboost Classifier'       : { 'model'      : XGBClassifier( random_state = 2023, use_label_encoder = False, objective = 'binary:logistic', verbosity = 0, learning_rate = 0.1, n_jobs = -1 ),
    #                                'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ], 'max_features': [ 2904, 4356, 5808 ] } },

    # 'LGBMClassifier'           : { 'model'      : LGBMClassifier( random_state = 2023, n_jobs = -1 ),
    #                                'grid_params': { 'n_estimators': [ 250, 500, 1000 ], 'max_depth': [ 1, 2 ] } },

}

# models_regression_forest = {
#     'Regression Forest' : { 'model'      : RegressionForest( random_state = 2023, n_jobs = -1 ),
#                             'grid_params': { 'n_estimators': [ 252, 500, 1000 ], 'max_depth': [ 10, 20, 30 ] } }
#         }

In [18]:
# Every artefact of this run is written below one output folder on Drive.
_run_dir = '/content/drive/MyDrive/Corruption_Paper_SIAF/output/ejecucion_10'

# One sub-folder per kind of artefact.
models_path  = _run_dir + '/models'
results_path = _run_dir + '/results'
vars_path    = _run_dir + '/vars'
gs_path      = _run_dir + '/gridsearch_results'

In [19]:
# Parallel lists: position i of `x_train_list` goes with position i of
# `y_train_list` (original, SMOTE, SMOTE Tomek, naive random oversampling).
x_train_list = [
    x_train,
    x_train_s,
    x_train_st,
    x_train_nro,
]
y_train_list = [
    y_train,
    y_train_s,
    y_train_st,
    y_train_nro,
]

# Output folders, in the order: models, results, variables, grid-search results.
path_list    = [
    models_path,
    results_path,
    vars_path,
    gs_path,
]

In [20]:
# Short tag passed as the last argument to `fun.test_models_classification`.
# NOTE(review): presumably stands for the 'corrup_intensa' target and is used
# to name the saved outputs -- confirm in `funciones`.
sufix   = 'ci'

In [21]:
# Main models: evaluate every estimator in `models` on each of the four
# training sets against the held-out test split.
resultados = fun.test_models_classification(
    models,
    x_train_list,
    y_train_list,
    x_test,
    y_test,
    path_list,
    sufix,
)

# Save the returned results table as an Excel workbook in the results folder.
resultados.to_excel(
    '/content/drive/MyDrive/Corruption_Paper_SIAF/output/ejecucion_10/results/base0_ci__lr.xlsx'
)

In [22]:
# Regression Forest Model

# resultados_rf = fun.test_regression_forest( models_regression_forest, x_train_list, y_train_list, x_test, y_test, path_list, sufix )
# resultados_rf.to_excel( '/content/drive/MyDrive/Corruption_Paper_SIAF/output/ejecucion_10/results/base0_ci_regression_forest.xlsx' )