In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [2]:
import pandas as  pd
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
from importlib.machinery import SourceFileLoader

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [28]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [5]:
fun = SourceFileLoader( 'funciones', r'..\..\..\code\modules\funciones.py' ).load_module()
vn  = SourceFileLoader( 'variables_nombres', r'..\..\..\code\modules\variables_nombres.py' ).load_module()

In [6]:
path = r'..\..\..\input\preprocessed_data\base0.csv'
data = pd.read_csv( path )

## 2. Standard Scales

In [7]:
num_vars_total = vn.renamu_varibles_num + vn.siaf_variables + vn.politica_variables
num_vars       = [ var for var in data.columns if var in num_vars_total ] 
cat_vars       = [ var for var in data.columns if var not in num_vars ]

In [9]:
scaler     = StandardScaler()
scaled_arr = scaler.fit_transform( data[ num_vars ] )
scaled_df  = pd.DataFrame( scaled_arr, columns = num_vars )

In [10]:
df_final = pd.concat( [ scaled_df, data[ cat_vars ] ], axis = 1 )

In [13]:
dep_var    = [ 'corrup_intensa' ]
other_vars = [ 'monto_examinado', 'monto_auditado', 'monto_objeto_servicio', 
               'monto_corrup1', 'monto_corrup2', 'tipo_control', 'corrup_amplia',
               'per_corrup1', 'per_corrup2', '_monto', 'monto_', 'year', 'ubigeo' ]

pred_vars  = [ col for col in df_final.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( df_final[ pred_vars ], 
                                                     df_final[ 'corrup_intensa' ], 
                                                     test_size    = 0.3,
                                                     random_state = 2023 )

In [21]:
x_train_s, x_train_st, x_train_nro, y_train_s, y_train_st, y_train_nro = fun.resampling( x_train, y_train )

In [22]:
model = RandomForestClassifier( random_state = 2023, 
                                n_jobs       = -1,
                                max_depth    = 30, 
                                max_features = 5808, 
                                n_estimators = 250 )

model.fit( x_train_nro, y_train_nro )

In [23]:
y_pred_class = model.predict( x_test )
y_pred_prob  = model.predict_proba( x_test )[ :, 1 ]

In [26]:
y_pred_class

array([1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 0., 1., 1.

In [32]:
columns       = [ 'no', 'si' ]
rf_report     = classification_report( y_test, y_pred_class, target_names = columns, output_dict = True )

accuracy_     = accuracy_score( y_test, y_pred_class )
log_loss_     = log_loss( y_test, y_pred_class )
roc_auc_      = roc_auc_score( y_test, y_pred_prob )
f1_score_     = f1_score( y_test, y_pred_class, average = 'macro' )
mcc_          = matthews_corrcoef( y_test, y_pred_class )
f1_score_si   = rf_report[ 'si' ][ 'f1-score' ]
f1_score_no   = rf_report[ 'no' ][ 'f1-score' ]

In [34]:
table = np.zeros( ( 1, 7 ) )

table[ 0 ] = [ accuracy_, log_loss_, roc_auc_, mcc_, f1_score_, f1_score_si, f1_score_no ]

colnames_table = [ "Overall_Accuracy", "Log_Loss", "ROC_AUC", "MCC", "F1 Global", "F1_Si", "F1_No" ]
rownames_table = [ "Optimal Random Forest" ] 

table = pd.DataFrame( table, columns = colnames_table )
table.index = rownames_table

table = table.round( 3 )
table

Unnamed: 0,Overall_Accuracy,Log_Loss,ROC_AUC,MCC,F1 Global,F1_Si,F1_No
Optimal Random Forest,0.759,8.685,0.692,0.315,0.654,0.845,0.462


## 2. Feature selection

In [36]:
select = SelectFromModel( model, threshold = 'median' )
select.fit( x_train_nro, y_train_nro )
x_train_nro_selected = select.transform( x_train_nro )
print( x_train_nro.shape )
print( x_train_nro_selected.shape )

(1366, 14520)
(1366, 7260)


In [47]:
select.get_support()
selected_feat = x_train_nro.columns[ ( select.get_support() ) ]

In [52]:
x_test_selected = x_test[ selected_feat ]

In [43]:
model_2 = RandomForestClassifier( random_state = 2023, 
                                  n_jobs       = -1,
                                  max_depth    = 30, 
                                  max_features = 5808, 
                                  n_estimators = 250 )

model_2.fit( x_train_nro_selected, y_train_nro )

In [53]:
y_pred_class_2 = model_2.predict( x_test_selected )
y_pred_prob_2  = model_2.predict_proba( x_test_selected )[ :, 1 ]

In [54]:
columns       = [ 'no', 'si' ]
rf_report_2   = classification_report( y_test, y_pred_class_2, target_names = columns, output_dict = True )

accuracy_2    = accuracy_score( y_test, y_pred_class_2 )
log_loss_2    = log_loss( y_test, y_pred_class_2 )
roc_auc_2     = roc_auc_score( y_test, y_pred_prob_2 )
f1_score_2    = f1_score( y_test, y_pred_class_2, average = 'macro' )
mcc_2         = matthews_corrcoef( y_test, y_pred_class_2 )
f1_score_si_2 = rf_report_2[ 'si' ][ 'f1-score' ]
f1_score_no_2 = rf_report_2[ 'no' ][ 'f1-score' ]

In [57]:
mcc_2

0.2602960376568211

In [58]:
table = np.zeros( ( 1, 7 ) )

table[ 0 ] = [ accuracy_2, log_loss_2, roc_auc_2, mcc_2, f1_score_2, f1_score_si_2, f1_score_no_2 ]

colnames_table = [ "Overall_Accuracy", "Log_Loss", "ROC_AUC", "MCC", "F1 Global", "F1_Si", "F1_No" ]
rownames_table = [ "Optimal Random Forest" ] 

table = pd.DataFrame( table, columns = colnames_table )
table.index = rownames_table

table = table.round( 3 )
table

Unnamed: 0,Overall_Accuracy,Log_Loss,ROC_AUC,MCC,F1 Global,F1_Si,F1_No
Optimal Random Forest,0.747,9.119,0.693,0.26,0.623,0.839,0.407
