In [1]:
#save as .pkl model
import joblib
from sklearn.ensemble import GradientBoostingClassifier
#import pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
#import 3 class classification dataset
from sklearn.datasets import load_iris
import pandas as pd
#split the data into train and test
from sklearn.model_selection import train_test_split
#import gridsearch
from sklearn.model_selection import GridSearchCV

#import logistic regression classifier
from sklearn.utils import class_weight
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
#import stratified shuffle split
from sklearn.model_selection import StratifiedShuffleSplit
#import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

from gmlutils.models.classification import Classification
from gmlutils.models_analysis.xai import Xai
from gmlutils.models_analysis.classification_analysis import Canalysis

  from pandas import MultiIndex, Int64Index


In [2]:
# Read final.csv and build the feature matrix X and the target y.
# The frame is built in a single chain without in-place mutation, so every
# re-run of this cell starts again from the file on disk.
df = (
    pd.read_csv('final.csv')
    # Neither column is used as a feature or as the target below.
    # NOTE(review): 'saldo_fr' is presumably the raw value behind the
    # 'saldo_fr_sign' target and would leak it -- confirm.
    .drop(columns=["saldo_fr", "classification_only_nationals"])
    # The target is kept as a categorical; later cells read `.cat.categories`
    # from the resampled target to label the confusion matrices.
    .astype({"saldo_fr_sign": "category"})
    # 'Fecha' becomes the index so it is not fed to the model as a feature.
    .set_index('Fecha')
)
X = df.drop(columns=["saldo_fr_sign"])
y = df["saldo_fr_sign"]

### Split the data into train and test sets, stratified by the target

In [3]:
# One seeded, stratified shuffle split: 20% of the rows go to the test set.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2023)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional indices; take it directly instead of looping.
train_index, test_index = next(split.split(X, y))

# The indices are positional, hence .iloc on both the features and the target.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

### Oversampling the training set (SMOTE)

In [5]:
# Oversample with SMOTE using a fixed seed. Only the training split is
# resampled; X_test / y_test are not touched here.
SMOTE_ = SMOTE(random_state=2023)
# The trailing-underscore names hold the resampled training data used by the
# model-fitting and analysis cells below.
X_train_, y_train_ = SMOTE_.fit_resample(X_train, y_train)

In [6]:
# Instantiate the project's classification helper (imported from gmlutils);
# its RandomForest_Classifier method is called in the next cell.
clf=Classification()

In [8]:
# Hyper-parameter grid for the random forest. Keys carry the 'RFC__' prefix
# that the helper's log message asks for. Only n_estimators has more than one
# value, so the search evaluates two candidates.
rf_grid_params = {
    'RFC__n_estimators': [200, 300],
    'RFC__max_depth': [None],
    'RFC__min_samples_split': [2],
    'RFC__max_features': ['sqrt'],
}

# Grid-search a random forest on the SMOTE-resampled training data.
# NOTE(review): the run log reports `random_state = None`, so this fit is not
# seeded -- consider passing a seed if the helper exposes one (confirm the
# parameter name against the gmlutils signature).
# NOTE(review): the data was resampled before the cross-validation folds are
# formed, so synthetic samples may end up in validation folds and make the
# CV score optimistic -- confirm how the helper builds its folds.
rf_smote = clf.RandomForest_Classifier(X_train_, y_train_, grid_params=rf_grid_params)

 INFO: Agurments params must start as 'RFC__param'
INFO: Default params in Documentation for Random Forest are:  {'n_estimators': 100, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0.0, 'max_samples': None}

INFO: Default params RUN for this model are:  grid_params = {'RFC__n_estimators': [200, 300], 'RFC__max_depth': [None], 'RFC__min_samples_split': [2], 'RFC__max_features': ['sqrt']} scoring = accuracy criterion = gini bayes_n_iter = 30 class_weigth = None bayes_pbounds = None bayes_int_params = None ordinal_cat_cols = None random_state = None n_jobs = -1
Grid search is running
Fitting 10 folds for each of 2 candidates, totalling 20 fits


### Model Analysis

In [9]:
# Bundle the fitted model with the resampled training data and the
# (non-resampled) test split for the evaluation cells below.
analysis=Canalysis(rf_smote, X_train_, y_train_, X_test, y_test)

In [11]:
# Confusion matrix and per-class metrics on the resampled training set, with
# the class order taken from the target's categories.
# This is the same data the model was fitted on, so the scores shown here say
# nothing about generalization; compare with the test report.
analysis.confusion_matrix('train', labels=y_train_.cat.categories)

Confusion Matrix and Statistics
	   Prediction
 Reference   -1    0    1
        -1 8123    0    0
         0    0 8123    0
         1    0    0 8123

 Overall Multiclass Score Using Macro

Accuracy: 1.0
No Information Rate: 0.333
P-Value [Acc > NIR]: 0.0
Kappa: 1.0
Mcnemar's Test P-Value: 1.0
Precision: 1.0
Recall: 1.0
Balanced accuracy: 1.0
F1 Score: 1.0

Individual Class Scores
Class: -1
Recall: 1.0
Specificity: 1.0
Precision: 1.0
F1 Score: 1.0

Class: 0
Recall: 1.0
Specificity: 1.0
Precision: 1.0
F1 Score: 1.0

Class: 1
Recall: 1.0
Specificity: 1.0
Precision: 1.0
F1 Score: 1.0



Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.


In [12]:
# Same report on the held-out test split, which was not resampled.
# Labels again come from the training target's categories so the class order
# matches the training report.
analysis.confusion_matrix('test', labels=y_train_.cat.categories)

Confusion Matrix and Statistics
	   Prediction
 Reference   -1   0    1
        -1 1893 130    8
         0   91 371  113
         1    9  92 1424

 Overall Multiclass Score Using Macro

Accuracy: 0.893
No Information Rate: 0.395
P-Value [Acc > NIR]: 0.0
Kappa: 0.823
Mcnemar's Test P-Value: 0.01
Precision: 0.832
Recall: 0.837
Balanced accuracy: 0.837
F1 Score: 0.835

Individual Class Scores
Class: -1
Recall: 0.932
Specificity: 0.952
Precision: 0.95
F1 Score: 0.941

Class: 0
Recall: 0.645
Specificity: 0.938
Precision: 0.626
F1 Score: 0.635

Class: 1
Recall: 0.934
Specificity: 0.954
Precision: 0.922
F1 Score: 0.928



Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to -1) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.


#### Permutation importance

In [None]:
# Permutation importance through the project's analysis helper.
# NOTE(review): n_repeats=30 is presumably forwarded to scikit-learn's
# permutation_importance (number of shuffles per feature) -- confirm.
# This cell has no execution count in the saved notebook, i.e. it was not run
# in the recorded session.
analysis.permutation_importance(n_repeats=30)

#### SHAP IMPORTANCE

In [13]:
# Instantiate the project's explainability helper (imported from gmlutils).
xai=Xai()

In [14]:
# SHAP feature importance for the fitted model, computed on the
# SMOTE-resampled training data (not on the test split).
xai.shap_feature_importance(rf_smote,X_train_,y_train_)