<a href="https://colab.research.google.com/github/solcanalla/fiumark/blob/main/stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Modelo: Stacking

##Inicialización##

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [37]:
!git clone https://github.com/solcanalla/fiumark.git
!git pull origin main
import fiumark.preprocessing as pp

Cloning into 'fiumark'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects:   0% (1/129)[Kremote: Counting objects:   1% (2/129)[Kremote: Counting objects:   2% (3/129)[Kremote: Counting objects:   3% (4/129)[Kremote: Counting objects:   4% (6/129)[Kremote: Counting objects:   5% (7/129)[Kremote: Counting objects:   6% (8/129)[Kremote: Counting objects:   7% (10/129)[Kremote: Counting objects:   8% (11/129)[Kremote: Counting objects:   9% (12/129)[Kremote: Counting objects:  10% (13/129)[Kremote: Counting objects:  11% (15/129)[Kremote: Counting objects:  12% (16/129)[Kremote: Counting objects:  13% (17/129)[Kremote: Counting objects:  14% (19/129)[Kremote: Counting objects:  15% (20/129)[Kremote: Counting objects:  16% (21/129)[Kremote: Counting objects:  17% (22/129)[Kremote: Counting objects:  18% (24/129)[Kremote: Counting objects:  19% (25/129)[Kremote: Counting objects:  20% (26/129)[Kremote: Counting objects:  21% (28/

##Pre procesamiento##

In [38]:
pd.options.display.max_columns = None
df = pp.get_dataset()
df = pp.knn_preprocessing(df)
X_train, X_test, y_train, y_test = pp.get_train_test_data(df)

##Entrenamiento##

In [39]:
from sklearn.ensemble import StackingClassifier

clf_1 = KNeighborsClassifier(weights= 'distance',n_neighbors= 39, metric='minkowski',leaf_size=40,algorithm='auto')
clf_2 = RandomForestClassifier(criterion='entropy',max_depth=6,max_features='log2',n_estimators=60)
clf_3 = XGBClassifier(max_depth=2,min_child_weight=2,n_estimators=60)
stacking_model = StackingClassifier(estimators=[('clf_1', clf_1), ('clf_2', clf_2),('clf_3',clf_3)])

In [40]:
from sklearn.model_selection import GridSearchCV

params = {
    'clf_2__n_estimators': np.arange(50,100,20),
    'clf_2__max_depth': np.arange(1, 8),
    'clf_3__max_depth': np.arange(1, 5),
    'clf_3__n_estimators': np.arange(60,100,20)
}

gscv = GridSearchCV(
    stacking_model, params,scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True
).fit(X_train, y_train)

In [41]:
print(f"Best score: {gscv.best_score_}")
print(f"Best params {gscv.best_params_}")

Best score: 0.8637944275811591
Best params {'clf_2__max_depth': 6, 'clf_2__n_estimators': 50, 'clf_3__max_depth': 2, 'clf_3__n_estimators': 60}


##Evaluación del modelo##

In [42]:
y_pred = gscv.predict(X_test)

##Precision recall y F1-score###

In [43]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.93      0.87        95
         1.0       0.87      0.71      0.78        66

    accuracy                           0.84       161
   macro avg       0.85      0.82      0.83       161
weighted avg       0.84      0.84      0.84       161



###Accuracy###

In [44]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8385093167701864

###Auc-Roc###

In [45]:
from sklearn.metrics import roc_auc_score
auc_roc = roc_auc_score(y_test, y_pred)
auc_roc

0.8192185007974482

##Predicción##

In [46]:
X_holdout = pd.read_csv('https://drive.google.com/uc?export=download&id=1I980-_K9iOucJO26SG5_M8RELOQ5VB6A')
X = pp.knn_preprocessing(X_holdout)
prediction = pd.DataFrame(gscv.predict(X),columns=['volveria'],dtype=int)
pred_formatted = pd.concat([X_holdout.id_usuario,prediction],axis=1)
pred_formatted.to_csv('stacking.csv')
pred_formatted

Unnamed: 0,id_usuario,volveria
0,650,0
1,234,0
2,31,0
3,552,0
4,411,0
...,...,...
85,354,0
86,82,0
87,172,0
88,8,0
