<a href="https://colab.research.google.com/github/solcanalla/fiumark/blob/main/stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modelo: Stacking

## Inicialización

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
!git clone https://github.com/solcanalla/fiumark.git
%cd fiumark
!git pull origin main
import preprocessing as pp

Cloning into 'fiumark'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 98 (delta 50), reused 41 (delta 16), pack-reused 0[K
Unpacking objects: 100% (98/98), done.
/content/fiumark
From https://github.com/solcanalla/fiumark
 * branch            main       -> FETCH_HEAD
Already up to date.


## Preprocesamiento

In [3]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the dataset and run it through the project's KNN preprocessing helper.
df = pp.knn_preprocessing(pp.get_dataset())

# Split into train / test partitions using the project helper.
X_train, X_test, y_train, y_test = pp.get_train_test_data(df)

## Entrenamiento

In [7]:
from sklearn.ensemble import StackingClassifier

# Seed for the stochastic base learners, so that the grid-search results and
# the final predictions are reproducible after Restart & Run All.
SEED = 42

# Base learners of the stack.
clf_1 = KNeighborsClassifier(
    weights='distance',
    n_neighbors=39,
    metric='minkowski',
    leaf_size=40,
    algorithm='auto',
)
clf_2 = RandomForestClassifier(
    criterion='entropy',
    max_depth=6,
    max_features='log2',
    n_estimators=60,
    random_state=SEED,  # bootstrap and feature sampling are random without this
)
clf_3 = XGBClassifier(
    max_depth=2,
    min_child_weight=2,
    n_estimators=60,
    random_state=SEED,  # pinned explicitly rather than relying on the library default
)

# `final_estimator` is not set, so scikit-learn's default meta-learner is used.
# The estimator names ('clf_1', 'clf_2', 'clf_3') are the prefixes used to
# address nested hyper-parameters, e.g. 'clf_2__max_depth'.
stacking_model = StackingClassifier(
    estimators=[('clf_1', clf_1), ('clf_2', clf_2), ('clf_3', clf_3)]
)

In [33]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest (clf_2) and XGBoost (clf_3)
# members of the stack; keys follow the '<estimator name>__<param>' convention.
params = dict(
    clf_2__n_estimators=np.arange(50, 100, 20),
    clf_2__max_depth=np.arange(1, 8),
    clf_3__max_depth=np.arange(1, 5),
    clf_3__n_estimators=np.arange(60, 100, 20),
)

# Exhaustive search over the grid, 5-fold CV, ranked by ROC AUC, using all cores.
gscv = GridSearchCV(
    estimator=stacking_model,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    return_train_score=True,
)
# Trailing semicolon keeps the cell from echoing the fitted estimator.
gscv.fit(X_train, y_train);

In [34]:
# Report the best cross-validated score and the parameters that achieved it.
print(
    f"Best score: {gscv.best_score_}",
    f"Best params {gscv.best_params_}",
    sep="\n",
)

Best score: 0.865090687858854
Best params {'clf_2__max_depth': 6, 'clf_2__n_estimators': 50, 'clf_3__max_depth': 2, 'clf_3__n_estimators': 60}


## Evaluación del modelo

In [35]:
from sklearn.metrics import classification_report

# Predict on the held-out test split with the fitted grid search (GridSearchCV
# refits the best parameter combination by default), then print per-class
# precision / recall / F1.
y_test_pred = gscv.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.94      0.86        70
         1.0       0.89      0.67      0.76        51

    accuracy                           0.83       121
   macro avg       0.84      0.80      0.81       121
weighted avg       0.84      0.83      0.82       121



## Predicción

In [36]:
# Download the holdout set and apply the same preprocessing used for training.
X_holdout = pd.read_csv(
    'https://drive.google.com/uc?export=download&id=1I980-_K9iOucJO26SG5_M8RELOQ5VB6A'
)
X = pp.knn_preprocessing(X_holdout)

# Predicted labels as an integer column named 'volveria'.
prediction = pd.DataFrame(
    gscv.predict(X),
    columns=['volveria'],
    dtype=int,
)

# Pair each prediction with its user id. The concat aligns on the index, so
# this assumes the preprocessing keeps the holdout's row order and row count
# -- TODO confirm against pp.knn_preprocessing.
pred_formatted = pd.concat(
    [X_holdout['id_usuario'], prediction],
    axis='columns',
)

# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if the expected file has only id_usuario and volveria.
pred_formatted.to_csv('stacking.csv')
pred_formatted

Unnamed: 0,id_usuario,volveria
0,650,0
1,234,0
2,31,0
3,552,0
4,411,0
...,...,...
85,354,0
86,82,0
87,172,0
88,8,0
