In [1]:
import os

os.chdir('../')

import wandb

import pandas as pd
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
# Read the dataset; the path is relative to the parent directory that the
# first cell switched into with os.chdir('../').
data = pd.read_csv(filepath_or_buffer='data/selected/data.csv')

In [3]:
# Target vector: the 'estado al egreso' column.
y = data['estado al egreso']
# Feature matrix: every remaining column.
X = data.drop(columns='estado al egreso')

### Model Selection

In [4]:
# Accumulates one summary row (a dict) per evaluated model.
results = []

# Candidate classifiers, all with default hyperparameters.
# Every estimator whose fitting involves randomness receives the same fixed
# seed, so the comparison below gives identical numbers after a kernel
# restart. The remaining estimators are left unseeded because, with the
# default settings used here, their fit does not draw random numbers.
models = [
    KNeighborsClassifier(),
    GradientBoostingClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    LogisticRegression(),
    SGDClassifier(random_state=1),
    SVC(),
    GaussianNB(),
    MLPClassifier(random_state=1),
    DecisionTreeClassifier(random_state=1)
]

In [5]:
from sklearn.metrics import make_scorer, fbeta_score

# Scorer wrapping the F-beta measure with beta=2 (recall is weighted more
# heavily than precision), computed for the class labelled 1.
f2_scorer = make_scorer(
    score_func=fbeta_score,
    beta=2,
    pos_label=1,
)

In [6]:
from sklearn.model_selection import cross_validate

# Metrics computed on every fold. The keys become the column prefixes of the
# results table; the values are scorer names or scorer objects.
scoring = {
    'recall': 'recall',
    'f1': 'f1',
    'f2': f2_scorer,
    'precision': 'precision',
    'roc_auc': 'roc_auc',
}

# One seeded splitter shared by all models, so each is scored on the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Start from an empty list so re-running this cell does not append duplicate rows.
results = []

for clf in models:

    model_name = clf.__class__.__name__

    # Scaling lives inside the pipeline so it is re-fitted on each training fold.
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('estimator', clf)
    ])

    # A single cross-validation run per model: every metric is computed from
    # the same fitted estimator on each fold, instead of refitting the
    # pipeline once per metric.
    cv_scores = cross_validate(pipe, X, y, scoring=scoring, cv=skf)
    metrics = {metric: cv_scores[f'test_{metric}'] for metric in scoring}

    # One row per model: all fold means first, then all fold standard deviations.
    results.append({
        "Model": model_name,
        **{f"{metric.capitalize()} (mean)": f"{scores.mean():.3f}" for metric, scores in metrics.items()},
        **{f"{metric.capitalize()} (std)": f"{scores.std():.3f}" for metric, scores in metrics.items()}
    })

df_results = pd.DataFrame(results)
df_results.to_markdown('outputs/model_comparision.md')



|    | Model                      |   Recall (mean) |   F1 (mean) |   F2 (mean) |   Precision (mean) |   Roc_auc (mean) |   Recall (std) |   F1 (std) |   F2 (std) |   Precision (std) |   Roc_auc (std) |
|---:|:---------------------------|----------------:|------------:|------------:|-------------------:|-----------------:|---------------:|-----------:|-----------:|------------------:|----------------:|
|  0 | KNeighborsClassifier       |            0.82 |       0.793 |       0.806 |              0.797 |            0.973 |          0.157 |      0.092 |      0.127 |             0.129 |           0.021 |
|  1 | GradientBoostingClassifier |            0.82 |       0.788 |       0.845 |              0.841 |            0.964 |          0.223 |      0.095 |      0.115 |             0.158 |           0.021 |
|  2 | RandomForestClassifier     |            0.82 |       0.788 |       0.798 |              0.841 |            0.994 |          0.223 |      0.095 |      0.165 |             0.158 |           0.012 |
|  3 | LogisticRegression         |            0.77 |       0.776 |       0.767 |              0.847 |            0.992 |          0.204 |      0.105 |      0.163 |             0.148 |           0.012 |
|  4 | SGDClassifier              |            0.85 |       0.753 |       0.829 |              0.774 |            0.977 |          0.2   |      0.13  |      0.159 |             0.147 |           0.018 |
|  5 | SVC                        |            0.87 |       0.826 |       0.845 |              0.841 |            0.984 |          0.166 |      0.074 |      0.115 |             0.158 |           0.01  |
|  6 | GaussianNB                 |            0.86 |       0.816 |       0.834 |              0.841 |            0.983 |          0.196 |      0.099 |      0.148 |             0.158 |           0.022 |
|  7 | MLPClassifier              |            0.77 |       0.763 |       0.761 |              0.827 |            0.983 |          0.204 |      0.121 |      0.166 |             0.183 |           0.017 |
|  8 | DecisionTreeClassifier     |            0.79 |       0.725 |       0.751 |              0.763 |            0.868 |          0.22  |      0.101 |      0.154 |             0.224 |           0.045 |

In [7]:
from sklearn.model_selection import GridSearchCV

# Seeded, stratified 5-fold splitter used for every grid candidate.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# SGD-trained linear classifier with logistic loss, preceded by standardization.
sgd = SGDClassifier(loss='log_loss', random_state=1)
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('estimator', sgd)])

# The `estimator__` prefix routes each parameter to the pipeline's
# 'estimator' step.
param_grid = {
    'estimator__penalty': ['l2', 'l1', 'elasticnet'],
    'estimator__alpha': [1e-4, 1e-3, 1e-2],
    'estimator__max_iter': [1000, 2000],
    'estimator__tol': [1e-3, 1e-4],
    'estimator__class_weight': [None, 'balanced'],
}

# Exhaustive search over the grid, ranking candidates by the F2 scorer.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=skf,
)
grid.fit(X, y)

In [8]:
# Mean cross-validated score (F2, the scorer given to the search) of the
# winning parameter combination.
print(grid.best_score_)

# Pipeline refitted by the search with the winning parameters.
best_model = grid.best_estimator_

0.9471731167383342


### Save model

In [9]:
import pickle as pkl

# Serialize the tuned pipeline to disk.
# NOTE: pickle files can execute arbitrary code when loaded, so only load
# this artifact from a trusted location.
with open('models/sgdc.pkl', mode='wb') as model_file:
    pkl.dump(best_model, model_file)