In [1]:
import os

os.chdir('../')

import wandb

import pandas as pd
import pickle as pkl

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_validate

In [2]:
# Read the dataset from data/selected/data.csv. The path is relative to the
# working directory, which the first cell moved one level up with os.chdir('../').
data = pd.read_csv('data/selected/data.csv')

In [3]:
# Separate the target column from the remaining columns, which serve as features.
target_col = 'estado al egreso'
X = data.drop(columns=[target_col])
y = data[target_col]

### Model Selection

In [4]:
# Accumulator holding one summary row (dict) per evaluated model; the
# evaluation loop appends to it.
results = []

# Candidate classifiers to compare. Their order here is the row order of the
# resulting comparison table.
models = [
    KNeighborsClassifier(),
    LogisticRegression(),
    SGDClassifier(random_state=1),  # seeded so repeated runs give the same fit
    SVC(kernel='rbf'),
]

In [5]:
from sklearn.metrics import make_scorer, fbeta_score

# F-beta scorer with beta=2, i.e. recall is weighted more heavily than
# precision. Used as the model-selection metric in the grid searches below.
# NOTE(review): pos_label=1 assumes the class of interest is encoded as 1 in
# the target column — confirm against the data.
f2_scorer = make_scorer(fbeta_score, beta=2, pos_label=1)

In [6]:
# Compare every candidate classifier under the same 5-fold stratified CV and
# write the summary table to a markdown report.

# Start from an empty list so re-running this cell does not append duplicate
# rows to whatever a previous run left in `results`.
results = []

# Metric name -> scorer. cross_validate fits the pipeline once per fold and
# evaluates every scorer on that single fit, instead of refitting the model
# once per metric as separate cross_val_score calls would.
scoring = {
    'recall': 'recall',
    'f1': 'f1',
    'f2': f2_scorer,
    'precision': 'precision',
    'roc_auc': 'roc_auc',
}

# Loop-invariant: with a fixed seed the splitter yields the same folds for
# every model, so it only needs to be built once.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for clf in models:

    model_name = clf.__class__.__name__

    # The scaler is part of the pipeline, so it is fit on each training fold
    # only and never sees the held-out fold.
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('estimator', clf)
    ])

    cv_scores = cross_validate(pipe, X, y, scoring=scoring, cv=skf)

    # Per-fold test scores keyed by metric name, in the order declared above.
    metrics = {metric: cv_scores[f"test_{metric}"] for metric in scoring}

    # Values are stored as 4-decimal strings, matching the existing report format.
    results.append({
        "Model": model_name,
        **{f"{metric.capitalize()} (mean)": f"{scores.mean():.4f}" for metric, scores in metrics.items()},
        **{f"{metric.capitalize()} (std)": f"{scores.std():.4f}" for metric, scores in metrics.items()}
    })

df_results = pd.DataFrame(results)

# Create the report directory if needed so the write cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
# The file name is kept exactly as-is because a later cell reads this same path.
df_results.to_markdown('outputs/model_comparision.md')

In [7]:
from IPython.display import Markdown, display

# Read the markdown report back from disk and render it as rich output.
report_path = "outputs/model_comparision.md"
with open(report_path, "r", encoding="utf-8") as report_file:
    content = report_file.read()

display(Markdown(content))

|    | Model                |   Recall (mean) |   F1 (mean) |   F2 (mean) |   Precision (mean) |   Roc_auc (mean) |   Recall (std) |   F1 (std) |   F2 (std) |   Precision (std) |   Roc_auc (std) |
|---:|:---------------------|----------------:|------------:|------------:|-------------------:|-----------------:|---------------:|-----------:|-----------:|------------------:|----------------:|
|  0 | KNeighborsClassifier |            0.96 |      0.8974 |      0.9313 |             0.86   |           0.9657 |         0.08   |     0.0637 |     0.0569 |            0.1272 |          0.0359 |
|  1 | LogisticRegression   |            0.82 |      0.8029 |      0.8061 |             0.86   |           0.9946 |         0.2227 |     0.0895 |     0.1689 |            0.1272 |          0.0067 |
|  2 | SGDClassifier        |            0.83 |      0.8192 |      0.8206 |             0.8533 |           0.9946 |         0.1536 |     0.0486 |     0.1048 |            0.1293 |          0.0067 |
|  3 | SVC                  |            0.91 |      0.8688 |      0.8892 |             0.86   |           0.9837 |         0.1114 |     0.0382 |     0.0674 |            0.1272 |          0.0201 |

### KNN

In [8]:
# Tune a scaled KNN classifier by grid search, selecting on mean F2.
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', knn)
])

# Only distinct models are searched:
#   * `p` is used only when metric='minkowski'; 'manhattan' and 'euclidean'
#     are the p=1 and p=2 cases, so crossing every metric with every `p`
#     just refits the same models several times.
#   * `leaf_size` affects tree construction/query speed and memory, not the
#     predictions, so it is left at its default rather than searched.
# The sub-grids are ordered euclidean -> manhattan -> minkowski(p=3) so that
# ties are still broken in favour of the same candidate as a full cross product.
params_grid = [
    {
        'estimator__n_neighbors': [3, 5, 7, 9],
        'estimator__metric': ['euclidean', 'manhattan'],
    },
    {
        'estimator__n_neighbors': [3, 5, 7, 9],
        'estimator__metric': ['minkowski'],
        'estimator__p': [3],
    },
]

# Same seeded stratified folds as used for the model comparison.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid = GridSearchCV(pipe, params_grid, scoring=f2_scorer, cv=skf)
grid.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
# Keep the pipeline that GridSearchCV refit with the best-scoring parameters,
# and print its mean cross-validated score (F2, the search's scoring metric).
best_model = grid.best_estimator_
print(grid.best_score_)

0.9312687312687313


In [10]:
# Persist the selected pipeline to models/knn.pkl.
# `best_model` is whatever the most recently executed selection cell assigned,
# so run this right after the KNN grid search to avoid saving another model.
# Create the target directory first so a fresh checkout does not fail here.
os.makedirs('models', exist_ok=True)
with open('models/knn.pkl', 'wb') as file:
    pkl.dump(best_model, file)

### SVC

In [19]:
# Tune a scaled RBF-kernel SVC by grid search, selecting on mean F2.
# random_state seeds the internal data shuffling used to fit the probability
# estimates (probability=True below); without it the saved model's
# predict_proba output differs from run to run.
svc = SVC(random_state=1)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', svc)
])

# 6 * 7 * 3 * 4 * 2 = 1008 candidates, each fit on 5 folds.
# NOTE(review): finite max_iter values stop the solver before convergence, so
# those candidates are truncated fits rather than genuinely different models;
# consider keeping only -1 (no limit) — confirm intent before changing.
params_grid = {
    'estimator__C': [0.1, 1, 10, 50, 100, 200],
    'estimator__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
    'estimator__kernel': ['rbf'],
    'estimator__probability': [True],
    'estimator__tol': [1e-4, 1e-3, 1e-2],
    'estimator__max_iter': [-1, 100, 200, 500],
    'estimator__class_weight': [None, 'balanced'],
}

# Same seeded stratified folds as used for the model comparison.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid = GridSearchCV(pipe, params_grid, scoring=f2_scorer, cv=skf)
grid.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [20]:
# Keep the pipeline that GridSearchCV refit with the best-scoring parameters,
# and print its mean cross-validated score (F2, the search's scoring metric).
best_model = grid.best_estimator_
print(grid.best_score_)

0.9495744191396366


In [21]:
# Persist the selected pipeline to models/svc.pkl.
# `best_model` is whatever the most recently executed selection cell assigned,
# so run this right after the SVC grid search to avoid saving another model.
# Create the target directory first so a fresh checkout does not fail here.
os.makedirs('models', exist_ok=True)
with open('models/svc.pkl', 'wb') as file:
    pkl.dump(best_model, file)