In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
data = pd.read_csv('../data/processed/clf/data.csv')

In [10]:
X, y = data.drop('Estado al egreso', axis=1), data['Estado al egreso']

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Model Selection

In [12]:
models = [
    KNeighborsClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(random_state=1),
    LogisticRegression(),
    SGDClassifier(),
    SVC(),
    GaussianNB(),
    MLPClassifier(max_iter=1000),
    DecisionTreeClassifier()
]

In [13]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model in models:
    
    recall_scores = cross_val_score(model, X_scaled, y, scoring='recall', cv=kf)
    precision_scores = cross_val_score(model, X_scaled, y, scoring='precision', cv=kf)
    f1_scores = cross_val_score(model, X_scaled, y, scoring='f1', cv=kf)
    roc_auc_scores = cross_val_score(model, X_scaled, y, scoring='roc_auc', cv=kf)
    
    print(f"{model.__class__.__name__} - f1_score: {f1_scores.mean():.2f} ± {f1_scores.std():.2f}, "
          f"roc_auc: {roc_auc_scores.mean():.2f} ± {roc_auc_scores.std():.2f}, "
          f"recall: {recall_scores.mean():.2f} +- {recall_scores.std():.2f}, "
          f"precision: {precision_scores.mean():.2f} +- {precision_scores.std():.2f}, "
        )

KNeighborsClassifier - f1_score: 0.85 ± 0.08, roc_auc: 0.92 ± 0.05, recall: 0.95 +- 0.08, precision: 0.79 +- 0.11, 
GradientBoostingClassifier - f1_score: 0.91 ± 0.04, roc_auc: 0.95 ± 0.04, recall: 0.94 +- 0.06, precision: 0.90 +- 0.10, 
RandomForestClassifier - f1_score: 0.93 ± 0.04, roc_auc: 0.97 ± 0.03, recall: 0.94 +- 0.04, precision: 0.92 +- 0.08, 
LogisticRegression - f1_score: 0.85 ± 0.08, roc_auc: 0.92 ± 0.06, recall: 0.87 +- 0.01, precision: 0.84 +- 0.14, 
SGDClassifier - f1_score: 0.78 ± 0.12, roc_auc: 0.88 ± 0.05, recall: 0.75 +- 0.20, precision: 0.79 +- 0.10, 
SVC - f1_score: 0.86 ± 0.08, roc_auc: 0.94 ± 0.03, recall: 0.90 +- 0.06, precision: 0.84 +- 0.12, 
GaussianNB - f1_score: 0.75 ± 0.05, roc_auc: 0.94 ± 0.02, recall: 1.00 +- 0.00, precision: 0.60 +- 0.07, 
MLPClassifier - f1_score: 0.92 ± 0.06, roc_auc: 0.96 ± 0.05, recall: 0.95 +- 0.05, precision: 0.88 +- 0.12, 
DecisionTreeClassifier - f1_score: 0.87 ± 0.01, roc_auc: 0.87 ± 0.04, recall: 0.87 +- 0.10, precision: 0.87