In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.chdir('../')

In [3]:
data = pd.read_csv('data/processed/clf/data.csv')

In [4]:
X, y = data.drop('Estado al egreso', axis=1), data['Estado al egreso']

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
models = [
    KNeighborsClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(random_state=1),
    LogisticRegression(),
    SGDClassifier(),
    SVC(),
    GaussianNB(),
    MLPClassifier(),
    DecisionTreeClassifier()
]

In [7]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model in models:
    
    f1_scores = cross_val_score(model, X_scaled, y, scoring='f1', cv=kf)
    roc_auc_scores = cross_val_score(model, X_scaled, y, scoring='roc_auc', cv=kf)
    
    print(f"{model.__class__.__name__} - f1_score: {f1_scores.mean():.2f} ± {f1_scores.std():.2f}, "
          f"roc_auc: {roc_auc_scores.mean():.2f} ± {roc_auc_scores.std():.2f}")

KNeighborsClassifier - f1_score: 0.82 ± 0.04, roc_auc: 0.93 ± 0.02
GradientBoostingClassifier - f1_score: 0.91 ± 0.04, roc_auc: 0.96 ± 0.02
RandomForestClassifier - f1_score: 0.92 ± 0.03, roc_auc: 0.97 ± 0.01
LogisticRegression - f1_score: 0.84 ± 0.03, roc_auc: 0.92 ± 0.02
SGDClassifier - f1_score: 0.80 ± 0.02, roc_auc: 0.88 ± 0.02
SVC - f1_score: 0.88 ± 0.05, roc_auc: 0.94 ± 0.01
GaussianNB - f1_score: 0.70 ± 0.05, roc_auc: 0.92 ± 0.04




MLPClassifier - f1_score: 0.85 ± 0.02, roc_auc: 0.92 ± 0.03
DecisionTreeClassifier - f1_score: 0.89 ± 0.05, roc_auc: 0.87 ± 0.05




### Best Model

We see that RandomForestClassifier has the best performance

In [8]:
data = X.join(y)
data.to_csv('data/processed/clf/data.csv', index=False)