In [1]:
import os

os.chdir('../')

import wandb

import pandas as pd
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/ernestodavidserizeportela/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mernestoserize[0m ([33mernestoserize-constructor-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [14]:
data = pd.read_csv('data/selected/data.csv')

In [15]:
X, y = data.drop('estado al egreso', axis=1), data['estado al egreso']

In [19]:
results = []

models = [
    KNeighborsClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(random_state=1),
    LogisticRegression(),
    SGDClassifier(),
    SVC(),
    GaussianNB(),
    MLPClassifier(),
    DecisionTreeClassifier()
]

In [20]:
for clf in models:

    model_name = clf.__class__.__name__

    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('estimator', clf)
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True)

    metrics = {
        'recall': cross_val_score(pipe, X, y, scoring='recall', cv=skf),
        'f1': cross_val_score(pipe, X, y, scoring='f1', cv=skf),
        'precision': cross_val_score(pipe, X, y, scoring='precision', cv=skf),
        'roc_auc': cross_val_score(pipe, X, y, scoring='roc_auc', cv=skf),
        'accuracy': cross_val_score(pipe, X, y, scoring='accuracy', cv=skf)
    }

    results.append({
        "Model": model_name,
        **{f"{metric.capitalize()} (mean)": f"{scores.mean():.3f}" for metric, scores in metrics.items()},
        **{f"{metric.capitalize()} (std)": f"{scores.std():.3f}" for metric, scores in metrics.items()}
    })

wandb.init(project='leuko-ml', name='models')

df_results = pd.DataFrame(results)
wandb.log({
    "Model Comparison": wandb.Table(dataframe=df_results),
})

wandb.finish()



In [21]:
df_results.to_markdown('outputs/model_comparision.md', index=False)

| Model                      |   Recall (mean) |   F1 (mean) |   Precision (mean) |   Roc_auc (mean) |   Accuracy (mean) |   Recall (std) |   F1 (std) |   Precision (std) |   Roc_auc (std) |   Accuracy (std) |
|:---------------------------|----------------:|------------:|-------------------:|-----------------:|------------------:|---------------:|-----------:|------------------:|----------------:|-----------------:|
| KNeighborsClassifier       |            0.95 |       0.88  |              0.831 |            0.976 |             0.952 |          0.1   |      0.075 |             0.162 |           0.039 |            0.03  |
| GradientBoostingClassifier |            0.78 |       0.779 |              0.831 |            0.974 |             0.914 |          0.22  |      0.149 |             0.162 |           0.032 |            0.036 |
| RandomForestClassifier     |            0.79 |       0.771 |              0.85  |            0.99  |             0.914 |          0.31  |      0.189 |             0.133 |           0.009 |            0.036 |
| LogisticRegression         |            0.81 |       0.788 |              0.867 |            0.986 |             0.895 |          0.097 |      0.054 |             0.163 |           0.012 |            0.056 |
| SGDClassifier              |            0.59 |       0.813 |              0.867 |            0.982 |             0.933 |          0.222 |      0.07  |             0.163 |           0.022 |            0.038 |
| SVC                        |            0.91 |       0.859 |              0.848 |            0.975 |             0.924 |          0.111 |      0.093 |             0.189 |           0.035 |            0.038 |
| GaussianNB                 |            0.9  |       0.873 |              0.841 |            0.981 |             0.943 |          0.122 |      0.117 |             0.158 |           0.011 |            0.056 |
| MLPClassifier              |            0.91 |       0.848 |              0.82  |            0.993 |             0.933 |          0.111 |      0.055 |             0.107 |           0.015 |            0.065 |
| DecisionTreeClassifier     |            0.78 |       0.767 |              0.867 |            0.855 |             0.876 |          0.22  |      0.186 |             0.163 |           0.022 |            0.049 |