In [1]:
import os

# Step one directory up so the relative paths used later in this notebook
# ('artifacts/...', 'outputs/...', 'models/...') resolve from the parent folder.
# NOTE(review): not idempotent — re-running this cell climbs one more level each time.
os.chdir('../')

import wandb

import pandas as pd
import pickle as pkl

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, GridSearchCV

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised while fitting the models below — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Log in to Weights & Biases before any run, sweep or artifact call below.
wandb.login()

### Download selected data

In [None]:
# Fetch the selected dataset artifact from W&B. The context manager finishes the
# run when the block exits (also on failure), so it does not stay open and leak
# into the next `wandb.init` call.
# NOTE(review): 'leuko-ml' is passed as the run *name* and no project is set —
# confirm whether `project='leuko-ml'` was intended.
with wandb.init(
    name='leuko-ml',
    job_type='download data',
    config={'dataset': 'acute-stroke-selected:v2'}
) as run:
    artifact = run.use_artifact(
        'ernestoserize-constructor-university/leuko-ml/acute-stroke-selected:v2',
        type='dataset'
    )
    # `download()` returns the local directory that now holds the artifact files.
    artifact_dir = artifact.download()


In [2]:
# Load the dataset CSV from the local artifact folder.
# NOTE(review): the path is hard-coded rather than built from `artifact_dir`
# returned by the download cell — presumably so this cell works without re-downloading; confirm.
data = pd.read_csv('artifacts/acute-stroke-selected:v2/new_data.csv')

In [3]:
# Target column on one side, every remaining column as predictors on the other.
y = data['discharge_status']
X = data.drop(columns='discharge_status')

### Model Selection

In [8]:
# One summary row per evaluated model is appended here by the comparison loop.
results = list()

# Candidate classifiers: library defaults, with a fixed seed where one is passed,
# and two SVC variants that differ only by kernel.
models = [
    KNeighborsClassifier(),
    LogisticRegression(random_state=1),
    SGDClassifier(random_state=1),
    *(SVC(kernel=kernel) for kernel in ('rbf', 'sigmoid')),
]

### F2 Score
We use the F2-score as the primary evaluation metric to emphasize recall over precision, which is crucial in our imbalanced dataset where identifying the positive class (label 1) correctly is more important than minimizing false positives.

In [5]:
from sklearn.metrics import make_scorer, fbeta_score

# F-beta scorer with beta=2, computed for the positive class (label 1).
# Reused by the model comparison loop and by both GridSearchCV cells.
f2_scorer = make_scorer(fbeta_score, beta=2, pos_label=1)

In [None]:
def _model_label(clf):
    """Return the run/table label for `clf`, adding the kernel for SVC variants.

    The candidate list holds two `SVC` instances that differ only by kernel, so
    the bare class name would yield two indistinguishable "SVC" entries.
    """
    label = clf.__class__.__name__
    if isinstance(clf, SVC):
        label = f"{label} ({clf.kernel})"
    return label


# Scorers evaluated on every fold; the keys become the metric names used in
# W&B and in the results table (insertion order defines the column order).
scoring = {
    'recall': 'recall',
    'f1': 'f1',
    'f2': f2_scorer,
    'precision': 'precision',
    'roc_auc': 'roc_auc',
}

# One splitter with a fixed seed, so every model is compared on identical folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Start from an empty list so re-running this cell does not duplicate rows.
results = []

for clf in models:

    model_name = _model_label(clf)

    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('estimator', clf)
    ])

    # The context manager finishes each run even when cross-validation raises.
    with wandb.init(
        project="leuko-ml",
        name=model_name,
        group='model_comparison',
        config={
            "model_name": model_name,
            "scaler": "StandardScaler",
            "cv": 'StratifiedKFold(n_splits=5, shuffle=True, random_state=1)',
        },
        reinit=True
    ) as model_run:

        # A single fit per fold scores all metrics, instead of refitting the
        # pipeline once per metric.
        cv_results = cross_validate(pipe, X, y, scoring=scoring, cv=skf)
        metrics = {metric: cv_results[f"test_{metric}"] for metric in scoring}

        # Log all statistics in one call so they share the same W&B step.
        summary = {}
        for metric, scores in metrics.items():
            summary[f"{metric}_mean"] = scores.mean()
            summary[f"{metric}_std"] = scores.std()
        model_run.log(summary)

    results.append({
        "Model": model_name,
        **{f"{metric.capitalize()} (mean)": f"{scores.mean():.4f}" for metric, scores in metrics.items()},
        **{f"{metric.capitalize()} (std)": f"{scores.std():.4f}" for metric, scores in metrics.items()}
    })

df_results = pd.DataFrame(results)

# The output folder may not exist yet on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
df_results.to_markdown('outputs/model_comparision.md')

In [10]:
from IPython.display import Markdown, display

# Read the saved comparison table back and render it as Markdown in the notebook.
with open("outputs/model_comparision.md", mode="r", encoding="utf-8") as md_file:
    content = md_file.read()

display(Markdown(content))

|    | Model                |   Recall (mean) |   F1 (mean) |   F2 (mean) |   Precision (mean) |   Roc_auc (mean) |   Recall (std) |   F1 (std) |   F2 (std) |   Precision (std) |   Roc_auc (std) |
|---:|:---------------------|----------------:|------------:|------------:|-------------------:|-----------------:|---------------:|-----------:|-----------:|------------------:|----------------:|
|  0 | KNeighborsClassifier |            0.96 |      0.8974 |      0.9313 |             0.86   |           0.9657 |         0.08   |     0.0637 |     0.0569 |            0.1272 |          0.0359 |
|  1 | LogisticRegression   |            0.82 |      0.8029 |      0.8061 |             0.86   |           0.9946 |         0.2227 |     0.0895 |     0.1689 |            0.1272 |          0.0067 |
|  2 | SGDClassifier        |            0.83 |      0.8192 |      0.8206 |             0.8533 |           0.9946 |         0.1536 |     0.0486 |     0.1048 |            0.1293 |          0.0067 |
|  3 | SVC                  |            0.91 |      0.8688 |      0.8892 |             0.86   |           0.9837 |         0.1114 |     0.0382 |     0.0674 |            0.1272 |          0.0201 |
|  4 | SVC                  |            0.95 |      0.8613 |      0.9075 |             0.8171 |           0.9699 |         0.1    |     0.088  |     0.0725 |            0.1662 |          0.0378 |

### Analysis

SVC with sigmoid kernel and KNeighborsClassifier demonstrate the highest F2-scores (0.9075 and 0.9313, respectively) and strong recall values (0.95 and 0.96), indicating their potential for optimized performance. Their relatively low standard deviations also suggest consistent results across folds, making them suitable candidates for hyperparameter tuning.

### KNN

A W&B sweep tracks the metrics for each hyperparameter combination.

In [15]:
def sweep_knn(config=None):
    """Run one sweep trial: cross-validate a scaled KNN and log its F2 statistics.

    Args:
        config: Forwarded to `wandb.init`; the hyperparameters for the trial are
            then read back from `wandb.config`.

    Logs `f2_score` (mean over folds) and `f2_std` to the active W&B run.
    Uses the notebook-level `X` and `y`.
    """
    with wandb.init(config=config):
        params = wandb.config

        # Pull exactly the hyperparameters the sweep provides for the estimator.
        knn_kwargs = {
            name: getattr(params, name)
            for name in ('n_neighbors', 'metric', 'p', 'leaf_size')
        }
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('estimator', KNeighborsClassifier(**knn_kwargs)),
        ])

        scores = cross_val_score(
            pipeline,
            X,
            y,
            scoring=make_scorer(fbeta_score, beta=2, pos_label=1),
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
        )

        wandb.log({'f2_score': scores.mean(), 'f2_std': scores.std()})

In [None]:
# Grid explored by the W&B agent; every combination triggers one `sweep_knn` run.
sweep_config = {
    'method': 'grid',
    'metric': {
        'name': 'f2_score',
        'goal': 'maximize'
    },
    'parameters': {
        'n_neighbors': {'values': [3, 5, 7, 9]},
        # `p` is the power of the Minkowski metric only; p=1 and p=2 already are
        # the Manhattan and Euclidean distances. Listing those metrics next to
        # `p` tripled the grid with duplicate models and allowed misleading
        # "best" configurations such as euclidean + p=1.
        'metric': {'values': ['minkowski']},
        'p': {'values': [1, 2, 3]},
        # NOTE(review): scikit-learn documents leaf_size as a tree speed/memory
        # setting — consider fixing it to a single value.
        'leaf_size': {'values': [20, 30, 40, 50]}
    }
}

sweep_id = wandb.sweep(sweep_config, project='leuko-ml')
wandb.agent(sweep_id, function=sweep_knn)

In [7]:
# Exhaustive search over the KNN hyperparameters, scored with F2 on stratified folds.
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', knn)
])

params_grid = {
    'estimator__n_neighbors': [3, 5, 7, 9],
    # `p` is the power of the Minkowski metric only; p=1 and p=2 already are the
    # Manhattan and Euclidean distances. Crossing 'euclidean'/'manhattan' with `p`
    # refit duplicate models and could report a "best" config like euclidean + p=1.
    'estimator__metric': ['minkowski'],
    'estimator__p': [1, 2, 3],
    # NOTE(review): scikit-learn documents leaf_size as a tree speed/memory
    # setting — consider fixing it to a single value.
    'estimator__leaf_size': [20, 30, 40, 50],
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid = GridSearchCV(pipe, params_grid, scoring=f2_scorer, cv=skf)
grid.fit(X, y)

In [8]:
# Keep the best pipeline found by the search and show its mean cross-validated F2.
# NOTE: `grid` is also assigned by the SVC search cell, so this reads whichever
# search was fitted last in the kernel.
best_model = grid.best_estimator_
print(grid.best_score_)

0.9312687312687313


In [9]:
# Persist the selected pipeline; `best_model` is whatever the preceding cell assigned.
# The target folder may not exist yet on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/knn.pkl', 'wb') as file:
    pkl.dump(best_model, file)

### SVC sigmoid kernel

##### Track metrics for each hyper-parameter combination

In [10]:
def sweep_svc(config=None):
    """Run one sweep trial: cross-validate a scaled SVC and log its F2 statistics.

    Args:
        config: Forwarded to `wandb.init`; the hyperparameters for the trial are
            then read back from `wandb.config`.

    Logs `f2_score` (mean over folds) and `f2_std` to the active W&B run.
    Uses the notebook-level `X` and `y`.
    """
    with wandb.init(config=config):
        params = wandb.config

        # Pull exactly the hyperparameters the sweep provides for the estimator.
        svc_kwargs = {
            name: getattr(params, name)
            for name in ('C', 'gamma', 'kernel', 'probability', 'tol', 'class_weight')
        }
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('estimator', SVC(**svc_kwargs)),
        ])

        scores = cross_val_score(
            pipeline,
            X,
            y,
            scoring=make_scorer(fbeta_score, beta=2, pos_label=1),
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
        )

        wandb.log({'f2_score': scores.mean(), 'f2_std': scores.std()})

In [None]:
# Grid explored by the W&B agent; every combination triggers one `sweep_svc` run.
# NOTE(review): the GridSearchCV cell for SVC also tries the 'poly' kernel, which
# this sweep does not — confirm which of the two search spaces is intended.
sweep_config = dict(
    method='grid',
    metric=dict(name='f2_score', goal='maximize'),
    parameters=dict(
        C=dict(values=[0.1, 0.2, 0.3, 0.4, 0.5]),
        gamma=dict(values=['scale', 'auto', 0.001, 0.01, 0.1, 1, 10]),
        kernel=dict(values=['rbf', 'sigmoid']),
        probability=dict(values=[True]),
        tol=dict(values=[1e-4, 1e-3, 1e-2]),
        class_weight=dict(values=[None, 'balanced']),
    ),
)

sweep_id = wandb.sweep(sweep_config, project='leuko-ml')
wandb.agent(sweep_id, function=sweep_svc)

In [15]:
# Exhaustive search over the SVC hyperparameters, scored with F2 on stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

svc = SVC()
pipe = Pipeline([('scaler', StandardScaler()), ('estimator', svc)])

# The `estimator__` prefix routes each entry to the pipeline's 'estimator' step.
params_grid = dict(
    estimator__C=[0.1, 0.2, 0.3, 0.4, 0.5],
    estimator__gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
    estimator__kernel=['rbf', 'sigmoid', 'poly'],
    estimator__probability=[True],
    estimator__tol=[1e-4, 1e-3, 1e-2],
    estimator__class_weight=[None, 'balanced'],
)

grid = GridSearchCV(estimator=pipe, param_grid=params_grid, scoring=f2_scorer, cv=skf)
grid.fit(X, y)

In [11]:
# Keep the best pipeline found by the search and show its mean cross-validated F2.
# NOTE: `grid` is also assigned by the KNN search cell, so this reads whichever
# search was fitted last in the kernel.
best_model = grid.best_estimator_
print(grid.best_score_)

0.9566969262621436


In [12]:
# Persist the selected pipeline; `best_model` is whatever the preceding cell assigned.
# The target folder may not exist yet on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/svc.pkl', 'wb') as file:
    pkl.dump(best_model, file)

### Conclusions

After hyperparameter tuning, the SVC surpasses KNeighborsClassifier on the F2-score (0.9567 vs. 0.9313), indicating a better recall-weighted trade-off between precision and recall. Given this improvement, the SVC will be selected for further analysis.

### W&B model artifacts

In [None]:
# Publish the pickled KNN pipeline as a W&B model artifact. The context manager
# finishes the run when the block exits, including when adding or logging the
# file raises, so no run is left open.
with wandb.init(
    project='leuko-ml',
    name='knn-tuned',
) as run:

    artifact = wandb.Artifact(
        name='knn',
        type='model',
        # NOTE(review): these values are hard-coded rather than read from the
        # fitted search, so they can drift from what 'models/knn.pkl' contains.
        # `p` is the Minkowski power parameter, so 'p': 1 next to 'euclidean'
        # is likely not meaningful — confirm.
        metadata={
            'model': 'KNeighborsClassifier',
            'scaler': 'StandardScaler',
            'best_f2_score': 0.9312687312687313,
            'leaf_size': 20,
            'metric': 'euclidean',
            'p': 1,
            'n_neighbors': 5,
        },
    )

    artifact.add_file('models/knn.pkl')

    run.log_artifact(artifact)

In [None]:
# Publish the pickled SVC pipeline as a W&B model artifact. The context manager
# finishes the run when the block exits, including when adding or logging the
# file raises, so no run is left open.
with wandb.init(
    project='leuko-ml',
    name='svc-tuned',
) as run:

    artifact = wandb.Artifact(
        name='svc',
        type='model',
        # NOTE(review): these values are hard-coded rather than read from the
        # fitted search, so they can drift from what 'models/svc.pkl' contains.
        # `gamma` and `class_weight` were part of the search grid but are not
        # recorded here — TODO add them from the search's best parameters.
        metadata={
            'model': 'SVC',
            'scaler': 'StandardScaler',
            'best_f2_score': 0.9566969262621436,
            'C': 0.4,
            'probability': True,
            'tol': 0.0001,
            'kernel': 'sigmoid'
        },
    )

    artifact.add_file('models/svc.pkl')

    run.log_artifact(artifact)