In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from matplotlib import style

from pathlib import Path
from utils import read_data

# Plot styling used by every figure in this notebook.
style.use("seaborn-v0_8")

# Forwarded to read_data as `recompute`.
RECOMPUTE = True

project_dir = Path(".")
data_dir = project_dir / "data"

# Both cohorts are read with identical settings; only the cohort name differs.
_read_kwargs = {"data_dir": data_dir, "version": "final", "recompute": RECOMPUTE}
data_slo = read_data(cohort="slo", **_read_kwargs)
data_por = read_data(cohort="por", **_read_kwargs)

# Stack the two cohorts row-wise; the original index labels are kept as-is.
data_raw = pd.concat([data_slo, data_por], axis=0)

# Reproducible peek at three random rows (cell output).
data_raw.sample(n=3, random_state=3)

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from utils import X_COLUMN_ORDER
from utils import COLUMN_ORDER
from utils import Y_COLUMN
from utils import BINARY_CATEGORICAL_COLUMNS
from utils import MULTI_CATEGORICAL_COLUMNS
from utils import CLASS_NAMES
from utils import impute_and_scale_data

In [None]:
def _is_slo_row(row):
    """Return whether ``row`` belongs to the "slo" cohort."""
    return row["cohort"] == "slo"


# NOTE(review): the predicate presumably selects the rows used to fit the
# imputation/scaling statistics -- confirm in utils.impute_and_scale_data.
data, scaling_info = impute_and_scale_data(data_raw, mask_predicate=_is_slo_row)

In [None]:
from utils import train_model_and_cv
from utils import X_COLUMNS
from sklearn.linear_model import LogisticRegression

# Estimator whose hyper-parameters are tuned by train_model_and_cv.
BASE_MODEL = LogisticRegression(random_state=0, penalty="l2", max_iter=100)
# Hyper-parameter grid handed to train_model_and_cv.
PARAM_GRID = {
    "class_weight": ["balanced", None],
    "C": [1 / 128, 1 / 64, 1 / 32, 1 / 16, 1 / 8, 1 / 4, 1.0, 4.0, 16.0, 64.0, 128.0],
    "fit_intercept": [True, False],
}
RANDOM_STATE = 3
SIZE_TEST_SPLIT = 0.4  # fraction of the "slo" cohort held out as test data

# Split by row POSITION rather than by index label. The frame originates from a
# concatenation of two cohorts, so index labels are not guaranteed to be unique;
# a label-based `.loc` assignment could then also flag rows of the other cohort
# as "train_val". train_test_split shuffles positions, so for unique labels the
# selected rows are exactly the same as with the label-based split.
is_slo = (data["cohort"] == "slo").to_numpy()
positions_train_val, _ = train_test_split(
    is_slo.nonzero()[0],
    test_size=SIZE_TEST_SPLIT,
    random_state=RANDOM_STATE,
    stratify=data.loc[is_slo, Y_COLUMN],
)
# Index labels of the train/validation rows (same name as before).
indices_train_val = data.index[positions_train_val]

# Every row is "test" unless it was drawn into the train/validation part.
data["split"] = "test"
data.iloc[positions_train_val, data.columns.get_loc("split")] = "train_val"

# Invariant: only "slo" rows may ever be used for training/validation.
assert (data.loc[data["split"] == "train_val", "cohort"] == "slo").all()

train_val = data[data["split"] == "train_val"]
model, df_cv = train_model_and_cv(
    model=BASE_MODEL,
    param_grid=PARAM_GRID,
    X=train_val[X_COLUMNS],
    y=train_val[Y_COLUMN],
    cv=3,
    scoring="roc_auc",
)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
from utils import filter_by_metadata

# Print a classification report and the ROC AUC for each
# (cohort, version, split) subset listed below.
EVALUATION_SUBSETS = (
    ("slo", "final", "train_val"),
    ("slo", "final", "test"),
    ("por", "final", "test"),
)

for cohort, version, split in EVALUATION_SUBSETS:
    print(f"Split: {split}, '{cohort}', {version}")
    data_subset = filter_by_metadata(data, cohort=cohort, version=version, split=split)
    features = data_subset[X_COLUMNS]
    y_true = data_subset[Y_COLUMN]

    # Hard class predictions feed the per-class report ...
    y_pred = model.predict(features)
    report = classification_report(y_true, y_pred, target_names=CLASS_NAMES)
    print(report)

    # ... while the AUC is computed from the second predict_proba column.
    positive_scores = model.predict_proba(features)[:, 1]
    print("AUC:", roc_auc_score(y_true=y_true, y_score=positive_scores))
    print("\n")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from utils import compute_metrics

# Sensitivity = TP Rate = Recall = TP / P
# Specificity = TN Rate = TN / N

# The evaluated subsets do not depend on the threshold, so they are filtered
# once up front instead of on every loop iteration.
test_slo = filter_by_metadata(data, cohort="slo", version="final", split="test")
test_por = filter_by_metadata(data, cohort="por", version="final", split="test")

# One record per decision threshold. compute_metrics is unpacked as
# (recall of positives, recall of negatives, precision of positives).
metrics = []
for t in np.linspace(0.01, 1.00, 100):
    recall_pos_slo, recall_neg_slo, precision_pos_slo = compute_metrics(
        test_slo,
        model=model,
        threshold=t,
    )

    recall_pos_por, recall_neg_por, precision_pos_por = compute_metrics(
        test_por,
        model=model,
        threshold=t,
    )

    metrics.append({
        "threshold": t,
        # Slovenia
        "specificity (slo/test)": recall_neg_slo,
        "sensitivity/recall (slo/test)": recall_pos_slo,
        "precision (slo/test)": precision_pos_slo,
        # Portugal
        "specificity (por/test)": recall_neg_por,
        "sensitivity/recall (por/test)": recall_pos_por,
        "precision (por/test)": precision_pos_por,
    })

metrics_df = pd.DataFrame(metrics)

import matplotlib.pyplot as plt
from matplotlib import style

# Specificity against sensitivity over the threshold sweep, one curve per
# test cohort.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Specificity-Sensitivity Curve")
ax.set_xlabel("Sensitivity")  # label previously misspelled as "Sensivity"
ax.set_ylabel("Specificity")
ax.plot(
    metrics_df["sensitivity/recall (slo/test)"],
    metrics_df["specificity (slo/test)"],
    color="tab:green",
    label="SLO (Test split)",
)
ax.plot(
    metrics_df["sensitivity/recall (por/test)"],
    metrics_df["specificity (por/test)"],
    color="tab:red",
    label="POR (Test split)",
)
ax.legend(loc="lower left")

# Persist the full sweep next to the notebook.
metrics_df.to_excel(project_dir / "specificity_sensitivity.xlsx")

# Cell output: thresholds at which specificity on the SLO test split exceeds 0.949.
metrics_df[metrics_df["specificity (slo/test)"] > 0.949]

In [None]:
# A single row of coefficients is expected, with one weight per input feature.
expected_coef_shape = (1, len(model.feature_names_in_))
assert model.coef_.shape == expected_coef_shape

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    Computed as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp``, so large
    negative inputs no longer overflow in ``exp`` (which used to emit a
    RuntimeWarning). Works element-wise on scalars and arrays.
    """
    return np.exp(-np.logaddexp(0, -x))

# Feature weights of the fitted model, largest weight first (cell output).
feature_weights = pd.DataFrame(
    {
        "feature_name": model.feature_names_in_,
        "weight": model.coef_[0],
    }
)
feature_weights.sort_values(by="weight", ascending=False)

In [None]:
# The columns below are attached row by row, so the raw and the processed
# frame must carry identical index labels in identical order.
index_matches = data_raw.index == data.index
assert index_matches.all()

# Second predict_proba column for every row of the processed frame.
predicted_probability = model.predict_proba(data[X_COLUMNS])[:, 1]

# Annotate the raw frame in place and export it next to the notebook.
data_raw["split"] = data["split"]
data_raw["predicted_probability"] = pd.Series(predicted_probability, index=data.index)
data_raw.to_excel(project_dir / "model_split_probability.xlsx")
data_raw.head(n=3)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt


# Precision-recall curves of the fitted model, one per (cohort, version, split)
# subset, all drawn on a shared axis.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Precision-Recall Curve")
for cohort, version, split, kwargs in [
    ("slo", "final", "train_val", {"color": "tab:green", "alpha": 0.5}),
    ("slo", "final", "test", {"color": "tab:green", "alpha": 1.0}),
    ("por", "final", "test", {"color": "tab:red", "alpha": 0.5}),
]:
    print(f"Split: {split}, {cohort}/{version}")
    data_subset = filter_by_metadata(data, cohort=cohort, version=version, split=split)

    y_true = data_subset[Y_COLUMN]
    # Probabilities from the second predict_proba column, not hard class labels.
    y_score = model.predict_proba(data_subset[X_COLUMNS])[:, 1]

    # Do not bind the result to `display`: that would shadow IPython's
    # display() helper for the rest of the notebook session.
    # The scores are passed positionally so the call does not depend on the
    # keyword name of that argument.
    pr_display = PrecisionRecallDisplay.from_predictions(
        y_true,
        y_score,
        name=f"{split}, {cohort}/{version}",
        plot_chance_level=False,
        drop_intermediate=True,
        ax=ax,
        **kwargs,
    )