In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

# Move to the directory two levels above the notebook's starting directory.
# The target is resolved to an absolute path once and remembered, so re-running
# this cell does not climb two more levels on every execution.
if "_ROOT_DIR" not in globals():
    _ROOT_DIR = os.path.abspath("../..")
os.chdir(_ROOT_DIR)

DATA_DIR = os.getenv("DATA_DIR")
OUTPUT_DIR = os.getenv("OUTPUT_DIR")

# Fail here with a clear message rather than with a TypeError from
# os.path.join(None, ...) in a later cell.
_missing_env = [
    name
    for name, value in (("DATA_DIR", DATA_DIR), ("OUTPUT_DIR", OUTPUT_DIR))
    if value is None
]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from xgboost import XGBClassifier

from data_processing.label import Label
from evaluation.eval import Evaluator
from data_processing.load_data import load_data
from data_processing.split import train_val_split

In [None]:
# Locations of the three inputs handed to load_data: the fold assignment and
# the label table under DATA_DIR, and the pickled slide embeddings under
# OUTPUT_DIR.
fold_path = os.path.join(DATA_DIR, "folds.json")
label_path = os.path.join(DATA_DIR, "labels/labels.csv")
embedding_path = os.path.join(
    OUTPUT_DIR, "resnet18/slide_embeddings/resnet_slide_embeds_GAP.pkl"
)

df = load_data(
    fold_path=fold_path,
    label_path=label_path,
    embedding_path=embedding_path,
)

In [None]:
# specimen id -> list of the index labels (WSIs) that belong to it
specs = {
    specimen: list(slide_ids)
    for specimen, slide_ids in df.groupby("specimen_id").groups.items()
}

# one list of slide index labels per fold, in the order groupby yields the folds
slide_folds = [
    list(slide_ids) for slide_ids in df.groupby("fold").groups.values()
]

In [None]:
# One row per specimen_id, keeping only the per-class indicator columns.
specimen_df = df.reset_index()[
    ["specimen_id", "bowens", "scc", "bcc", "na"]
].drop_duplicates(subset=["specimen_id"])

# Fraction of specimens that are positive for each label.
# value_counts() orders its result by frequency, so picking position 1 returns
# the share of whichever value is rarer (and fails when a column holds a single
# value). Counting the positive value explicitly is correct regardless of class
# balance.
# NOTE(review): assumes the label columns are 0/1 (or boolean) indicators - confirm.
spec_freqs = {
    label: specimen_df[label].dropna().eq(1).mean()
    for label in Label._member_names_
}

In [None]:
# Features (slide embeddings) and targets (slide labels) read by crossval().
X, y = df["embedding"], df["label"]

In [None]:
# Column names for the per-class metrics, e.g. "benign_auroc" / "benign_auprc".
auroc_keys, auprc_keys = (
    [f"{class_name}_{metric}" for class_name in ("benign", "bowens", "bcc", "scc")]
    for metric in ("auroc", "auprc")
)

In [None]:
def crossval(
    clf,
    folds: list,
    y_onehot: pd.DataFrame,
    exp_name: str,
    foundation_model: str,
    aggregator_model: str,
    classifier_model: str,
) -> pd.DataFrame:
    """Cross-validate ``clf`` over ``folds`` and collect per-fold metrics.

    For every fold the classifier is refit on the training split returned by
    ``train_val_split`` and its ``predict_proba`` output on the held-out slides
    is passed through ``Evaluator.get_spec_level_probs``. Per-class AUROC and
    average precision are then computed against the one-hot labels of the
    fold's specimens.

    Besides its arguments, this function reads the notebook-level names ``X``,
    ``y``, ``spec_freqs``, ``auroc_keys`` and ``auprc_keys``; the cells that
    define them must have been executed first.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``. It is refit on
            every fold.
        folds: One entry per fold; each entry holds the index labels of the
            slides used for validation in that fold.
        y_onehot: Frame that can be indexed with the entries of ``folds`` and
            contains one column per ``Label`` member plus ``specimen_id``.
        exp_name: Forwarded to ``Evaluator.save_figs``.
        foundation_model: Value stored in the ``foundation_model`` column.
        aggregator_model: Value stored in the ``aggregator`` column.
        classifier_model: Value stored in the ``classifier`` column.

    Returns:
        A DataFrame with one row per fold and the columns
        ``foundation_model``, ``aggregator``, ``classifier``, ``fold``
        followed by ``auroc_keys`` and ``auprc_keys``.
    """
    evaluator = Evaluator(Label)
    columns = (
        ["foundation_model", "aggregator", "classifier", "fold"]
        + auroc_keys
        + auprc_keys
    )
    # Rows are collected as plain dicts and turned into a frame once at the
    # end, instead of concatenating onto an empty DataFrame inside the loop.
    records = []

    for fold_idx, val_fold_indices in enumerate(folds):
        # fit the classifier on the train data and extract probs
        X_train, y_train, X_val, _ = train_val_split(
            X, y, val_fold_indices, False
        )
        clf.fit(X_train, y_train)
        slide_probs = clf.predict_proba(X_val)

        # probs are on slide-level - need specimen level
        _, probs = Evaluator.get_spec_level_probs(
            val_fold_indices, slide_probs
        )

        # get onehot labels for val set, one row per specimen
        # NOTE(review): relies on this row order matching the order of the
        # specimen-level probs returned above - confirm.
        y_onehot_val = (
            y_onehot.loc[val_fold_indices]
            .drop_duplicates(subset=["specimen_id"])[Label._member_names_]
            .to_numpy()
        )

        evaluator.fold(probs, y_onehot_val, fold_idx, len(folds))

        auroc = roc_auc_score(
            y_onehot_val, probs, average=None, multi_class="ovr"
        )
        auprc = average_precision_score(y_onehot_val, probs, average=None)

        record = {
            "foundation_model": foundation_model,
            "aggregator": aggregator_model,
            "classifier": classifier_model,
            "fold": fold_idx,
        }
        # Scores are matched to the key lists by position.
        # NOTE(review): assumes Label._member_names_ is ordered like the class
        # names used to build auroc_keys / auprc_keys - confirm.
        record.update(
            {auroc_keys[pos]: score for pos, score in enumerate(auroc)}
        )
        record.update(
            {auprc_keys[pos]: score for pos, score in enumerate(auprc)}
        )
        records.append(record)

    evaluator.finalize(spec_freqs)
    evaluator.save_figs(exp_name)
    return pd.DataFrame(records, columns=columns)

In [None]:
# Per-class indicator columns plus the specimen id of every row.
y_onehot = df[[*Label._member_names_, "specimen_id"]]

In [None]:
# XGBoost on the pooled slide embeddings.
# crossval() scores the model through predict_proba, so the objective is
# "multi:softprob" (per-class probabilities) rather than "multi:softmax"
# (hard class labels).
clf = XGBClassifier(objective="multi:softprob", num_class=4)
xgb = crossval(
    clf,
    slide_folds,
    y_onehot,
    "resnet18/global_pooling/resnet18-xgb",
    "resnet18",
    "GAP",
    "xgb",
)

In [None]:
# Logistic regression on the pooled slide embeddings.
# The saga solver is stochastic; a fixed random_state makes the per-fold
# metrics reproducible across runs.
clf = LogisticRegression(max_iter=1000, solver="saga", random_state=0)
lr = crossval(
    clf,
    slide_folds,
    y_onehot,
    "resnet18/global_pooling/resnet18-lr",
    "resnet18",
    "GAP",
    "lr",
)

In [None]:
# Stack the per-fold rows of both classifiers into one table and display it.
final_results = pd.concat(objs=[xgb, lr], ignore_index=True)
final_results

In [None]:
# Append the rows to the shared results file: pipe-separated, no header row.
# NOTE(review): because the file is opened in append mode, re-running this cell
# writes the same rows again - confirm this is intended.
final_results.to_csv(
    path_or_buf="outputs/experiments_by_fold.csv",
    header=False,
    mode="a",
    sep="|",
)