In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Work from the parent of the notebook's starting directory (presumably the
# project root, so the `utils` / `data_models` imports below resolve — confirm).
# The starting directory is remembered on first execution so that re-running
# this cell does not climb one level higher every time.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))

DATA_DIR = os.getenv("DATA_DIR")
OUTPUT_DIR = os.getenv("OUTPUT_DIR")

# Fail fast with a readable message: an unset variable would otherwise only
# surface later as a TypeError when the paths are joined.
_missing_vars = [
    name
    for name, value in (("DATA_DIR", DATA_DIR), ("OUTPUT_DIR", OUTPUT_DIR))
    if value is None
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

In [None]:
from utils.load_data import load_data

# Labels and fold assignments are read from DATA_DIR; the slide embeddings are
# read from OUTPUT_DIR.
label_path, fold_path = (
    os.path.join(DATA_DIR, relative)
    for relative in ("labels/labels.csv", "folds.json")
)
embedding_path = os.path.join(
    OUTPUT_DIR, "prism/slide_embeddings/prism_slide_embeds.pkl"
)

# Single frame used by every later cell, which read its "specimen_id", "fold",
# "embedding" and "label" columns plus the per-class label columns.
df = load_data(
    label_path=label_path,
    embedding_path=embedding_path,
    fold_path=fold_path,
)

In [None]:
# specimen id -> list of the index labels (WSIs) belonging to that specimen
specs = {
    specimen_id: list(slide_ids)
    for specimen_id, slide_ids in df.groupby("specimen_id").groups.items()
}

# one list of index labels (slides) per fold, in the order groupby yields the folds
slide_folds = [
    list(fold_slide_ids)
    for fold_slide_ids in df.groupby("fold").groups.values()
]

In [None]:
from data_models.Label import Label

# One row per specimen: keep the first slide row seen for each specimen_id.
# NOTE(review): assumes all slides of a specimen carry the same labels — confirm.
specimen_df = df.reset_index()[
    ["specimen_id", "bowens", "scc", "bcc", "na"]
].drop_duplicates(subset=["specimen_id"])

# Fraction of specimens that are positive for each label.
# The positive class is selected explicitly instead of by position in
# `value_counts()`: that result is sorted by frequency, so `.iloc[1]` returns
# the *negative* rate whenever positives are the majority and raises
# IndexError when a column holds a single value.
# Missing values are dropped first, matching what `value_counts` did.
# Assumes positives are encoded as 1 / True in the label columns — confirm.
spec_freqs = {
    label: specimen_df[label].dropna().eq(1).mean()
    for label in Label._member_names_
}

In [None]:
# Model inputs (the "embedding" column) and targets (the "label" column);
# `crossval` reads both as notebook globals.
X, y = df["embedding"], df["label"]

In [None]:
import pandas as pd

from utils.split import train_val_split
from utils.eval import Evaluator, get_spec_level_probs


def crossval(clf, folds: list, y_onehot: pd.DataFrame, exp_name: str):
    """Cross-validate ``clf`` over ``folds`` and save the evaluation figures.

    For each fold the classifier is refit on the training split, slide-level
    probabilities are predicted for the validation split, reduced to
    specimen level, and handed to an ``Evaluator`` together with the matching
    one-hot specimen labels.

    Reads the notebook-level globals ``X``, ``y`` and ``spec_freqs``, so those
    cells must have been executed first.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``; the same object
            is refit on every fold.
        folds: One entry per fold, each holding the index labels of the rows
            used for validation in that fold.
        y_onehot: Frame indexed like ``X`` with a ``"specimen_id"`` column and
            one column per name in ``Label._member_names_``.
        exp_name: Experiment name forwarded to ``Evaluator.save_figs``.

    Returns:
        None.
    """
    evaluator = Evaluator(Label)
    n_folds = len(folds)
    label_columns = Label._member_names_

    for fold_idx, val_slide_ids in enumerate(folds):
        # Refit on everything outside the validation fold.
        X_train, y_train, X_val, _ = train_val_split(X, y, val_slide_ids, False)
        clf.fit(X_train, y_train)
        slide_probs = clf.predict_proba(X_val)

        # Collapse slide-level probabilities to one row per specimen.
        _, spec_probs = get_spec_level_probs(val_slide_ids, slide_probs)

        # One one-hot label row per validation specimen (first slide row kept).
        # NOTE(review): relies on this row order matching the order returned
        # by get_spec_level_probs — confirm.
        val_rows = y_onehot.loc[val_slide_ids]
        val_specimens = val_rows.drop_duplicates(subset=["specimen_id"])
        y_onehot_val = val_specimens[label_columns].to_numpy()

        evaluator.fold(spec_probs, y_onehot_val, fold_idx, n_folds)

    evaluator.finalize(spec_freqs)
    evaluator.save_figs(exp_name)

In [None]:
# Per-class label columns plus the specimen id; `crossval` uses the id to
# de-duplicate validation rows down to one per specimen.
y_onehot = df[[*Label._member_names_, "specimen_id"]]

In [None]:
from xgboost import XGBClassifier

# `crossval` consumes `predict_proba`, so train with the probability objective.
# `multi:softmax` is the hard-label objective and is not reliably supported by
# `predict_proba` across XGBoost versions; `multi:softprob` uses the same
# training loss and returns one probability per class.
clf = XGBClassifier(objective="multi:softprob", num_class=4)
crossval(clf, slide_folds, y_onehot, "prism/perceiver/prism-xgb")

In [None]:
from sklearn.linear_model import LogisticRegression

# The "saga" solver shuffles the data, so fix the seed to make the reported
# metrics and saved figures reproducible across runs.
clf = LogisticRegression(max_iter=1000, solver="saga", random_state=42)
crossval(clf, slide_folds, y_onehot, "prism/perceiver/prism-lr")

In [None]:
from sklearn.linear_model import SGDClassifier

# Logistic regression trained with SGD. SGD shuffles the training data every
# epoch, so fix the seed to make the reported metrics and saved figures
# reproducible across runs.
clf = SGDClassifier(loss="log_loss", random_state=42)

crossval(clf, slide_folds, y_onehot, "prism/perceiver/prism-sgd_lr")