In [None]:
# Root directories used by the rest of the notebook: DATA_DIR holds the fold
# definitions and labels, OUTPUT_DIR holds the slide-embedding pickle.
# The defaults are the original absolute paths; set SKIN_CANCER_DATA_DIR /
# SKIN_CANCER_OUTPUT_DIR to run the notebook against a different layout.
import os

DATA_DIR = (
    os.environ.get("SKIN_CANCER_DATA_DIR") or "/opt/gpudata/skin-cancer/data"
)
OUTPUT_DIR = (
    os.environ.get("SKIN_CANCER_OUTPUT_DIR") or "/opt/gpudata/skin-cancer/outputs"
)

In [None]:
# load cross validation folds
import json
import os

# Read the fold definitions from DATA_DIR/folds.json. Later cells iterate
# over `folds` and treat the items of each fold as specimen ids.
folds_path = os.path.join(DATA_DIR, "folds.json")
with open(folds_path, "r") as folds_file:
    folds = json.load(folds_file)

In [None]:
# load slide embeddings
import pickle

# Unpickle the slide embeddings; later cells use `embeds` as a mapping from
# slide id to that slide's embedding.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# produced by a trusted pipeline.
embeds_path = os.path.join(OUTPUT_DIR, "gigapath_slide_embeds_pool.pkl")
with open(embeds_path, "rb") as embeds_file:
    embeds = pickle.load(embeds_file)

In [None]:
# Group slide ids by specimen: the first six characters of a slide id are
# taken as its specimen id, and each specimen maps to the list of its WSIs.
specs = {}
for slide in embeds.keys():
    specs.setdefault(slide[:6], []).append(slide)

In [None]:
# TODO: simpler alternative -- add a "fold" column to the slide dataframe and
# group by it instead of maintaining this parallel list of lists.

# Expand every fold from specimen ids to the slide ids of those specimens.
# One list is produced per entry of `folds`, so the number of folds follows
# folds.json instead of being fixed at five. A specimen listed in a fold but
# absent from `specs` raises KeyError, as before.
slide_folds = [
    [slide for spec in fold for slide in specs[spec]] for fold in folds
]

In [None]:
# load data labels
import pandas as pd

# Load the label table. `specimenid` is parsed as a string at read time:
# letting read_csv infer a numeric dtype first and casting afterwards would
# strip leading zeros (or yield "123.0" when the column has missing values),
# and such ids would no longer match the slide-id prefixes they are joined on.
labels = pd.read_csv(
    os.path.join(DATA_DIR, "labels/labels.csv"),
    dtype={"specimenid": "string"},
)

In [None]:
# create dataframe with slide data
import numpy as np

# One row per slide: the slide id, its specimen id (first six characters of
# the slide id) and the slide's embedding.
slide_ids = list(embeds.keys())
specimen_ids = [slide_id[:6] for slide_id in slide_ids]
embeddings = [embeds[slide_id] for slide_id in slide_ids]

df = pd.DataFrame(
    {
        "slide_id": slide_ids,
        "specimen_id": specimen_ids,
        "embedding": embeddings,
    }
).astype({"slide_id": "string", "specimen_id": "string"})

# Drop the leading axis of every embedding and store it as a NumPy array.
# NOTE(review): assumes each embedding has a leading dimension of size 1 --
# confirm against the code that wrote the pickle.
df["embedding"] = df["embedding"].apply(
    lambda embedding: np.array(embedding.squeeze(0))
)

In [None]:
# Attach the specimen-level label columns to every slide row, joining the
# slide table's `specimen_id` to the label table's `specimenid`.
df = df.merge(
    labels[["specimenid", "bowens1", "scc1", "bcc1"]],
    how="right",
    left_on="specimen_id",
    right_on="specimenid",
)
# The right join also emits a row for every labelled specimen that has no
# slide; those rows carry a missing slide_id and embedding and would end up
# as missing index entries below, so they are dropped before indexing.
df = df.dropna(subset=["slide_id"]).set_index("slide_id")

In [None]:
from enum import IntEnum


class Label(IntEnum):
    """Integer class codes that `get_labels` returns for a row of diagnosis flags."""

    # Returned when none of bowens1 / bcc1 / scc1 equals 1.
    na = 0
    # Returned when bowens1 == 1 (checked first).
    bowens = 1
    # Returned when bcc1 == 1 and bowens1 is not.
    bcc = 2
    # Returned when scc1 == 1 and neither of the other two flags is.
    scc = 3


def get_labels(x):
    """Collapse the three diagnosis flags of one row into a single ``Label`` code.

    Args:
        x: Row-like object indexable by "bowens1", "bcc1" and "scc1" (for
            example the row Series passed by ``DataFrame.apply(axis=1)``, or
            a plain dict). Values may be NumPy/pandas scalars or plain Python
            numbers.

    Returns:
        int: The ``.value`` of the first matching label, checked in the
        order bowens, bcc, scc; ``Label.na.value`` when no flag equals 1.
    """

    def _is_set(column):
        value = x[column]
        # NumPy/pandas scalars expose .item(); plain Python numbers do not,
        # so only unwrap when the method is available.
        unwrap = getattr(value, "item", None)
        if callable(unwrap):
            value = unwrap()
        return value == 1

    # Priority order matters: a row with several flags set gets the first hit.
    priority = (
        ("bowens1", Label.bowens),
        ("bcc1", Label.bcc),
        ("scc1", Label.scc),
    )
    for column, label in priority:
        if _is_set(column):
            return label.value
    return Label.na.value

In [None]:
# Model inputs and targets, both indexed like `df`:
# X holds the embedding column, y the class code get_labels derives from
# each row's three diagnosis flags.
y = df[["bowens1", "scc1", "bcc1"]].apply(get_labels, axis=1)
X = df["embedding"]

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Model under evaluation. Alternative kept for reference:
# clf = LogisticRegression(max_iter=500)
clf = XGBClassifier(objective="multi:softmax", num_class=4)

# Cross-validation: every fold of slides is held out once, last fold first,
# while the model is refit on all remaining slides.
val_set = np.arange(len(slide_folds))[::-1]
for i, fold_number in enumerate(val_set):
    val_fold = slide_folds[fold_number]

    # Held-out rows are selected by slide id; the training rows are every
    # other index label (Index.difference returns them sorted).
    y_val = y.loc[val_fold]
    y_train = y.loc[y.index.difference(y_val.index)]

    held_out = X.loc[val_fold]
    X_train = X.loc[X.index.difference(held_out.index)].to_list()
    X_val = held_out.to_list()

    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    # Not used further in this cell; left available for inspection.
    probs = clf.predict_proba(X_val)

    # Per-fold report: overall accuracy, then per-class metrics.
    score = accuracy_score(y_val, preds)
    print(score)
    print(classification_report(y_val, preds))