In [None]:
import os
from dotenv import load_dotenv

# Read settings from a .env file into the process environment.
load_dotenv()
# Step the working directory up one level.
# NOTE(review): presumably so the project-local `utils` imports in later
# cells resolve from the repository root — confirm. The path is relative, so
# running this cell a second time moves up one more directory.
os.chdir("..")

# Root directory of the dataset; raises KeyError when DATA_DIR is not set.
DATA_DIR = os.environ["DATA_DIR"]

In [None]:
from utils.load_data import load_data

# Resolve the label table and the fold assignment file under DATA_DIR, then
# hand both locations to the project loader.
label_csv = os.path.join(DATA_DIR, "labels/labels.csv")
fold_json = os.path.join(DATA_DIR, "folds.json")
data = load_data(label_path=label_csv, fold_path=fold_json)

In [None]:
import pandas as pd

# Give every patient one label: the most frequent "label" among that
# patient's rows. mode() may return more than one value on a tie; [0] keeps
# the first entry of whatever it returns.
patient_mode = data.groupby("patient_id")["label"].agg(
    lambda labels: labels.mode()[0]
)
labels_by_patient = patient_mode.to_frame()

# Attach the patient-level label to each row. Both frames carry a "label"
# column, so the suffixes turn them into "label_spec" (the row's own label)
# and "label_pat" (the patient-level label).
# NOTE: `data` is rebound here, so this cell expects the freshly loaded frame;
# once it has run there is no plain "label" column left to group on.
data = pd.merge(
    data,
    labels_by_patient,
    left_on="patient_id",
    right_index=True,
    suffixes=("_spec", "_pat"),
)

In [None]:
def agg_func(x):
    """Return the arithmetic mean of ``x``, computed as ``sum(x) / len(x)``.

    Used as the aggregation callable for the summary tables in this notebook.
    When ``x`` holds 0/1 indicators the result is the fraction of ones.

    Args:
        x: A sized iterable of numbers (e.g. a list or a pandas Series).

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    # Deliberately the builtin sum/len rather than a pandas mean, so missing
    # values are not skipped silently.
    return sum(x) / len(x)

In [None]:
# Per-fold summary at the specimen level: the row count plus the mean of each
# class column. The "na" column feeds `non_cancerous`, and `cancerous` is its
# complement.
spec_aggregations = {
    "n": ("fold", "count"),
    "bowens": ("bowens", agg_func),
    "bcc": ("bcc", agg_func),
    "scc": ("scc", agg_func),
    "cancerous": ("na", lambda col: 1 - agg_func(col)),
    "non_cancerous": ("na", agg_func),
}
by_spec = data.groupby("fold").agg(**spec_aggregations)

In [None]:
# Overall (all folds together) specimen-level means, shaped as a one-row frame
# labelled "Agg" so it can be stacked under the per-fold table.
class_columns = ["bowens", "scc", "bcc", "na"]
overall_means = data[class_columns].agg(agg_func)
totals = (
    overall_means.to_frame()
    .T.rename(columns={"na": "non_cancerous"})
    .assign(n=len(data))
)
# Everything that is not "na" counts as cancerous.
totals["cancerous"] = 1 - totals["non_cancerous"]
totals.index = ["Agg"]

In [None]:
# Specimen-level data characteristics: one row per fold followed by the
# overall "Agg" row.
pd.concat([by_spec, totals])

In [None]:
from utils.Label import Label

def _first_label_flag(label, if_match=1, otherwise=0):
    """Build an aggregator that inspects the first value of a group.

    The returned callable yields ``if_match`` when the group's first entry
    equals ``label.value`` and ``otherwise`` when it does not.
    """
    # NOTE(review): kept as a lambda on purpose — several of these aggregate
    # the same column, and some pandas versions reject multiple same-named
    # functions on one column. Confirm before converting to a nested def.
    return lambda labels: if_match if labels.iloc[0] == label.value else otherwise


# One row per (fold, patient) with 0/1 indicator columns derived from the
# patient-level label. Only the first "label_pat" value in each group is read.
by_fold_and_pat = data.groupby(["fold", "patient_id"]).agg(
    bowens=("label_pat", _first_label_flag(Label.bowens)),
    bcc=("label_pat", _first_label_flag(Label.bcc)),
    scc=("label_pat", _first_label_flag(Label.scc)),
    # Inverted flag: 0 when the label is "na", 1 for anything else.
    cancerous=("label_pat", _first_label_flag(Label.na, if_match=0, otherwise=1)),
    non_cancerous=("label_pat", _first_label_flag(Label.na)),
)

In [None]:
# Per-fold summary at the patient level: the number of (fold, patient) rows
# and the mean of each indicator column.
patient_rows = by_fold_and_pat.reset_index()
patient_aggregations = {"n": ("fold", "count")}
patient_aggregations.update(
    {
        column: (column, agg_func)
        for column in ("bowens", "bcc", "scc", "cancerous", "non_cancerous")
    }
)
by_pat = patient_rows.groupby("fold").agg(**patient_aggregations)

In [None]:
# Overall patient-level means across all folds, shaped as a one-row frame
# labelled "Agg" so it can be stacked under the per-fold table.
indicator_columns = ["bowens", "scc", "bcc", "cancerous", "non_cancerous"]
totals = by_fold_and_pat[indicator_columns].agg(agg_func).to_frame().T
totals["n"] = len(by_fold_and_pat)
totals.index = ["Agg"]

In [None]:
# Patient-level data characteristics: one row per fold followed by the
# overall "Agg" row.
pd.concat([by_pat, totals])