In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if python-dotenv finds one) into os.environ.
load_dotenv()

# Step up to the parent directory -- presumably the project root, so that the
# later `utils.*` imports resolve (confirm against the repository layout).
# The starting directory is remembered on the first run: a bare
# `os.chdir("..")` is relative to the *current* directory, so re-running this
# cell would otherwise climb one level higher every time.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR") or os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))

# Base directory of the dataset; raises KeyError when DATA_DIR is not set.
DATA_DIR = os.environ["DATA_DIR"]

In [5]:
from utils.load_data import load_data

# Locations of the label table and the fold assignment, both under DATA_DIR.
label_csv = os.path.join(DATA_DIR, "labels/labels.csv")
fold_json = os.path.join(DATA_DIR, "folds.json")

# NOTE(review): `load_data` is project code; the cells below use its result as
# a DataFrame with `patient_id`, `label`, `fold` and per-class columns -- confirm.
data = load_data(label_path=label_csv, fold_path=fold_json)

In [7]:
import pandas as pd

# Patient-level label = most frequent specimen label of that patient.
# `Series.mode` returns the modes in sorted order, so a tie resolves to the
# smallest label value.
#
# After this cell has run once, `data` no longer has a plain `label` column
# (it becomes `label_spec`), so the source column is looked up under either
# name. This keeps the cell re-runnable without reloading `data`.
spec_label_col = "label" if "label" in data.columns else "label_spec"

labels_by_patient = (
    data.groupby("patient_id")[spec_label_col]
    .agg(lambda labels: labels.mode().iloc[0])
    .rename("label")
    .to_frame()
)

# Attach the patient-level label to every specimen row. The specimen label is
# kept as `label_spec`, the patient label is added as `label_pat`; a stale
# `label_pat` from an earlier run is dropped first so it is not duplicated.
data = (
    data.drop(columns=["label_pat"], errors="ignore")
    .rename(columns={"label": "label_spec"})
    .merge(
        labels_by_patient.rename(columns={"label": "label_pat"}),
        left_on="patient_id",
        right_index=True,
    )
)

In [13]:
def agg_func(x):
    """Return the arithmetic mean of ``x``, computed as ``sum(x) / len(x)``.

    When applied to 0/1 indicator values the result is the fraction of ones.

    Parameters
    ----------
    x : sized iterable of numbers
        For example a pandas Series or a list.

    Returns
    -------
    float
        The mean of the values, or ``nan`` when ``x`` is empty (rather than
        raising ``ZeroDivisionError``).
    """
    count = len(x)
    if count == 0:
        # An empty group has no defined mean; report it as missing.
        return float("nan")
    return sum(x) / count

In [14]:
# Per-fold summary at the specimen level: row count plus the mean of each
# class column. `cancerous` is the complement of the `na` share.
spec_aggs = {"n": ("fold", "count")}
spec_aggs.update({col: (col, agg_func) for col in ("bowens", "bcc", "scc")})
spec_aggs["cancerous"] = ("na", lambda na_flags: 1 - agg_func(na_flags))
spec_aggs["non_cancerous"] = ("na", agg_func)

by_spec = data.groupby("fold").agg(**spec_aggs)

In [18]:
# Same statistics as the per-fold table, computed over all rows of `data`
# and shaped as a single row labelled "Agg".
spec_class_cols = ["bowens", "scc", "bcc", "na"]

totals = (
    data[spec_class_cols]
    .agg(agg_func)
    .to_frame()
    .T.rename(columns={"na": "non_cancerous"})
    .assign(
        n=len(data),
        cancerous=lambda row: 1 - row["non_cancerous"],
    )
)
totals.index = ["Agg"]

In [19]:
# Specimen-level data characteristics: one row per fold, then the "Agg" row.
pd.concat([by_spec, totals])

Unnamed: 0,n,bowens,bcc,scc,cancerous,non_cancerous
0,104,0.288462,0.201923,0.067308,0.557692,0.442308
1,105,0.32381,0.2,0.07619,0.6,0.4
2,107,0.28972,0.233645,0.056075,0.579439,0.420561
3,126,0.31746,0.222222,0.071429,0.611111,0.388889
4,111,0.324324,0.189189,0.063063,0.576577,0.423423
Agg,553,0.309222,0.209765,0.066908,0.585895,0.414105


In [21]:
from utils.Label import Label

# One row per (fold, patient) with 0/1 flags derived from the patient-level
# label. `label_pat` was merged in per patient, so every row of a group holds
# the same value and the first one is representative.
# The aggregations stay lambdas on purpose: they all target the same column.
patient_flag_aggs = {
    "bowens": (
        "label_pat",
        lambda labels: int(labels.iloc[0] == Label.bowens.value),
    ),
    "bcc": (
        "label_pat",
        lambda labels: int(labels.iloc[0] == Label.bcc.value),
    ),
    "scc": (
        "label_pat",
        lambda labels: int(labels.iloc[0] == Label.scc.value),
    ),
    "cancerous": (
        "label_pat",
        lambda labels: int(not (labels.iloc[0] == Label.na.value)),
    ),
    "non_cancerous": (
        "label_pat",
        lambda labels: int(labels.iloc[0] == Label.na.value),
    ),
}

by_fold_and_pat = data.groupby(["fold", "patient_id"]).agg(**patient_flag_aggs)

In [24]:
# Per-fold summary at the patient level: number of (fold, patient) rows and
# the mean of each 0/1 flag.
patient_rows = by_fold_and_pat.reset_index()

pat_aggs = {"n": ("fold", "count")}
pat_aggs.update(
    {
        flag: (flag, agg_func)
        for flag in ("bowens", "bcc", "scc", "cancerous", "non_cancerous")
    }
)

by_pat = patient_rows.groupby("fold").agg(**pat_aggs)

In [25]:
# Flag means over all (fold, patient) rows, shaped as a single "Agg" row.
pat_flag_cols = ["bowens", "scc", "bcc", "cancerous", "non_cancerous"]

totals = (
    by_fold_and_pat[pat_flag_cols]
    .agg(agg_func)
    .to_frame()
    .T.assign(n=len(by_fold_and_pat))
)
totals.index = ["Agg"]

In [26]:
# Patient-level data characteristics: one row per fold, then the "Agg" row.
pd.concat([by_pat, totals])

Unnamed: 0,n,bowens,bcc,scc,cancerous,non_cancerous
0,91,0.274725,0.186813,0.043956,0.505495,0.494505
1,90,0.3,0.188889,0.044444,0.533333,0.466667
2,92,0.304348,0.206522,0.043478,0.554348,0.445652
3,91,0.351648,0.164835,0.043956,0.56044,0.43956
4,91,0.307692,0.186813,0.043956,0.538462,0.461538
Agg,455,0.307692,0.186813,0.043956,0.538462,0.461538
