In [32]:
import json
import pickle as pkl
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle as pkl
import os 
def make_dataset_tuple(my_df, phase_label=-1):
    """Convert a split-annotated DataFrame into (train, val, test) record lists.

    Each row of ``my_df`` becomes a dict with the keys ``"image"`` (taken from
    the ``"vol"`` column), ``"lbl"`` (taken from the ``"lbl"`` column) and
    ``"phase"`` (the constant ``phase_label``). The dict is appended to the
    list selected by the row's ``"split"`` value.

    Args:
        my_df: DataFrame with ``"vol"``, ``"lbl"`` and ``"split"`` columns.
        phase_label: Value stored under ``"phase"`` in every record.

    Returns:
        tuple: ``(train, val, test)`` lists of record dicts, in row order.
        Rows whose ``"split"`` is not ``"train"``, ``"val"`` or ``"test"``
        are left out.
    """
    records_by_split = {"train": [], "val": [], "test": []}
    for _, e in my_df.iterrows():
        row = {"image": e["vol"], "lbl": e["lbl"], "phase": phase_label}
        target = records_by_split.get(e["split"])
        # Append the record itself; the previous code appended the val/test
        # lists to themselves, so those splits never received any data.
        if target is not None:
            target.append(row)
    return (
        records_by_split["train"],
        records_by_split["val"],
        records_by_split["test"],
    )

In [33]:
# This file only holds the paths to the KiTS21 and STU datasets (read below via
# the 'KITS21DIR' and 'STUDIR' keys). The path is relative, so it is resolved
# against the notebook's working directory.
with open("../../secrets_folder/paths_info.json","r") as f: 
    path_dir = json.load(f)


In [34]:
# Metadata file located directly inside the KiTS21 directory.
kits_meta_path = os.path.join(path_dir['KITS21DIR'],'kits.json')

# Processing the KiTS21 dataset
- The KiTS data includes a JSON file listing the case IDs; we use those IDs to build the paths to our images.

In [35]:
# Parse the KiTS metadata JSON; its entries supply the "case_id" values used
# further down to build the image and segmentation paths.
with open(kits_meta_path, "r") as f:
    kits_info = json.load(f)

In [36]:
# Tabulate the parsed metadata so per-case columns (paths, split) can be added.
kits_df = pd.DataFrame(kits_info)

In [37]:
# Root directory the KiTS21 images were downloaded to; the per-case file paths
# are built relative to it.
data_dir = path_dir['KITS21DIR'] 

In [38]:
# KiTS is organized in a very simple structure, so both files of a case can be
# located from its case_id alone:
#   <data_dir>/<case_id>/imaging.nii.gz       -> image volume  ("vol")
#   <data_dir>/<case_id>/segmentation.nii.gz  -> segmentation  ("lbl")
kits_df["vol"] = kits_df["case_id"].map(
    lambda case_id: os.path.join(data_dir, case_id, "imaging.nii.gz")
)
kits_df["lbl"] = kits_df["case_id"].map(
    lambda case_id: os.path.join(data_dir, case_id, "segmentation.nii.gz")
)

In [39]:
# Data is split into train/val/test at the case level using this seed.
# Stage 1 holds out part of the cases; stage 2 divides that hold-out into the
# final val and test sets. No test_size is passed, so scikit-learn's default
# proportion applies to both stages. The IDs are sorted first so the splitter
# always receives them in the same order.
train, val = train_test_split(sorted(kits_df["case_id"].unique()), random_state=1996)
val, test = train_test_split(val, random_state=1996)

In [40]:
# Label every row with the split its case_id was assigned to. Later entries of
# the merged dict win (test over val over train), and any case_id found in
# none of the three lists falls back to "ukw".
kits_df["split"] = (
    kits_df["case_id"]
    .map(
        {
            **dict.fromkeys(train, "train"),
            **dict.fromkeys(val, "val"),
            **dict.fromkeys(test, "test"),
        }
    )
    .fillna("ukw")
)

In [41]:
# Convert the frame into the (train, val, test) tuple form expected by the
# MONAI dataloader.
# The phase label is 1 because the entire KiTS dataset is contrast phase.
kits_tup = make_dataset_tuple(kits_df, phase_label=1)

In [43]:
# Save it to your datasets folder.
# The folder is created first so this cell also runs on a fresh checkout;
# without it open() raises FileNotFoundError when ./datasets is missing.
os.makedirs("./datasets", exist_ok=True)
with open("./datasets/kits21.pkl", "wb") as f:
    pkl.dump(kits_tup, f)

# Processing the STU Dataset
- In this case we have a folder containing images and masks with a convenient naming structure.

In [44]:
# Root folder of the STU dataset; it is searched recursively for image files.
stu_dataset_dir =  path_dir['STUDIR']

In [46]:
# Collect, as plain strings, every file under the STU folder (at any depth)
# whose name ends in "image.nii.gz".
images = list(map(str, Path(stu_dataset_dir).rglob("*image.nii.gz")))

In [47]:
# Derive each mask path from its image path by swapping "image" for "mask".
# NOTE(review): str.replace rewrites every occurrence in the whole path,
# directory names included — confirm this matches the on-disk layout.
masks = [image_path.replace("image", "mask") for image_path in images]

In [48]:
# One row per image: "vol" is the image path and "lbl" the derived mask path.
stu_df = pd.DataFrame({"vol": images, "lbl": masks})

In [49]:
def get_stu_pid(s):
    """Return the ID encoded at the start of an STU file name.

    The ID is the part of the file name (final path component) that precedes
    its first underscore.

    Args:
        s: Path to an STU file, as a string.

    Returns:
        str: The leading underscore-delimited token of the file name.
    """
    # os.path.basename honours the platform's path separators, whereas a
    # manual split("/") only works for forward-slash paths.
    return os.path.basename(s).split("_")[0]

In [50]:
# Tag each row with the ID parsed from its image file name.
stu_df["case_id"] = stu_df["vol"].map(get_stu_pid)

In [51]:
# Same strategy for splitting as used for KiTS: split at the case level with
# the same seed. Stage 1 holds out part of the cases; stage 2 divides that
# hold-out into the final val and test sets. No test_size is passed, so
# scikit-learn's default proportion applies to both stages.
train, val = train_test_split(sorted(stu_df["case_id"].unique()), random_state=1996)
val, test = train_test_split(val, random_state=1996)

In [52]:
# Label every row with the split its case_id was assigned to. Later entries of
# the merged dict win (test over val over train), and any case_id found in
# none of the three lists falls back to "ukw".
stu_df["split"] = (
    stu_df["case_id"]
    .map(
        {
            **dict.fromkeys(train, "train"),
            **dict.fromkeys(val, "val"),
            **dict.fromkeys(test, "test"),
        }
    )
    .fillna("ukw")
)

In [53]:
# Phase label 0 sets STU apart from KiTS, which is labelled 1 (contrast phase).
# NOTE(review): the original comment stopped at "because" — presumably the STU
# scans are non-contrast; confirm and record the reason here.
stu_tup = make_dataset_tuple(stu_df, phase_label=0)

In [55]:
# Save the STU (train, val, test) tuple to the datasets folder.
# The folder is created first so this cell also runs on a fresh checkout;
# without it open() raises FileNotFoundError when ./datasets is missing.
os.makedirs("./datasets", exist_ok=True)
with open("./datasets/stu.pkl", "wb") as f:
    pkl.dump(stu_tup, f)

In [56]:
# check step1 for making the combined dataset