In [27]:
import yaml
import albumentations as albu
import cv2
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from albumentations.pytorch import ToTensorV2
from pathlib import Path

In [2]:
class SomicDataset(Dataset):
    def __init__(self, cfg: dict, augs: albu.Compose) -> None:

        self.augs = augs
        self.base = Path(cfg["base"])
        self.stems = []

        df = pd.read_csv(self.base / "info.csv")
        for query in cfg["query"]:
            stem = df.query(query)["stem"]
            self.stems += stem.to_list()

    def __getitem__(self, idx: int):

        stem = self.stems[idx]
        img = cv2.imread(str(self.base / f"images/{stem}.jpg"))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mask = cv2.imread(str(self.base / f"masks/{stem}.png"), cv2.IMREAD_GRAYSCALE)
        mask = np.expand_dims(mask, axis=-1)
        label = 0 if mask.sum() == 0 else 1

        data = self.augs(image=img, mask=mask)
        data["mask"] = data["mask"].permute(2, 0, 1)
        data["label"] = label
        data["stem"] = stem
        return data

    def __len__(self) -> int:

        return len(self.stems)


In [4]:
augs = albu.Compose([
    albu.Resize(height=256, width=256, always_apply=False, p=1),
    albu.Normalize(always_apply=False, p=1),
    ToTensorV2()
])

<br>

## Supervised Dataset

In [25]:
with open("H_tobu_supervised.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    
train_dataset = SomicDataset(cfg["train"], augs)
test_dataset = SomicDataset(cfg["test"], augs)

1781

<br>

## Semisupervised Dataset

In [30]:
with open("H_tobu_semisupervised.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    
labeled_train_dataset = SomicDataset(cfg["labeled_train"], augs)
unlabeled_train_dataset = SomicDataset(cfg["unlabeled_train"], augs)
test_dataset = SomicDataset(cfg["test"], augs)
print(f"number of labeled train dataset: {len(labeled_train_dataset)}")
print(f"number of unlabeled train dataset: {len(unlabeled_train_dataset)}")

number of labeled train dataset: 121
number of unlabeled train dataset: 700


<br>

## Unsupervised Dataset

In [31]:
with open("H_tobu_unsupervised.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    
train_dataset = SomicDataset(cfg["train"], augs)
test_dataset = SomicDataset(cfg["test"], augs)