In [1]:
# Mount Google Drive at /content/drive; the mask directory and split CSV
# referenced in CONFIG below are read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyyaml pandas scikit-learn albumentations segmentation-models-pytorch -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from tqdm import tqdm

# --- 1. Configuration ---
# --- (Update these paths to match your Drive) ---
CONFIG = dict(
    # Folder holding the ground-truth mask PNGs
    MASK_DIR="/content/drive/MyDrive/CAF-GAN/data/masks_512x512/",
    # CSV listing the training split
    TRAIN_CSV_PATH="/content/drive/MyDrive/CAF-GAN/data/splits/train.csv",
    # Side length (in pixels) of the square masks
    IMG_SIZE=512,
    # Larger batches make the pass over the masks faster
    BATCH_SIZE=32,
    # Number of DataLoader worker processes
    NUM_WORKERS=2,
)

# Announce what is about to run and where the masks come from
print("Starting calculation of mask area statistics...")
print("Loading masks from: {}".format(CONFIG["MASK_DIR"]))

# --- 2. Simplified Dataset ---
# This dataset *only* loads the ground-truth masks
class MaskAreaDataset(Dataset):
    """Dataset that yields only binary ground-truth masks as ``(1, H, W)`` tensors.

    Each row of ``df`` must provide a ``dicom_id``; the mask is read from
    ``<mask_dir>/<dicom_id>.png``.

    Args:
        df: DataFrame with a ``dicom_id`` column, one row per mask.
        mask_dir: Directory containing the mask PNG files.
        img_size: Side length of the square masks. Masks whose size differs
            are resized (nearest-neighbour) so every item has the same shape.
    """

    def __init__(self, df, mask_dir, img_size):
        self.df = df
        self.mask_dir = mask_dir
        self.img_size = img_size

    def __len__(self):
        """Return the number of rows (and therefore masks) in the split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load mask ``idx`` as a float32 tensor of 0.0/1.0 values.

        Returns:
            A ``(1, img_size, img_size)`` float32 tensor, or ``None`` when the
            mask cannot be loaded (a warning is printed; the collate function
            is expected to drop ``None`` entries).
        """
        row = self.df.iloc[idx]
        dicom_id = row['dicom_id']
        mask_path = os.path.join(self.mask_dir, f"{dicom_id}.png")

        try:
            # Context manager guarantees the file handle is released promptly.
            with Image.open(mask_path) as img:
                gray = img.convert("L")

                # Enforce the configured size so batches collate cleanly and
                # area fractions are comparable. NEAREST keeps the mask binary.
                expected_size = (self.img_size, self.img_size)
                if gray.size != expected_size:
                    gray = gray.resize(expected_size, Image.NEAREST)

                mask = np.array(gray, dtype=np.uint8)

            # Binarize: every non-zero pixel is foreground. This handles masks
            # stored as 0/255 and 0/1 alike, and prevents stray gray levels
            # (e.g. 128) from being summed at their raw value.
            binary = (mask > 0).astype(np.float32)

            # Add channel dimension (H, W) -> (1, H, W)
            return torch.from_numpy(binary).unsqueeze(0)

        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller skip it.
            print(f"Warning: Could not load mask {mask_path}. Skipping. Error: {e}")
            return None

# Custom collate function to filter out None values from failed loads
def custom_collate(batch):
    """Collate a batch while discarding samples that failed to load.

    Entries equal to ``None`` are removed before delegating to PyTorch's
    default collation. If nothing remains, an empty tensor is returned so the
    caller can detect and skip the batch.
    """
    loaded = [sample for sample in batch if sample is not None]
    if len(loaded) == 0:
        return torch.Tensor()
    return torch.utils.data.dataloader.default_collate(loaded)

# --- 3. Main Calculation Script ---
def calculate_stats():
    """Compute and print the mean and std of the mask area fraction.

    Reads the split CSV at ``CONFIG['TRAIN_CSV_PATH']``, loads each row's mask
    through ``MaskAreaDataset`` and measures, per mask, the fraction of the
    image covered (mean pixel value of a 0/1 mask). The mean and standard
    deviation over all successfully loaded masks are printed in a form that can
    be pasted into another config. Nothing is returned.

    Raises:
        RuntimeError: If no mask could be loaded, because statistics over an
            empty set are undefined.
    """
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {DEVICE}")

    train_df = pd.read_csv(CONFIG['TRAIN_CSV_PATH'])

    dataset = MaskAreaDataset(train_df, CONFIG['MASK_DIR'], CONFIG['IMG_SIZE'])

    loader = DataLoader(
        dataset,
        batch_size=CONFIG['BATCH_SIZE'],
        shuffle=False,  # Order is irrelevant for aggregate statistics
        num_workers=CONFIG['NUM_WORKERS'],
        collate_fn=custom_collate
    )

    batch_area_fractions = []

    print(f"Iterating over {len(dataset)} masks...")
    for masks in tqdm(loader):
        # custom_collate returns an empty tensor when every sample in the
        # batch failed to load; there is nothing to measure in that case.
        if masks.nelement() == 0:
            continue

        masks = masks.to(DEVICE)  # (B, 1, H, W)

        # Per-mask mean over (C, H, W) is the covered fraction of the image.
        # Using the mean instead of sum / IMG_SIZE**2 keeps the result correct
        # even if the masks are not exactly IMG_SIZE x IMG_SIZE.
        # (B, 1, H, W) -> (B,)
        area_fraction = masks.mean(dim=(1, 2, 3))

        batch_area_fractions.append(area_fraction.cpu())

    # torch.cat fails with an opaque message on an empty list; fail clearly instead.
    if not batch_area_fractions:
        raise RuntimeError(
            f"No valid masks could be loaded from {CONFIG['MASK_DIR']}; "
            "cannot compute area statistics."
        )

    # Concatenate all batch results into one 1-D tensor
    all_area_fractions = torch.cat(batch_area_fractions)

    # torch.std uses the unbiased (Bessel-corrected) estimator by default,
    # so a single valid mask yields NaN for the std.
    mean_val = torch.mean(all_area_fractions).item()
    std_val = torch.std(all_area_fractions).item()

    print("\n" + "="*50)
    print("✅ CALCULATION COMPLETE ✅")
    print(f"\nTotal valid masks processed: {len(all_area_fractions)}")

    print("\nCopy these values into your 'CONFIG' dictionary in the GAN training script:")
    print(f"\"PLAUSIBLE_LUNG_AREA_MEAN\": {mean_val:.6f},")
    print(f"\"PLAUSIBLE_LUNG_AREA_STD\": {std_val:.6f},")
    print("="*50)

# Entry point: run the statistics pass when this code is executed directly
# (as a script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    calculate_stats()

Starting calculation of mask area statistics...
Loading masks from: /content/drive/MyDrive/CAF-GAN/data/masks_512x512/
Using device: cuda
Iterating over 1399 masks...


100%|██████████| 44/44 [10:49<00:00, 14.77s/it]


✅ CALCULATION COMPLETE ✅

Total valid masks processed: 1399

Copy these values into your 'CONFIG' dictionary in the GAN training script:
"PLAUSIBLE_LUNG_AREA_MEAN": 0.220646,
"PLAUSIBLE_LUNG_AREA_STD": 0.066277,



