In [1]:
# Mount Google Drive at /content/drive; every path in CONFIG below lives under
# /content/drive/MyDrive/, so this cell has to run before anything else.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyyaml pandas scikit-learn albumentations segmentation-models-pytorch -q

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/154.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m154.8/154.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Export the allocator setting before the cell that imports torch is executed.
# NOTE(review): presumably meant to reduce CUDA memory fragmentation — confirm
# against the PyTorch CUDA memory-management notes for the installed version.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [6]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import torchvision.utils as vutils
import pandas as pd
from tqdm import tqdm
import numpy as np
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix
import torch.nn.functional as F
import segmentation_models_pytorch as smp
import warnings

warnings.filterwarnings("ignore")

# -------------------------------------------------------------------
# CONFIG: UPDATE THESE VALUES
# -------------------------------------------------------------------
# Single source of settings for the whole evaluation pipeline. Every function
# below receives this dict as its `config` argument.
CONFIG = {
    # --- Paths ---
    # Generator weights (state_dict) and segmentation-network weights (state_dict).
    "GENERATOR_CHECKPOINT": "/content/drive/MyDrive/CAF-GAN/outputs/caf_gan_final/caf_gan_generator_final.pth",
    "CSEG_CHECKPOINT": "/content/drive/MyDrive/CAF-GAN/outputs/cseg_512/best_cseg_512.pth",
    # CSV of the real test split; RealTestDataset reads the columns
    # subject_id, study_id, dicom_id, Pneumonia and race_group from it.
    "REAL_DATA_CSV_TEST": "/content/drive/MyDrive/CAF-GAN/data/splits/test.csv",
    # Root folder under which real images are found as p<XX>/p<subject>/s<study>/<dicom>.jpg
    "IMAGE_DIR_REAL": "/content/drive/MyDrive/CAF-GAN/mimic-cxr-jpg-2.0.0/files/",

    # --- Synthetic Data Generation ---
    "SYNTHETIC_DATA_DIR": "/content/drive/MyDrive/CAF-GAN/data/synthetic_images_final/",
    "SYNTHETIC_CSV_PATH": "/content/drive/MyDrive/CAF-GAN/data/synthetic_images_final/labels.csv",
    "NUM_SYNTHETIC_IMAGES": 5000,  # Total number of images written by generate_synthetic_data
    "GENERATION_BATCH_SIZE": 16,     # Smaller batch for 512x512 generation

    # --- Downstream Classifier Training (TS-TR) ---
    "CLASSIFIER_OUTPUT_DIR": "/content/drive/MyDrive/CAF-GAN/outputs/downstream_classifier_final/",
    "CLASSIFIER_MODEL_NAME": "best_synth_trained_classifier.pth",
    "CLASSIFIER_IMG_SIZE": 256,    # Side length synthetic images are saved at and real images are resized to
    "CLASSIFIER_BATCH_SIZE": 32,
    "CLASSIFIER_EPOCHS": 15,
    "CLASSIFIER_LR": 0.0001,

    # --- Generator Architecture (MUST MATCH TRAINING) ---
    # NOTE(review): TARGET_IMG_SIZE is not read by the code in this cell; the
    # generation calls hard-code steps=7 instead — confirm they agree.
    "TARGET_IMG_SIZE": 512,
    "LATENT_DIM": 512,  # Passed as both z_dim and w_dim when the Generator is constructed
    "CHANNELS": 3,
    "BASE_CHANNELS": 512,

    # --- Clinical Plausibility (MUST MATCH TRAINING) ---
    # Reference mean / std of the lung-mask area expressed as a fraction of the
    # image; calculate_clinical_loss_score reports |area - mean| / std.
    "PLAUSIBLE_LUNG_AREA_MEAN": 0.220646, # Must be the value computed for the training run
    "PLAUSIBLE_LUNG_AREA_STD": 0.066277,   # Must be the value computed for the training run

    # --- System ---
    "DEVICE": "cuda" if torch.cuda.is_available() else "cpu",
    "NUM_WORKERS": 2  # DataLoader worker processes
}

# Create the output folders up front so later saves do not fail on a missing directory.
os.makedirs(CONFIG['SYNTHETIC_DATA_DIR'], exist_ok=True)
os.makedirs(CONFIG['CLASSIFIER_OUTPUT_DIR'], exist_ok=True)

# -------------------------------------------------------------------
# ARCHITECTURE: PROGRESSIVE CAF-GAN (Must match training script)
# -------------------------------------------------------------------
class PixelNorm(nn.Module):
    """Rescale the input so its root-mean-square along dim 1 is (about) one."""

    def __init__(self):
        super().__init__()
        # Added under the square root so an all-zero vector does not divide by zero.
        self.epsilon = 1e-8

    def forward(self, x):
        """Return ``x`` divided by sqrt(mean(x^2 over dim 1) + epsilon)."""
        mean_square = x.pow(2).mean(dim=1, keepdim=True)
        return x / (mean_square + self.epsilon).sqrt()

class WSConv2d(nn.Module):
    """2-D convolution whose input is multiplied by sqrt(2 / fan_in) at run time.

    The convolution weight is initialised from a standard normal and the bias
    is held on this module (not on the inner conv) and added after the
    convolution, so the bias itself is not affected by the scale factor.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: integer kernel side length.
        stride: convolution stride.
        padding: zero padding applied on each side.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)

        fan_in = in_channels * (kernel_size ** 2)
        self.scale = (2 / fan_in) ** 0.5

        # Move the bias parameter from the inner conv onto this module; the
        # state_dict therefore exposes it as "bias" rather than "conv.bias".
        self.bias = self.conv.bias
        self.conv.bias = None

        nn.init.normal_(self.conv.weight)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, x):
        """Convolve the scaled input and add the per-channel bias."""
        out = self.conv(x * self.scale)
        if self.bias is None:
            return out
        return out + self.bias.view(1, -1, 1, 1)

class InjectNoise(nn.Module):
    """Add per-pixel Gaussian noise scaled by a learned per-channel weight.

    The weight starts at zero, so a freshly constructed module returns its
    input unchanged.
    """

    def __init__(self, channels):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, channels, 1, 1))

    def forward(self, x):
        """Return ``x`` plus ``weight * noise``; one noise map is shared by all channels."""
        batch, height, width = x.shape[0], x.shape[2], x.shape[3]
        noise = torch.randn((batch, 1, height, width), device=x.device)
        return x + self.weight * noise

class AdaIN(nn.Module):
    """Instance-normalise a feature map, then scale and shift it per channel.

    The per-channel scale and shift are each produced by a linear layer
    applied to the style vector ``w``.

    Args:
        channels: number of feature-map channels.
        w_dim: size of the style vector.
    """

    def __init__(self, channels, w_dim):
        super().__init__()
        self.instance_norm = nn.InstanceNorm2d(channels)
        self.style_scale = nn.Linear(w_dim, channels)
        self.style_bias = nn.Linear(w_dim, channels)

    def forward(self, x, w):
        """Return ``scale(w) * instance_norm(x) + bias(w)``."""
        normalized = self.instance_norm(x)
        # Append two singleton axes so the linear outputs broadcast over H and W.
        gamma = self.style_scale(w)[:, :, None, None]
        beta = self.style_bias(w)[:, :, None, None]
        return gamma * normalized + beta

class MappingNetwork(nn.Module):
    """Map a latent vector z to a style vector w.

    The stack is a PixelNorm followed by eight linear layers, with a ReLU
    after every linear layer except the last.

    Args:
        z_dim: size of the input latent vector.
        w_dim: size of every hidden layer and of the output.
    """

    def __init__(self, z_dim, w_dim):
        super().__init__()
        num_layers = 8
        stack = [PixelNorm()]
        in_features = z_dim  # only the first linear layer consumes z_dim
        for depth in range(num_layers):
            stack.append(nn.Linear(in_features, w_dim))
            if depth != num_layers - 1:
                stack.append(nn.ReLU())
            in_features = w_dim
        self.mapping = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the normalisation and linear stack."""
        return self.mapping(x)

class GenBlock(nn.Module):
    """Two identical conv -> noise -> LeakyReLU -> AdaIN stages.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by both stages.
        w_dim: size of the style vector fed to each AdaIN.
    """

    def __init__(self, in_channels, out_channels, w_dim):
        super().__init__()
        self.conv1 = WSConv2d(in_channels, out_channels)
        self.conv2 = WSConv2d(out_channels, out_channels)
        self.leaky = nn.LeakyReLU(0.2, inplace=True)
        self.inject_noise1 = InjectNoise(out_channels)
        self.inject_noise2 = InjectNoise(out_channels)
        self.adain1 = AdaIN(out_channels, w_dim)
        self.adain2 = AdaIN(out_channels, w_dim)

    def forward(self, x, w):
        """Apply both stages in order, styling each with the same ``w``."""
        stages = (
            (self.conv1, self.inject_noise1, self.adain1),
            (self.conv2, self.inject_noise2, self.adain2),
        )
        for conv, inject_noise, adain in stages:
            x = adain(self.leaky(inject_noise(conv(x))), w)
        return x

class Generator(nn.Module):
    """Progressively grown, style-modulated image generator.

    Synthesis starts from a learned 4x4 constant. Each progressive step
    bilinearly doubles the spatial size and applies one GenBlock, so the
    output side length is ``4 * 2**steps``. Attribute names are part of the
    checkpoint format (state_dict keys) and must not be renamed.

    Args:
        z_dim: size of the input latent vector.
        w_dim: size of the style vector produced by the mapping network.
        base_channels: channels of the learned constant and the initial conv.
        img_channels: channels of the generated image.
    """

    def __init__(self, z_dim, w_dim, base_channels, img_channels=3):
        super().__init__()
        self.starting_const = nn.Parameter(torch.randn(1, base_channels, 4, 4))
        self.map = MappingNetwork(z_dim, w_dim)
        self.initial_conv = WSConv2d(base_channels, base_channels, kernel_size=3, padding=1)
        self.leaky = nn.LeakyReLU(0.2, inplace=True)

        # Channel width at each resolution; entry 0 is the 4x4 stage.
        self.factors = [512, 512, 512, 256, 128, 64, 32, 16]

        self.prog_blocks = nn.ModuleList()
        self.to_rgbs = nn.ModuleList()
        self.to_rgbs.append(WSConv2d(self.factors[0], img_channels, kernel_size=1, padding=0))
        for in_c, out_c in zip(self.factors[:-1], self.factors[1:]):
            self.prog_blocks.append(GenBlock(in_c, out_c, w_dim))
            self.to_rgbs.append(WSConv2d(out_c, img_channels, kernel_size=1, padding=0))

    def forward(self, z, alpha, steps):
        """Generate a batch of images.

        Args:
            z: latent batch; its first dimension sets the batch size.
            alpha: fade-in weight of the newest resolution; values below 1.0
                blend in the upsampled output of the previous resolution.
            steps: number of progressive blocks to apply (0 returns the 4x4 image).

        Returns:
            tanh-squashed image tensor.
        """
        w = self.map(z)
        x = self.starting_const.repeat(z.shape[0], 1, 1, 1)
        x = self.leaky(self.initial_conv(x))
        if steps == 0:
            return torch.tanh(self.to_rgbs[0](x))

        prev = None
        for block_idx in range(steps):
            # Keep the features from before this step for the fade-in branch.
            prev = x
            x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
            x = self.prog_blocks[block_idx](x, w)

        final_out = self.to_rgbs[steps](x)
        if not (alpha < 1.0) or prev is None:
            return torch.tanh(final_out)

        # Fade-in: mix with the previous stage's RGB output, upsampled to match.
        prev_rgb = self.to_rgbs[steps - 1](prev)
        prev_rgb_upsampled = F.interpolate(prev_rgb, scale_factor=2, mode='bilinear', align_corners=False)
        return torch.tanh(alpha * final_out + (1.0 - alpha) * prev_rgb_upsampled)

# -------------------------------------------------------------------
# DATASET DEFINITIONS
# -------------------------------------------------------------------

# Dataset for the *synthetic* images
class SyntheticDataset(Dataset):
    """Dataset over generated images listed in a DataFrame.

    Args:
        df: frame with an ``image_id`` column (file name) and a ``Pneumonia`` column (label).
        image_dir: folder that contains the image files.
        transform: callable invoked as ``transform(image=array)['image']``; skipped when falsy.
    """

    def __init__(self, df, image_dir, transform):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` where label is a float32 tensor of shape (1,)."""
        record = self.df.iloc[idx]
        path = os.path.join(self.image_dir, record['image_id'])
        # Force three channels so the array matches what the ResNet classifier expects.
        pixels = np.array(Image.open(path).convert("RGB"))
        target = torch.tensor(record['Pneumonia'], dtype=torch.float32)
        if self.transform:
            pixels = self.transform(image=pixels)['image']
        return pixels, target.unsqueeze(0)

# Dataset for the *real* test images
class RealTestDataset(Dataset):
    """Dataset over real test images addressed by subject / study / dicom ids.

    Args:
        df: frame with ``subject_id``, ``study_id``, ``dicom_id``, ``Pneumonia``
            and ``race_group`` columns.
        image_dir: root folder; files are looked up at
            ``p<first two chars of subject>/p<subject>/s<study>/<dicom>.jpg``.
        transform: optional callable invoked as ``transform(image=array)['image']``.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label, race)``; label is a float32 tensor of shape (1,)."""
        record = self.df.iloc[idx]
        subject = str(record['subject_id'])
        study = str(record['study_id'])
        relative_parts = (
            f'p{subject[:2]}',
            f'p{subject}',
            f's{study}',
            f"{record['dicom_id']}.jpg",
        )
        image_path = os.path.join(self.image_dir, *relative_parts)
        # Force three channels so the array matches what the ResNet classifier expects.
        pixels = np.array(Image.open(image_path).convert("RGB"))
        if self.transform:
            pixels = self.transform(image=pixels)['image']
        target = torch.tensor(record['Pneumonia'], dtype=torch.float32)
        return pixels, target.unsqueeze(0), record['race_group']

# -------------------------------------------------------------------
# HELPER FUNCTION (from training script)
# -------------------------------------------------------------------
def calculate_clinical_loss_score(fake_masks, config):
    """Score how far each mask's soft area lies from the reference lung area.

    The logits are passed through a sigmoid, summed over every non-batch
    dimension and divided by ``H * W`` to obtain the area fraction of each
    mask. The score is ``|area - mean| / std``, so 0 means the area equals the
    reference mean and 1 means it is one reference standard deviation away.

    Args:
        fake_masks: 4-D tensor of mask logits, indexed as (batch, channel, H, W).
        config: mapping providing ``PLAUSIBLE_LUNG_AREA_MEAN`` and
            ``PLAUSIBLE_LUNG_AREA_STD`` (both as fractions of the image area).

    Returns:
        Tensor of shape (B,) with one non-negative score per mask.
    """
    fake_masks_prob = torch.sigmoid(fake_masks)
    total_pixels = fake_masks_prob.shape[2] * fake_masks_prob.shape[3]
    mask_area_percent = fake_masks_prob.sum(dim=(1, 2, 3)) / total_pixels

    mean_area = config["PLAUSIBLE_LUNG_AREA_MEAN"]
    std_area = config["PLAUSIBLE_LUNG_AREA_STD"]

    # Plain absolute difference instead of F.l1_loss(input, 0-dim target):
    # numerically the same, but it avoids the size-mismatch UserWarning that
    # l1_loss raises when the target has to be broadcast to the input shape.
    return (mask_area_percent - mean_area).abs() / std_area

# -------------------------------------------------------------------
# EVALUATION STEP 1: GENERATE SYNTHETIC DATA
# -------------------------------------------------------------------
def generate_synthetic_data(config, device):
    """Sample images from the trained generator and write them, plus a label CSV, to disk.

    Images are generated with ``steps=7, alpha=1.0``, converted to 8-bit,
    downsized to ``CLASSIFIER_IMG_SIZE`` with Lanczos resampling and saved as
    ``synth_XXXXX.jpg`` inside ``SYNTHETIC_DATA_DIR``. A CSV with the columns
    ``image_id`` and ``Pneumonia`` is written to ``SYNTHETIC_CSV_PATH``.

    NOTE(review): the generator call below receives only noise, alpha and
    steps. The ``Pneumonia`` labels are a shuffled 50/50 array assigned by
    image index, so they are statistically independent of the image content.
    A classifier trained on these pairs cannot learn pneumonia features from
    them; confirm whether a label-conditioned generator was intended.

    Args:
        config: settings mapping; reads LATENT_DIM, BASE_CHANNELS, CHANNELS,
            GENERATOR_CHECKPOINT, NUM_SYNTHETIC_IMAGES, GENERATION_BATCH_SIZE,
            CLASSIFIER_IMG_SIZE, SYNTHETIC_DATA_DIR and SYNTHETIC_CSV_PATH.
        device: torch device the generator runs on.
    """
    print("--- 1. Generating Synthetic Dataset ---")

    # Build the generator from the `config` argument (not the module-level
    # CONFIG) so the function honours whatever settings the caller passes in.
    netG = Generator(
        config['LATENT_DIM'],
        config['LATENT_DIM'],
        config['BASE_CHANNELS'],
        config['CHANNELS']
    ).to(device)

    # Load the *full* generator state_dict
    netG.load_state_dict(torch.load(config['GENERATOR_CHECKPOINT'], map_location=device))
    netG.eval()
    print(f"‚úÖ Generator loaded from {config['GENERATOR_CHECKPOINT']}")

    num_images = config['NUM_SYNTHETIC_IMAGES']
    max_batch = config['GENERATION_BATCH_SIZE']
    out_size = config['CLASSIFIER_IMG_SIZE']

    # Balanced label array (half positive, half negative), shuffled.
    num_positive = num_images // 2
    labels = np.array([1] * num_positive + [0] * (num_images - num_positive))
    np.random.shuffle(labels)

    image_ids = []
    generated_labels = []

    with torch.no_grad():
        for i in tqdm(range(0, num_images, max_batch), desc="Generating Images"):
            # The final batch may be smaller than the configured batch size.
            batch_size = min(max_batch, num_images - i)

            # Noise vector for progressive GAN is (B, Z_DIM)
            noise = torch.randn(batch_size, config['LATENT_DIM'], device=device)

            # steps=7 gives 4 * 2**7 = 512 pixels per side; alpha=1.0 disables fade-in.
            fake_imgs_512 = netG(noise, alpha=1.0, steps=7)

            # Map [-1, 1] -> [0, 255] and move the whole batch to the CPU in a
            # single transfer, as channel-last uint8 (B, H, W, C).
            batch_uint8 = (
                fake_imgs_512.mul(0.5).add(0.5).clamp(0, 1).mul(255)
                .permute(0, 2, 3, 1).contiguous().to(torch.uint8).cpu().numpy()
            )

            for j in range(batch_size):
                img_idx = i + j
                image_id = f"synth_{img_idx:05d}.jpg"

                # Resize with PIL so Image.LANCZOS really selects Lanczos
                # resampling. Passing that PIL constant to albumentations'
                # Resize (which expects an OpenCV flag) selected bilinear
                # interpolation instead.
                img_256 = Image.fromarray(batch_uint8[j]).resize((out_size, out_size), Image.LANCZOS)
                img_256.save(os.path.join(config['SYNTHETIC_DATA_DIR'], image_id))

                image_ids.append(image_id)
                generated_labels.append(labels[img_idx])

    synthetic_df = pd.DataFrame({'image_id': image_ids, 'Pneumonia': generated_labels})
    synthetic_df.to_csv(config['SYNTHETIC_CSV_PATH'], index=False)
    print(f"‚úÖ Generated {config['NUM_SYNTHETIC_IMAGES']} images, resized to {config['CLASSIFIER_IMG_SIZE']}x{config['CLASSIFIER_IMG_SIZE']}, and saved labels.")

# -------------------------------------------------------------------
# EVALUATION STEP 2: TRAIN DOWNSTREAM CLASSIFIER (TS)
# -------------------------------------------------------------------
def train_downstream_classifier(config, device):
    """Fine-tune an ImageNet-pretrained ResNet-50 on the synthetic images.

    After every epoch the weights are checkpointed if the mean *training*
    loss improved on the best seen so far (no validation split is used). The
    best checkpoint is reloaded before returning.

    Args:
        config: settings mapping; reads SYNTHETIC_CSV_PATH, SYNTHETIC_DATA_DIR,
            CLASSIFIER_BATCH_SIZE, CLASSIFIER_EPOCHS, CLASSIFIER_LR,
            CLASSIFIER_OUTPUT_DIR, CLASSIFIER_MODEL_NAME and NUM_WORKERS.
        device: torch device to train on.

    Returns:
        The model loaded with the best-checkpoint weights.
    """
    print("\n--- 2. Training Downstream Classifier on Synthetic Data ---")

    # No resize here: the synthetic images were already saved at classifier resolution.
    augmentations = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ])

    synth_df = pd.read_csv(config['SYNTHETIC_CSV_PATH'])
    train_loader = DataLoader(
        SyntheticDataset(synth_df, config['SYNTHETIC_DATA_DIR'], augmentations),
        batch_size=config['CLASSIFIER_BATCH_SIZE'],
        shuffle=True,
        num_workers=config['NUM_WORKERS'],
    )

    # Swap the 1000-way ImageNet head for a single-logit binary head.
    model = models.resnet50(weights='IMAGENET1K_V1')
    model.fc = nn.Linear(model.fc.in_features, 1)
    model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=config['CLASSIFIER_LR'])

    checkpoint_path = os.path.join(config['CLASSIFIER_OUTPUT_DIR'], config['CLASSIFIER_MODEL_NAME'])
    num_epochs = config['CLASSIFIER_EPOCHS']
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        batch_losses = []
        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for images, labels in progress:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_losses.append(batch_loss)
            progress.set_postfix(loss=batch_loss)

        # Mean of the per-batch losses for this epoch.
        avg_loss = sum(batch_losses) / len(train_loader)
        if not (avg_loss < best_loss):
            continue

        best_loss = avg_loss
        torch.save(model.state_dict(), checkpoint_path)
        print(f"   ‚ú® Saved new best model with loss: {avg_loss:.4f}")

    print("‚úÖ Downstream classifier training complete.")
    # Restore the best checkpoint so the caller evaluates those weights.
    model.load_state_dict(torch.load(checkpoint_path))
    return model

# -------------------------------------------------------------------
# EVALUATION STEP 3: TEST ON REAL DATA (TR) - (Utility & Fairness)
# -------------------------------------------------------------------
def evaluate_on_real_data(classifier, config, device):
    """Evaluate a trained classifier on the real test split and print a report.

    The report contains utility metrics (AUC, accuracy, F1 at a 0.5
    threshold) and a fairness section: the true-positive rate of every race
    group and the gap between the highest and lowest group TPR.

    Args:
        classifier: model returning one logit per image.
        config: settings mapping; reads CLASSIFIER_IMG_SIZE,
            REAL_DATA_CSV_TEST, IMAGE_DIR_REAL, CLASSIFIER_BATCH_SIZE,
            NUM_WORKERS and NUM_SYNTHETIC_IMAGES.
        device: torch device used for inference.
    """
    print("\n--- 3. Evaluating on Real Test Data (Utility & Fairness) ---")

    transform = A.Compose([
        A.Resize(config['CLASSIFIER_IMG_SIZE'], config['CLASSIFIER_IMG_SIZE']),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ])

    test_df = pd.read_csv(config['REAL_DATA_CSV_TEST'])
    test_dataset = RealTestDataset(test_df, config['IMAGE_DIR_REAL'], transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=config['CLASSIFIER_BATCH_SIZE'], shuffle=False, num_workers=config['NUM_WORKERS'])

    classifier.eval()
    all_preds, all_labels, all_races = [], [], []

    with torch.no_grad():
        for images, labels, races in tqdm(test_loader, desc="Evaluating on Real Data"):
            # Only the images are needed on the device; labels stay on the CPU.
            outputs = classifier(images.to(device))
            all_preds.extend(torch.sigmoid(outputs).cpu().numpy().flatten())
            all_labels.extend(labels.numpy().flatten())
            all_races.extend(races)

    df_results = pd.DataFrame({'label': all_labels, 'pred_prob': all_preds, 'race': all_races})
    df_results['prediction'] = (df_results['pred_prob'] > 0.5).astype(int)

    # --- Objective 1: utility ---
    if df_results['label'].nunique() < 2:
        # roc_auc_score raises when only one class is present; report NaN
        # instead of aborting the rest of the report.
        auc = float('nan')
    else:
        auc = roc_auc_score(df_results['label'], df_results['pred_prob'])
    accuracy = accuracy_score(df_results['label'], df_results['prediction'])
    f1 = f1_score(df_results['label'], df_results['prediction'])

    print("\n" + "="*50)
    print("üìä EVALUATION REPORT (TS-TR)")
    print(f"Trained on {config['NUM_SYNTHETIC_IMAGES']} synthetic images. Evaluated on {len(df_results)} real test images.")
    print("\nüéØ Objective 1: Downstream Utility")
    print(f"AUC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # --- Objective 2: fairness ---
    tpr_per_group = {}
    positive_cases = df_results[df_results['label'] == 1]

    print("\n‚öñÔ∏è Objective 2: Fairness (Equal Opportunity)")
    print("True Positive Rate (TPR) by Group:")
    # Enumerate every group in the test set, not just those that have positive
    # cases; otherwise groups without positives are silently left out and the
    # "N/A" branch below can never run.
    for group in sorted(df_results['race'].unique()):
        group_df = positive_cases[positive_cases['race'] == group]
        if len(group_df) == 0:
            print(f"  - {group}: N/A (0 positive samples)")
            continue

        # TPR = correctly flagged positives / all positives in the group.
        tp = group_df['prediction'].sum()
        total_positives = len(group_df)
        tpr = tp / total_positives
        tpr_per_group[group] = tpr
        print(f"  - {group}: {tpr:.4f}  (TP: {tp} / Total: {total_positives})")

    if tpr_per_group:
        eod = max(tpr_per_group.values()) - min(tpr_per_group.values())
        print(f"\nEqual Opportunity Difference (Max TPR - Min TPR): {eod:.4f}")
    else:
        print("\n**Could not calculate Equal Opportunity Difference.**")

# -------------------------------------------------------------------
# EVALUATION STEP 4: EVALUATE CLINICAL PLAUSIBILITY
# -------------------------------------------------------------------
def evaluate_clinical_plausibility(config, device, num_eval_images=1000):
    """Score freshly generated images with the lung-area plausibility metric.

    Images are generated with ``steps=7, alpha=1.0``, segmented by the Cseg
    U-Net, and each predicted mask is scored with
    ``calculate_clinical_loss_score``. The mean and standard deviation of the
    scores are printed.

    Args:
        config: settings mapping; reads LATENT_DIM, BASE_CHANNELS, CHANNELS,
            GENERATOR_CHECKPOINT, CSEG_CHECKPOINT and GENERATION_BATCH_SIZE,
            plus whatever ``calculate_clinical_loss_score`` reads.
        device: torch device both networks run on.
        num_eval_images: number of images to generate and score (default 1000,
            the value that used to be hard-coded).
    """
    print("\n--- 4. Evaluating Clinical Plausibility (L_area) ---")

    # 1. Load Generator. Built from the `config` argument (not the module-level
    # CONFIG) so the function honours whatever settings the caller passes in.
    netG = Generator(
        config['LATENT_DIM'],
        config['LATENT_DIM'],
        config['BASE_CHANNELS'],
        config['CHANNELS']
    ).to(device)
    netG.load_state_dict(torch.load(config['GENERATOR_CHECKPOINT'], map_location=device))
    netG.eval()
    print(f"‚úÖ Generator loaded from {config['GENERATOR_CHECKPOINT']}")

    # 2. Load Cseg
    Cseg = smp.Unet("resnet34", in_channels=3, classes=1).to(device)
    Cseg.load_state_dict(torch.load(config['CSEG_CHECKPOINT'], map_location=device))
    Cseg.eval()
    print(f"‚úÖ Cseg loaded from {config['CSEG_CHECKPOINT']}")

    all_clinical_scores = []

    with torch.no_grad():
        for i in tqdm(range(0, num_eval_images, config['GENERATION_BATCH_SIZE']), desc="Evaluating Plausibility"):
            # The final batch may be smaller than the configured batch size.
            batch_size = min(config['GENERATION_BATCH_SIZE'], num_eval_images - i)
            noise = torch.randn(batch_size, config['LATENT_DIM'], device=device)

            # steps=7 gives 4 * 2**7 = 512 pixels per side; alpha=1.0 disables fade-in.
            fake_imgs_512 = netG(noise, alpha=1.0, steps=7)

            # The generator output is fed to Cseg as-is (tanh range).
            # NOTE(review): confirm Cseg was trained on inputs in this range.
            fake_masks = Cseg(fake_imgs_512)

            # One L_area score per generated image.
            scores = calculate_clinical_loss_score(fake_masks, config)
            all_clinical_scores.append(scores.cpu())

    all_clinical_scores = torch.cat(all_clinical_scores)
    mean_score = torch.mean(all_clinical_scores).item()
    std_score = torch.std(all_clinical_scores).item()

    print("\n## ü©∫ Objective 3: Clinical Plausibility (L_area Score)")
    print(f"Evaluated on {num_eval_images} synthetic images.")
    print(f"Mean L_area Score: {mean_score:.4f}")
    print(f"Std Dev L_area Score: {std_score:.4f}")
    print("="*50)

# -------------------------------------------------------------------
# MAIN EXECUTION
# -------------------------------------------------------------------
def main(config):
    """Run the four evaluation stages in order.

    1. Generate the synthetic dataset.
    2. Train the downstream classifier on it.
    3. Evaluate that classifier on the real test split (utility and fairness).
    4. Score generated images for clinical plausibility.

    Args:
        config: settings mapping; ``config['DEVICE']`` selects the torch device.
    """
    target_device = config['DEVICE']
    print(f"Using device: {target_device}")

    # --- Objectives 1 & 2: utility and fairness (train on synthetic, test on real) ---
    generate_synthetic_data(config, target_device)
    classifier = train_downstream_classifier(config, target_device)
    evaluate_on_real_data(classifier, config, target_device)

    # --- Objective 3: clinical plausibility of generated images ---
    evaluate_clinical_plausibility(config, target_device)

# Entry point: runs when the cell is executed as the top-level script/notebook.
if __name__ == '__main__':
    # Mount Google Drive so the /content/drive/... paths in CONFIG resolve.
    # drive.mount reports "already mounted" if an earlier cell did this.
    from google.colab import drive
    drive.mount('/content/drive')

    # Change the working directory to the project folder (IPython line magic;
    # this line is not valid in a plain .py script).
    %cd /content/drive/MyDrive/CAF-GAN/

    # Run the full evaluation: generate -> train -> test on real data -> plausibility.
    main(CONFIG)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CAF-GAN
Using device: cuda
--- 1. Generating Synthetic Dataset ---
‚úÖ Generator loaded from /content/drive/MyDrive/CAF-GAN/outputs/caf_gan_final/caf_gan_generator_final.pth


Generating Images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 313/313 [32:10<00:00,  6.17s/it]


‚úÖ Generated 5000 images, resized to 256x256, and saved labels.

--- 2. Training Downstream Classifier on Synthetic Data ---
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97.8M/97.8M [00:00<00:00, 190MB/s]
Epoch 1/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [00:59<00:00,  2.62it/s, loss=0.627]


   ‚ú® Saved new best model with loss: 0.7063


Epoch 2/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.56it/s, loss=0.647]


   ‚ú® Saved new best model with loss: 0.6964


Epoch 3/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.57it/s, loss=0.698]


   ‚ú® Saved new best model with loss: 0.6874


Epoch 4/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:02<00:00,  2.51it/s, loss=0.618]


   ‚ú® Saved new best model with loss: 0.6752


Epoch 5/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.56it/s, loss=0.661]


   ‚ú® Saved new best model with loss: 0.6481


Epoch 6/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.55it/s, loss=0.51]


   ‚ú® Saved new best model with loss: 0.5895


Epoch 7/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.56it/s, loss=0.361]


   ‚ú® Saved new best model with loss: 0.4864


Epoch 8/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.54it/s, loss=0.734]


   ‚ú® Saved new best model with loss: 0.3838


Epoch 9/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.55it/s, loss=0.544]


   ‚ú® Saved new best model with loss: 0.2645


Epoch 10/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.55it/s, loss=0.0234]


   ‚ú® Saved new best model with loss: 0.2122


Epoch 11/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.55it/s, loss=0.0486]


   ‚ú® Saved new best model with loss: 0.1672


Epoch 12/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.54it/s, loss=0.0173]


   ‚ú® Saved new best model with loss: 0.1259


Epoch 13/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.55it/s, loss=0.183]


   ‚ú® Saved new best model with loss: 0.0976


Epoch 14/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [01:01<00:00,  2.54it/s, loss=2.16]
Epoch 15/15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [00:59<00:00,  2.62it/s, loss=0.00684]


   ‚ú® Saved new best model with loss: 0.0963
‚úÖ Downstream classifier training complete.

--- 3. Evaluating on Real Test Data (Utility & Fairness) ---


Evaluating on Real Data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:38<00:00,  9.83s/it]



üìä EVALUATION REPORT (TS-TR)
Trained on 5000 synthetic images. Evaluated on 301 real test images.

üéØ Objective 1: Downstream Utility
AUC: 0.5149
Accuracy: 0.3953
F1-Score: 0.5260

‚öñÔ∏è Objective 2: Fairness (Equal Opportunity)
True Positive Rate (TPR) by Group:
  - ASIAN: 1.0000  (TP: 4 / Total: 4)
  - BLACK: 0.8333  (TP: 15 / Total: 18)
  - HISPANIC/LATINO: 0.8333  (TP: 5 / Total: 6)
  - OTHER: 0.7500  (TP: 6 / Total: 8)
  - WHITE: 0.8987  (TP: 71 / Total: 79)

Equal Opportunity Difference (Max TPR - Min TPR): 0.2500

--- 4. Evaluating Clinical Plausibility (L_area) ---
‚úÖ Generator loaded from /content/drive/MyDrive/CAF-GAN/outputs/caf_gan_final/caf_gan_generator_final.pth
‚úÖ Cseg loaded from /content/drive/MyDrive/CAF-GAN/outputs/cseg_512/best_cseg_512.pth


Evaluating Plausibility: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:23<00:00,  2.64it/s]


## ü©∫ Objective 3: Clinical Plausibility (L_area Score)
Evaluated on 1000 synthetic images.
Mean L_area Score: 0.3856
Std Dev L_area Score: 0.2974



