# Diffusion Model Training on PSC Bridges2

This notebook is adapted to run on PSC's Bridges2 supercomputer.

## Prerequisites:
1. SSH into Bridges2 via VSCode
2. Navigate to `/jet/home/<your_username>`
3. Request a GPU node: `interact -p GPU-shared --gres=gpu:v100-32:1 -t 8:00:00 -A cis250019p`
4. Load Anaconda: `module load anaconda3`
5. Activate your environment: `conda activate diffusion_env` (created in Step 1 below)
6. Start Jupyter: `jupyter notebook --no-browser --ip=0.0.0.0`
7. Connect to the Jupyter server in VSCode

## Step 1: Environment Setup

Run these commands in the terminal BEFORE starting Jupyter:

```bash
# Create environment (first time only)
conda create -n diffusion_env python=3.10 -y
conda activate diffusion_env

# Install PyTorch with CUDA support
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install other dependencies
pip install numpy pillow tqdm wandb ruamel.yaml gdown torchmetrics
pip install jupyter ipykernel

# Register kernel
python -m ipykernel install --user --name diffusion_env --display-name "Python (diffusion)"
```

## Step 2: Verify Setup and Check GPU

In [None]:
# Sanity-check the session before doing anything else: print the working
# directory, the node's hostname, and the GPU status table.
# Each `!` line is run as a shell command by IPython.
!pwd
!hostname
!nvidia-smi

In [None]:
!pip install gdown einops pytorch-lightning

In [None]:
import os
import sys

import torch

# Report the installed PyTorch build and whether this kernel can see a CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# GPU details are only queried when CUDA is usable.
if torch.cuda.is_available():
    print(
        f"CUDA version: {torch.version.cuda}",
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"Number of GPUs: {torch.cuda.device_count()}",
        sep="\n",
    )

## Step 3: Navigate to Your Project Directory

In [None]:
# TODO: Replace <your_username> and <your_repo_name> with your actual values
import os

# Change to your project directory
project_dir = "/jet/home/<username>/<your_repo_name>"  # TODO: Update this!
os.chdir(project_dir)
print(f"Current directory: {os.getcwd()}")

# Verify project structure
print("\nProject files:")
!ls -la

## Step 4: Download and Convert CIFAR-10 to ImageFolder Format

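Following the original Colab approach - download CIFAR-10 and convert to ImageFolder structure. Note: the CIFAR-10 cells in this step are kept for reference only; they are disabled (wrapped in triple-quoted strings). The active pipeline uses the ImageNet-100 download further down.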

In [None]:
# Download and distribute CIFAR10 data into train and test
# NOTE: disabled. Everything below is wrapped in a string literal, so none of
# it executes. If re-enabled, it would set `root` / `out_root` to the CIFAR-10
# ImageFolder location and create the `root` directory.
"""
root = "./data"
out_root = f"{root}/cifar10_imagefolder"
os.makedirs(root, exist_ok=True)

print(f"Data will be saved to: {out_root}")
"""

In [None]:
# NOTE: disabled. The whole cell body is one string literal, so nothing here
# runs. If re-enabled, it would download CIFAR-10 through torchvision into
# `root` and define `train_ds`, `test_ds` and `classes`.
"""
from torchvision.datasets import CIFAR10
import shutil
from tqdm import tqdm

# Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.
# Tech Report: https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
# Dataset link (original not torchvision): https://www.cs.toronto.edu/~kriz/cifar.html
train_ds = CIFAR10(root=root, train=True,  download=True)
test_ds  = CIFAR10(root=root, train=False, download=True)
classes = train_ds.classes  # ['airplane','automobile',...,'truck']

print(f"Train samples: {len(train_ds)}")
print(f"Test samples: {len(test_ds)}")
print(f"Classes: {classes}")
"""



### ImageNet-100 Download for VAE

In [None]:
# Root folder for all datasets, and the folder holding the ImageNet-100
# (128x128) images underneath it.
root = "./data"
out_root = root + "/imagenet100_128x128"

# Make sure the data root exists before anything is downloaded into it.
os.makedirs(root, exist_ok=True)

print("Data will be saved to: {}".format(out_root))

In [None]:
import gdown
import tarfile
import os
from tqdm import tqdm

# Download ImageNet-100 (128x128) from Google Drive and unpack it under ./data.
# The cell is idempotent: the download is skipped when the archive is already
# on disk, and both download and extraction are skipped when the extracted
# folder already exists.
root = "./data"
os.makedirs(root, exist_ok=True)

# Download ImageNet-100 128x128
tar_path = f"{root}/imagenet100_128x128.tar.gz"
extract_path = f"{root}/imagenet100_128x128"

if not os.path.exists(extract_path):
    # Download from Google Drive
    if not os.path.exists(tar_path):
        print("Downloading ImageNet-100 128x128 from Google Drive...")
        url = "https://drive.google.com/file/d/11Pk4QvNX3fZkpa_ZvdjD_fQFjCBoa1M0/view"
        downloaded = gdown.download(url, tar_path, quiet=False, fuzzy=True)
        # NOTE(review): some gdown versions report failure by returning None
        # instead of raising; stop here rather than failing later in tarfile.
        if downloaded is None or not os.path.exists(tar_path):
            raise RuntimeError(
                f"Download failed: {tar_path} was not created. "
                "Check the Google Drive link/quota and re-run this cell."
            )
        print("Download complete!")

    # Extract tar.gz
    print("Extracting archive...")
    with tarfile.open(tar_path, 'r:gz') as tar:
        # The archive comes from the network: where the interpreter supports
        # extraction filters, use the "data" filter so members cannot escape
        # `root` (absolute paths, "..", links pointing outside).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(root, filter="data")
        else:
            tar.extractall(root)
    print("Extraction complete!")

    # Clean up tar file (optional)
    # os.remove(tar_path)
else:
    print(f"ImageNet-100 already exists at {extract_path}")

# The archive's top-level folder is assumed to be `imagenet100_128x128`;
# report clearly if it unpacked somewhere else instead of failing in listdir.
if not os.path.isdir(extract_path):
    raise FileNotFoundError(
        f"Expected extracted dataset at {extract_path}, but it does not exist. "
        f"Inspect the contents of {root} and adjust `extract_path`."
    )

# Check the structure
print("\nDataset structure:")
print(f"Contents of {extract_path}:")
for item in sorted(os.listdir(extract_path)):  # sorted for a stable listing
    item_path = os.path.join(extract_path, item)
    if os.path.isdir(item_path):
        num_items = len(os.listdir(item_path))
        print(f"  {item}/  ({num_items} items)")
    else:
        print(f"  {item}")

In [None]:
from torchvision.datasets import ImageFolder
from torchvision import transforms

# Transforms for 128x128 ImageNet-100.
# These must be live code (not a disabled string): the ImageFolder calls below
# reference `train_transform` / `val_transform` and would otherwise raise
# NameError on a fresh kernel.
# Normalize with mean 0.5 / std 0.5 per channel rescales ToTensor's [0, 1]
# output to [-1, 1].
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # augmentation for training only
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

# Load datasets (adjust paths based on actual structure).
# `extract_path` comes from the download cell above.
train_ds = ImageFolder(root=f"{extract_path}/train", transform=train_transform)
test_ds = ImageFolder(root=f"{extract_path}/validation", transform=val_transform)

classes = train_ds.classes
print(f"\nTrain samples: {len(train_ds)}")
print(f"Test samples: {len(test_ds)}")
print(f"Number of classes: {len(classes)}")

In [None]:
# Export to ImageFolder: ./data/cifar10_imagefolder/{train|val}/{class}/img_XXXXX.png
# NOTE: disabled. The body below is a string literal, so `dump_split` is never
# defined or called. If re-enabled, it deletes and recreates each split folder
# under `out_root`, then writes one PNG per sample into a per-class subfolder.
# It calls `img.save(...)`, so it presumably expects datasets that yield PIL
# images (no tensor transform) — verify before re-enabling.
"""
def dump_split(ds, split_name):
    split_dir = f"{out_root}/{split_name}"
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir, exist_ok=True)
    # make per-class dirs
    for c in classes:
        os.makedirs(os.path.join(split_dir, c), exist_ok=True)
    # write PNGs
    for i in tqdm(range(len(ds)), desc=f"Writing {split_name} set"):
        img, label = ds[i]                       # PIL Image, int label
        cls = classes[label]
        img.save(os.path.join(split_dir, cls, f"img_{i:05d}.png"))

dump_split(train_ds, "train")
dump_split(test_ds,  "val")

print("\nDone! Point train.py --data_dir to:", f"{out_root}/train")
print("Validation dir:", f"{out_root}/val")
"""

In [None]:
# Verify the structure
# NOTE: disabled. The body below is a string literal, so nothing is listed.
# It targets the CIFAR-10 export layout (it lists the `airplane` class folder).
"""
#for CIFAR10
print("\nDirectory structure:")
!ls -lh {out_root}/
print("\nTrain classes:")
!ls {out_root}/train/
print("\nSample counts per class (train):")
!ls {out_root}/train/airplane/ | wc -l
"""

## Step 4b: Set Data Paths

In [None]:
# Folders handed to train.py through --data_dir / --val_dir.
data_dir = "{}/train".format(out_root)
# The CIFAR-10 export names this split "val" instead: val_dir = f"{out_root}/val"
val_dir = "{}/validation".format(out_root)

# Echo both paths so a wrong `out_root` is spotted before training starts.
print(f"Training data: {data_dir}", f"Validation data: {val_dir}", sep="\n")

## Step 5: Configure Weights & Biases (W&B)

Set up W&B for experiment tracking.

In [None]:
import getpass
import os

import wandb

# W&B destination for this project. The values are also exported as
# environment variables so processes started from this kernel inherit them.
WANDB_ENTITY = "Diffusion-F25DL_Project"
WANDB_PROJECT = "DiffusionRuns"
os.environ["WANDB_ENTITY"] = WANDB_ENTITY
os.environ["WANDB_PROJECT"] = WANDB_PROJECT

# Never paste the API key into the notebook: it would be saved (and possibly
# committed) with the file. Use WANDB_API_KEY from the environment when it is
# set; otherwise prompt for it once without echoing.
# Get your key from https://wandb.ai/authorize
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass.getpass("W&B API key: ")
wandb.login(key=os.environ["WANDB_API_KEY"])

RESUME_LOGGING = False # Set this to true if you are resuming training from a previous run
RESUME_RUN_ID = ""  # TODO: id of the run to resume (only used when RESUME_LOGGING is True)

# Create your wandb run
#run_name = '{}_checkpoint_submission_1'.format(config['Name'])
run_name = '<insert-run-name>'  # TODO: Update this!

if RESUME_LOGGING:
    # resume="must" needs an existing run id; fail with a clear message
    # instead of calling wandb.init with an empty id.
    if not RESUME_RUN_ID:
        raise ValueError("Set RESUME_RUN_ID to the W&B run id you want to resume.")
    run = wandb.init(
        id       = RESUME_RUN_ID,
        resume   = "must", ### You need this to resume previous runs
        entity   = WANDB_ENTITY,
        project  = WANDB_PROJECT, ### Project should be created in your wandb
        settings = wandb.Settings(_service_wait=300)
    )
else:
    run = wandb.init(
        name    = run_name, ### Wandb creates random run names if you skip this field, we recommend you give useful names
        reinit  = True, ### Allows reinitalizing runs when you re-run this cell
        entity  = WANDB_ENTITY,
        project = WANDB_PROJECT ### Project should be created in your wandb account
        #config  = config ### Wandb Config for your run
    )



## Step 6: Check Your Config File

In [None]:
# Print the training config so it can be reviewed before launching a run.
config_path = "configs/ddpm.yaml"  # TODO: Update if using a different config

if not os.path.exists(config_path):
    # Nothing to show; report the path that was looked up.
    print(f"Config file not found: {config_path}")
else:
    print(f"Config file: {config_path}")
    print("=" * 60)
    with open(config_path, 'r') as cfg_file:
        config_text = cfg_file.read()
    print(config_text)

## Step 7: Run Training

### Option A: Match Original Colab Command (Recommended)

In [None]:
# Navigate to project directory
%cd /jet/home/dkilari/F25-Deep-Learning-Project 

# Training command matching original Colab setup
"""
!python train.py \
  --output_dir experiments \
  --data_dir {data_dir} \
  --val_dir {val_dir} \
  --image_size 32 --unet_in_size 32 \
  --num_classes 10 \
  --batch_size 64 --num_workers 8 \
  --num_epochs 50 --learning_rate 1e-4 --weight_decay 1e-4 \
  --num_train_timesteps 1000 --num_inference_steps 50 \
  --beta_start 0.0001 --beta_end 0.02 --beta_schedule linear \
  --use_ddim False --ddim_eta 0.0 \
  --latent_ddpm True \
  --run_name psc_imagenet_vae_no_ddim_1
"""

!python train.py \
  --output_dir experiments \
  --data_dir {data_dir} \
  --val_dir {val_dir} \
  --image_size 128 --unet_in_size 32 \
  --num_classes 100 \
  --batch_size 64 \
  --num_workers 4 \
  --num_epochs 100 \
  --unet_ch 160 \
  --unet_ch_mult 1 2 2 3 \
  --unet_attn 1 2 3 \
  --unet_num_res_blocks 2 \
  --unet_dropout 0.1 \
  --learning_rate 2e-4 --weight_decay 1e-5 \
  --num_train_timesteps 1000 --num_inference_steps 100 \
  --beta_start 0.0001 --beta_end 0.02 --beta_schedule linear \
  --latent_ddpm True \
  --use_ddim False --ddim_eta 0.0 \
  --eval_fid_is True \
  --run_name imagenet100_vae_sc_4

In [None]:
# Close the W&B run opened in Step 5 once training is done.
wandb.finish()

### Option B: With Config File (if you have one)

In [None]:
# Training with config file
# NOTE: disabled. The command below is inside a string literal and does not
# run. It points at configs/cifar10.yaml.
"""
!python train.py \
    --config configs/cifar10.yaml \
    --data_dir {data_dir} \
    --val_dir {val_dir}
"""

### Option C: With Classifier-Free Guidance

In [None]:
# Add CFG for conditional generation
# NOTE: disabled. The command below is inside a string literal and does not
# run. Its settings are for 32x32 images with 10 classes, with DDIM and
# classifier-free guidance (scale 3.0) switched on.
"""
!python train.py \
  --data_dir {data_dir} \
  --val_dir {val_dir} \
  --image_size 32 --unet_in_size 32 \
  --num_classes 10 \
  --batch_size 64 --num_workers 4 \
  --num_epochs 50 --learning_rate 1e-4 \
  --num_train_timesteps 1000 --num_inference_steps 50 \
  --beta_start 0.0001 --beta_end 0.02 \
  --use_ddim True --ddim_eta 0.0 \
  --use_cfg True --cfg_guidance_scale 3.0 \
  --run_name psc_cifar10_cfg
"""

### Option D: Background Training (For Long Runs)

In [None]:
# Run in background - useful if your Jupyter session might disconnect
# Note: Run this in the terminal, not in Jupyter

print("To run in background, use this command in the terminal:")
print(f"""\nnohup python train.py \\
  --data_dir {data_dir} \\
  --val_dir {val_dir} \\
  --image_size 32 --unet_in_size 32 \\
  --num_classes 10 \\
  --batch_size 64 --num_workers 4 \\
  --num_epochs 50 --learning_rate 1e-4 \\
  --num_train_timesteps 1000 --num_inference_steps 50 \\
  --use_ddim True --ddim_eta 0.0 \\
  > training.log 2>&1 &

# Then monitor with:
tail -f training.log
""")


## Step 8: Monitor Training Progress

Check your W&B dashboard or view the training log.

In [None]:
# Check experiments directory
# NOTE: disabled. The body below is a string literal, so nothing runs.
# If re-enabled, it lists experiments/ and the checkpoints of the
# lexicographically last `experiments/exp-*` folder.
"""
!ls -lh experiments/

# View latest experiment
import glob
experiments = sorted(glob.glob('experiments/exp-*'))
if experiments:
    latest_exp = experiments[-1]
    print(f"\nLatest experiment: {latest_exp}")
    print("\nCheckpoints:")
    !ls -lh {latest_exp}/checkpoints/
"""

## Step 9: View Generated Images

In [None]:
# NOTE: disabled. The whole cell body is a string literal, so nothing runs.
# If re-enabled, it only prints a pointer to the W&B dashboard and the latest
# experiment folder. It uses `glob`, for which no live import is visible in
# this notebook — add `import glob` before re-enabling.
"""
from PIL import Image
import matplotlib.pyplot as plt

# Find the latest experiment
experiments = sorted(glob.glob('experiments/exp-*'))
if experiments:
    latest_exp = experiments[-1]
    
    # Check if there are any generated images saved locally
    # (Your training script saves to W&B, but you can add local saving too)
    print(f"Check W&B dashboard for generated images!")
    print(f"Experiment directory: {latest_exp}")
"""