<a href="https://colab.research.google.com/github/sid0nair/3D-CNN-/blob/main/diffusion_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Query the NVIDIA driver for the GPU attached to this runtime.
# NOTE(review): a "command not found" result presumably means the session has
# no GPU runtime attached — confirm the accelerator setting before training.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
# Clone stable-dreamfusion once and enter it.
# Starting from /content and skipping the clone when the folder already exists
# keeps this cell re-runnable: a second run neither fails on the existing
# directory nor nests a second checkout inside the first.
%cd /content
![ -d stable-dreamfusion ] || git clone https://github.com/ashawkey/stable-dreamfusion.git
%cd /content/stable-dreamfusion

Cloning into 'stable-dreamfusion'...
remote: Enumerating objects: 1281, done.[K
remote: Counting objects: 100% (705/705), done.[K
remote: Compressing objects: 100% (145/145), done.[K
remote: Total 1281 (delta 616), reused 560 (delta 560), pack-reused 576 (from 1)[K
Receiving objects: 100% (1281/1281), 17.13 MiB | 19.03 MiB/s, done.
Resolving deltas: 100% (792/792), done.
/content/stable-dreamfusion


In [3]:
# Install the repo's dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel. The path is relative, so the working
# directory must be the repo root.
%pip install -r requirements.txt

Collecting git+https://github.com/NVlabs/nvdiffrast/ (from -r requirements.txt (line 36))
  Cloning https://github.com/NVlabs/nvdiffrast/ to /tmp/pip-req-build-yv7vy07b
  Running command git clone --filter=blob:none --quiet https://github.com/NVlabs/nvdiffrast/ /tmp/pip-req-build-yv7vy07b
  Resolved https://github.com/NVlabs/nvdiffrast/ to commit 729261dc64c4241ea36efda84fbf532cc8b425b8
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/openai/CLIP.git (from -r requirements.txt (line 44))
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-y52pbr6i
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-y52pbr6i
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ninja (from -r requirements.txt (line 3))
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010

In [4]:
# Install nvdiffrast into the kernel's environment, pinned to the commit that
# the unpinned install resolved to, so a later re-run builds the same code.
%pip install git+https://github.com/NVlabs/nvdiffrast@729261dc64c4241ea36efda84fbf532cc8b425b8

Collecting git+https://github.com/NVlabs/nvdiffrast/
  Cloning https://github.com/NVlabs/nvdiffrast/ to /tmp/pip-req-build-9n7s_feu
  Running command git clone --filter=blob:none --quiet https://github.com/NVlabs/nvdiffrast/ /tmp/pip-req-build-9n7s_feu
  Resolved https://github.com/NVlabs/nvdiffrast/ to commit 729261dc64c4241ea36efda84fbf532cc8b425b8
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [5]:
# Sanity check: confirm the clone exists and list its top-level entries.
!ls /content/stable-dreamfusion

activation.py  evaluation   meshutils.py	 requirements.txt
assets	       freqencoder  nerf		 scripts
config	       gridencoder  optimizer.py	 shencoder
data	       guidance     preprocess_image.py  taichi_modules
docker	       ldm	    pretrained		 tets
dpt.py	       LICENSE	    raymarching
encoding.py    main.py	    readme.md


In [10]:
# List the repo's guidance/ directory; a later cell patches guidance/sd_utils.py.
!ls /content/stable-dreamfusion/guidance

clip_utils.py  perpneg_utils.py  sd_utils.py
if_utils.py    __pycache__	 zero123_utils.py


In [7]:
# Cell A: make the repo root the kernel's working directory so that relative
# paths used by later cells (e.g. guidance/sd_utils.py) resolve.
# The path is absolute, so re-running this cell is harmless.
%cd /content/stable-dreamfusion

/content/stable-dreamfusion


In [8]:
%%bash
# Cell B: patch guidance/sd_utils.py in place.
#   - every match of torch_dtype=torch.float16 becomes torch_dtype=torch.float32
#   - every match of revision="fp16" becomes revision="main" (replaced, not removed)
# The file path is relative, so run this from the repo root (see Cell A).
# NOTE(review): sed exits successfully even when nothing matches — confirm the
# patterns actually occur in the file (e.g. with grep) before relying on this.

# 1) Change torch_dtype
sed -i 's/torch_dtype=torch.float16/torch_dtype=torch.float32/g' guidance/sd_utils.py

# 2) Point any revision="fp16" at revision="main"
sed -i 's/revision="fp16"/revision="main"/g' guidance/sd_utils.py

In [None]:
%%bash
# Launch a text-to-3D run of stable-dreamfusion's main.py.
#
# This cell is executed by bash, so IPython magics such as %cd are not
# available here; use the shell builtin and stop if the repo is missing rather
# than running main.py from the wrong directory.
cd /content/stable-dreamfusion || exit 1

# Customize these variables however you like:
PROMPT="a shiny golden apple on a pedestal"
ITERS=3000
LR=1e-3
RES=64
SEED=0
LAMBDA_EN=1e-4
NUM_STEPS=64
UPS_STEPS=32
WORKSPACE="trial"
CKPT="latest"

# Every expansion is quoted so values containing spaces stay single arguments.
python main.py \
  -O2 \
  --text "${PROMPT}" \
  --workspace "${WORKSPACE}" \
  --iters "${ITERS}" \
  --lr "${LR}" \
  --w "${RES}" \
  --h "${RES}" \
  --seed "${SEED}" \
  --lambda_entropy "${LAMBDA_EN}" \
  --ckpt "${CKPT}" \
  --save_mesh \
  --num_steps "${NUM_STEPS}" \
  --upsample_steps "${UPS_STEPS}"


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# ---------------------------------------
# 1) Sinusoidal Position Embeddings
# ---------------------------------------
class SinusoidalPositionEmbeddings(nn.Module):
    """
    Fixed (non-learned) sinusoidal embedding of integer timesteps.

    The first dim // 2 features are sines and the next dim // 2 are cosines of
    the timestep scaled by geometrically spaced frequencies. When `dim` is odd,
    one trailing zero column is appended so the output width is always `dim`.
    """
    def __init__(self, dim):
        """
        dim: width of the embedding returned by forward()
        """
        super().__init__()
        self.dim = dim

    def forward(self, timesteps):
        """
        timesteps: tensor of shape (B,), dtype torch.long
        returns: embeddings of shape (B, dim)
        """
        device = timesteps.device
        half_dim = self.dim // 2
        freq = torch.exp(
            -math.log(10000) * torch.arange(0, half_dim, device=device) / half_dim
        )
        args = timesteps[:, None].float() * freq[None]  # (B, half_dim)
        embeddings = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)  # (B, 2 * half_dim)
        if self.dim % 2 == 1:
            # sin/cos pairs only fill 2 * (dim // 2) columns; pad the last one
            # so layers sized for `dim` inputs still receive `dim` features.
            pad = torch.zeros(timesteps.shape[0], 1, device=device)
            embeddings = torch.cat([embeddings, pad], dim=-1)
        return embeddings  # (B, dim)

# ---------------------------------------
# 2) Small U-Net Blocks
# ---------------------------------------
class ResidualBlock(nn.Module):
    """
    A single residual block with group-norm → SiLU → Conv → group-norm → SiLU → Conv.
    Adds a time embedding (via a linear layer) to the feature map mid-block.

    GroupNorm requires the channel count to be divisible by the group count, so
    each norm uses the largest group count (at most 8) that divides its
    channels. Channel counts that are multiples of 8 still get 8 groups.
    """
    def __init__(self, in_channels, out_channels, time_emb_dim):
        """
        in_channels: channels of the input feature map
        out_channels: channels of the output feature map
        time_emb_dim: width of the time embedding passed to forward()
        """
        super().__init__()
        self.time_mlp = nn.Linear(time_emb_dim, out_channels)
        self.block1 = nn.Sequential(
            nn.GroupNorm(num_groups=self._num_groups(in_channels), num_channels=in_channels),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        )
        self.block2 = nn.Sequential(
            nn.GroupNorm(num_groups=self._num_groups(out_channels), num_channels=out_channels),
            nn.SiLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        )
        # If channels differ, use a 1×1 conv for skip connection
        self.res_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1) \
                        if in_channels != out_channels else nn.Identity()

    @staticmethod
    def _num_groups(num_channels, max_groups=8):
        """Largest group count <= max_groups that evenly divides num_channels."""
        for groups in range(min(max_groups, num_channels), 0, -1):
            if num_channels % groups == 0:
                return groups
        return 1

    def forward(self, x, t):
        """
        x: (B, in_channels, H, W)
        t: (B, time_emb_dim)
        returns: (B, out_channels, H, W)
        """
        h = self.block1(x)  # (B, out_channels, H, W)
        # Broadcast the per-sample, per-channel time bias over H and W.
        time_emb = self.time_mlp(t).unsqueeze(-1).unsqueeze(-1)  # (B, out_channels, 1, 1)
        h = h + time_emb
        h = self.block2(h)
        return h + self.res_conv(x)

class UNet(nn.Module):
    """
    A minimal U-Net with two downsampling stages, a bottleneck, and two
    upsampling stages with skip connections.
    Predicts the noise given a noisy image x_t and timestep t; the output has
    the same shape as the input so it can be compared directly with the noise.

    A stem convolution lifts the image to `base_channels` before the first
    residual block, so no residual block ever sees the raw image channels.
    """
    def __init__(self, channels=3, base_channels=64, time_emb_dim=256):
        """
        channels: number of image channels (input and output)
        base_channels: feature width at full resolution
        time_emb_dim: width of the timestep embedding
        """
        super().__init__()
        # Time‐embedding MLP
        self.time_embed = nn.Sequential(
            SinusoidalPositionEmbeddings(time_emb_dim),
            nn.Linear(time_emb_dim, time_emb_dim),
            nn.SiLU(),
        )

        # Stem: image channels → base_channels
        self.in_conv = nn.Conv2d(channels, base_channels, kernel_size=3, padding=1)

        # Down‐blocks
        self.conv1 = ResidualBlock(in_channels=base_channels, out_channels=base_channels, time_emb_dim=time_emb_dim)
        self.conv2 = ResidualBlock(in_channels=base_channels, out_channels=base_channels * 2, time_emb_dim=time_emb_dim)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bottleneck
        self.bottleneck = ResidualBlock(in_channels=base_channels * 2, out_channels=base_channels * 2, time_emb_dim=time_emb_dim)

        # Up‐blocks. Each residual block's input is the upsampled features
        # concatenated with the skip connection from the matching resolution.
        self.up_conv = nn.ConvTranspose2d(base_channels * 2, base_channels, kernel_size=2, stride=2)   # H/4 → H/2
        self.conv3 = ResidualBlock(in_channels=base_channels * 3, out_channels=base_channels, time_emb_dim=time_emb_dim)
        self.up_conv2 = nn.ConvTranspose2d(base_channels, base_channels, kernel_size=2, stride=2)      # H/2 → H
        self.conv4 = ResidualBlock(in_channels=base_channels * 2, out_channels=base_channels, time_emb_dim=time_emb_dim)

        # Final 1×1 convolution back to channels
        self.final_conv = nn.Conv2d(base_channels, channels, kernel_size=1)

    def forward(self, x, t):
        """
        x: (B, channels, H, W)  noisy image; H and W must be multiples of 4
        t: (B,) integer timesteps
        returns: (B, channels, H, W) predicted noise
        raises: ValueError if H or W is not a multiple of 4
        """
        height, width = x.shape[-2], x.shape[-1]
        if height % 4 != 0 or width % 4 != 0:
            # Two 2× poolings followed by two 2× upsamplings only line up with
            # the skip connections when both sides are divisible by 4.
            raise ValueError(
                f"UNet expects H and W divisible by 4, got {height}x{width}"
            )

        t_emb = self.time_embed(t)  # (B, time_emb_dim)

        # Down
        x1 = self.conv1(self.in_conv(x), t_emb)   # (B, base_channels, H, W)
        x2 = self.conv2(self.pool(x1), t_emb)     # (B, base_channels*2, H/2, W/2)

        # Bottleneck
        x3 = self.bottleneck(self.pool(x2), t_emb)  # (B, base_channels*2, H/4, W/4)

        # Up: H/4 → H/2, skip from x2
        u2 = self.up_conv(x3)                     # (B, base_channels, H/2, W/2)
        u2 = torch.cat([u2, x2], dim=1)           # (B, base_channels*3, H/2, W/2)
        u2 = self.conv3(u2, t_emb)                # (B, base_channels, H/2, W/2)

        # Up: H/2 → H, skip from x1
        u1 = self.up_conv2(u2)                    # (B, base_channels, H, W)
        u1 = torch.cat([u1, x1], dim=1)           # (B, base_channels*2, H, W)
        u1 = self.conv4(u1, t_emb)                # (B, base_channels, H, W)

        return self.final_conv(u1)                # (B, channels, H, W)


In [None]:
# ---------------------------------------
# 3) Diffusion (DDPM) Class
# ---------------------------------------
class Diffusion(nn.Module):
    """
    DDPM wrapper around a noise-prediction model.

    Holds the linear β schedule, implements the forward noising q(x_t | x0),
    the training loss (MSE between predicted and true noise), and ancestral
    sampling that reverses the process one timestep at a time.
    """
    def __init__(self, model, img_size, timesteps=1000, beta_start=1e-4, beta_end=2e-2):
        """
        model: your U-Net denoiser, called as model(x_t, t)
        img_size: resolution (e.g., 64)
        timesteps: T in the DDPM
        beta_start, beta_end: linear noise schedule
        """
        super().__init__()
        self.model = model
        self.timesteps = timesteps
        self.img_size = img_size

        # Linear β schedule from beta_start → beta_end over T timesteps
        betas = torch.linspace(beta_start, beta_end, timesteps)
        alphas = 1.0 - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)

        # Registered as buffers so .to(device) / .cuda() move the schedule
        # together with the model; otherwise indexing it with timesteps that
        # live on another device fails. Non-persistent keeps state_dict keys
        # exactly as they were when these were plain attributes.
        self.register_buffer("betas", betas, persistent=False)
        self.register_buffer("alphas", alphas, persistent=False)
        self.register_buffer("alphas_cumprod", alphas_cumprod, persistent=False)

    def add_noise(self, x0, t):
        """
        x0: (B, 3, H, W) original images in [−1,1] or [0,1]
        t: (B,) integer timesteps between [0, T-1]
        returns: x_t and the actual noise added
        """
        # Gather ᾱ_t (cumulative product up to t) for each sample
        a_cumprod_t = self.alphas_cumprod[t].sqrt()                   # (B,)
        one_minus_a_cumprod_t = (1 - self.alphas_cumprod[t]).sqrt()   # (B,)

        noise = torch.randn_like(x0)
        # x_t = sqrt(ᾱ_t) * x0 + sqrt(1 - ᾱ_t) * ε, broadcast over C, H, W
        x_t = (
            a_cumprod_t[:, None, None, None] * x0
            + one_minus_a_cumprod_t[:, None, None, None] * noise
        )
        return x_t, noise

    def forward(self, x0):
        """
        Single training step:
        - Sample random t
        - Add noise x_t ← q(x_t | x0)
        - Predict the noise with the U-Net
        - Compute MSE between predicted noise and true noise
        returns: scalar loss tensor
        """
        B = x0.shape[0]
        device = x0.device
        t = torch.randint(0, self.timesteps, (B,), device=device).long()  # (B,)
        x_t, noise = self.add_noise(x0, t)
        pred_noise = self.model(x_t, t)
        loss = F.mse_loss(pred_noise, noise)
        return loss

    @torch.no_grad()
    def sample(self, n_samples, device):
        """
        Generates images by reversing the diffusion process:
        - Start from pure Gaussian noise
        - For t from T−1…0, predict ε_θ(x_t, t), then compute x_{t−1}
        returns: (n_samples, 3, img_size, img_size) tensor
        """
        x = torch.randn(n_samples, 3, self.img_size, self.img_size, device=device)
        for t in reversed(range(self.timesteps)):
            t_batch = torch.full((n_samples,), t, device=device, dtype=torch.long)
            predicted_noise = self.model(x, t_batch)

            beta_t = self.betas[t]
            alpha_t = self.alphas[t]
            alpha_cumprod_t = self.alphas_cumprod[t]

            # DDPM reverse-step mean:
            #   μ = (x_t − β_t / sqrt(1 − ᾱ_t) · ε_θ) / sqrt(α_t)
            mean = (
                x - beta_t / (1.0 - alpha_cumprod_t).sqrt() * predicted_noise
            ) / alpha_t.sqrt()

            if t > 0:
                # σ_t² = β_t; fresh noise is added on every step except the last
                x = mean + beta_t.sqrt() * torch.randn_like(x)
            else:
                x = mean
        return x

In [None]:
if __name__ == "__main__":
    # Smoke test: a few optimisation steps on random tensors, then sampling.
    img_size = 64
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fix the RNG so weight init, the dummy batches, the sampled timesteps and
    # the generated samples are the same on every Restart & Run All.
    torch.manual_seed(0)

    # Instantiate U-Net and Diffusion
    unet = UNet(channels=3, base_channels=64, time_emb_dim=256).to(device)
    diffusion = Diffusion(unet, img_size=img_size, timesteps=1000).to(device)
    optimizer = torch.optim.Adam(diffusion.parameters(), lr=1e-4)

    # Dummy loop: replace `x0 = ...` with real batch from your DataLoader
    for epoch in range(10):
        x0 = torch.randn(8, 3, img_size, img_size, device=device)  # replace with real data
        loss = diffusion(x0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"[Epoch {epoch+1}] Training loss: {loss.item():.4f}")

    # After training, sample 4 new images (runs one model call per timestep):
    samples = diffusion.sample(n_samples=4, device=device)
    print("Generated samples tensor shape:", samples.shape)