In [15]:
# tab_diffusion_demo.py
"""
Mini TabDiffusion demo.
- Generates toy tabular data
- Trains a simple denoising model
- Samples synthetic rows

Dependencies:
    pip install torch numpy pandas scikit-learn
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# -----------------------------
# 1. Create toy tabular dataset
# -----------------------------
# Fix the NumPy RNG so the toy table is identical on every run.
np.random.seed(42)
n_samples = 500

# Columns: age, income, binary gender.
# The three draws consume the global RNG in exactly this order.
age = np.random.normal(loc=40, scale=12, size=n_samples)
income = np.random.normal(loc=50000, scale=15000, size=n_samples)
gender = np.random.binomial(n=1, p=0.5, size=n_samples)  # 0=female, 1=male

# Assemble an (n_samples, 3) matrix and min-max scale every column to [0, 1].
# `scaler` is kept so samples can later be mapped back to the original units.
raw_table = np.column_stack((age, income, gender))
scaler = MinMaxScaler()
scaled_table = scaler.fit_transform(raw_table)

# float32 tensor consumed by the training and sampling code.
data = torch.as_tensor(scaled_table, dtype=torch.float32)


In [16]:
# -----------------------------
# 2. Define forward diffusion
# -----------------------------
def q_sample(x0, t, noise, num_steps=None, beta_start=1e-4, beta_end=0.02):
    """
    Diffuse the data (add Gaussian noise) at step t.

    Computes ``sqrt(alpha_bar[t]) * x0 + sqrt(1 - alpha_bar[t]) * noise``,
    where ``alpha_bar`` is the cumulative product of ``1 - beta`` over a
    linear beta schedule.

    Args:
        x0: Tensor of clean samples, batch dimension first.
        t: Integer tensor of step indices, one per sample in ``x0``; each
            value must be a valid index into the schedule
            (``0 .. num_steps - 1``).
        noise: Gaussian noise with the same shape as ``x0``.
        num_steps: Length of the beta schedule. ``None`` (default) falls back
            to the notebook-level constant ``T``, which must then be defined
            before the call.
        beta_start: First value of the linear beta schedule.
        beta_end: Last value of the linear beta schedule.

    Returns:
        The noised samples ``x_t``, same shape as ``x0``.
    """
    if num_steps is None:
        num_steps = T  # notebook-level default, looked up at call time

    # Linear beta schedule, built on x0's device so GPU batches do not end up
    # mixing CPU and CUDA tensors.
    betas = torch.linspace(beta_start, beta_end, steps=num_steps, device=x0.device)
    alphas_bar = torch.cumprod(1.0 - betas, dim=0)

    # One coefficient per sample, reshaped to (batch, 1, ..., 1) so it
    # broadcasts over every non-batch dimension of x0. For the 2-D tabular
    # case this is exactly the original `[:, None]`.
    coef_shape = (-1,) + (1,) * (x0.dim() - 1)
    ab_t = alphas_bar[t]
    sqrt_ab = torch.sqrt(ab_t).reshape(coef_shape)
    sqrt_one_minus_ab = torch.sqrt(1 - ab_t).reshape(coef_shape)

    return sqrt_ab * x0 + sqrt_one_minus_ab * noise


In [17]:
# -----------------------------
# 3. Simple denoising model
# -----------------------------
class DenoiseMLP(nn.Module):
    """
    Small MLP that predicts the noise mixed into a batch of rows.

    The network input is the noisy row concatenated with one extra feature:
    the diffusion step scaled by the schedule length, ``t / n_steps``.

    Args:
        dim: Number of columns per row (also the output width).
        hidden: Width of the two hidden layers.
        n_steps: Schedule length used to scale ``t``. ``None`` (default)
            falls back to the notebook-level constant ``T``, read on every
            forward pass. Passing it explicitly ties the scaling to the model
            rather than to whatever ``T`` currently holds in the kernel.
    """

    def __init__(self, dim, hidden=64, n_steps=None):
        super().__init__()
        self.n_steps = n_steps
        self.net = nn.Sequential(
            nn.Linear(dim + 1, hidden),  # +1 input for the scaled time feature
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, dim)
        )

    def forward(self, x, t):
        """
        Predict the noise for rows ``x`` (batch, dim) at steps ``t`` (batch,).
        """
        n_steps = T if self.n_steps is None else self.n_steps
        # append time embedding (scaled)
        t = t.float().unsqueeze(1) / n_steps
        xt = torch.cat([x, t], dim=1)
        return self.net(xt)


In [18]:
# -----------------------------
# 4. Training loop
# -----------------------------
T = 100  # number of diffusion steps
N_TRAIN_STEPS = 200  # optimizer updates, one random minibatch each
BATCH_SIZE = 64
LOG_EVERY = 50

# Seed torch so weight init, minibatch draws and noise are repeatable under
# Restart & Run All (NumPy is already seeded for the toy data).
torch.manual_seed(42)

model = DenoiseMLP(dim=data.shape[1])
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# NOTE: each "epoch" below is a single gradient step on one random minibatch,
# not a full pass over the data.
for epoch in range(N_TRAIN_STEPS):
    # Draw a minibatch (with replacement), a random step per row, and noise.
    idx = torch.randint(0, data.shape[0], (BATCH_SIZE,))
    x0 = data[idx]
    t = torch.randint(0, T, (BATCH_SIZE,))
    noise = torch.randn_like(x0)
    xt = q_sample(x0, t, noise)

    # The model is trained to recover the noise that was mixed into x0.
    pred_noise = model(xt, t)
    loss = loss_fn(pred_noise, noise)

    opt.zero_grad()
    loss.backward()
    opt.step()

    if epoch % LOG_EVERY == 0:
        print(f"Epoch {epoch}, Loss {loss.item():.4f}")

Epoch 0, Loss 1.0804
Epoch 50, Loss 0.4121
Epoch 100, Loss 0.3700
Epoch 150, Loss 0.2913


In [19]:
# -----------------------------
# 5. Sampling new synthetic data
# -----------------------------
@torch.no_grad()
def p_sample_loop(n_samples=10):
    """
    Draw ``n_samples`` rows by running the reverse diffusion chain.

    Starts from standard Gaussian noise and walks t = T-1, ..., 0. Each step
    removes the noise predicted by the global ``model``; every step except
    t == 0 then adds fresh Gaussian noise with std ``sqrt(beta_t)``.

    Relies on the notebook globals ``model``, ``data`` (for the column count)
    and ``T``. Returns a tensor of shape (n_samples, data.shape[1]).
    """
    n_features = data.shape[1]
    sample = torch.randn(n_samples, n_features)

    # Linear beta schedule over T steps.
    betas = torch.linspace(1e-4, 0.02, steps=T)
    alphas = 1.0 - betas
    alphas_bar = torch.cumprod(alphas, dim=0)

    # Per-step coefficients of the reverse update, computed once up front.
    inv_sqrt_alpha = 1 / torch.sqrt(alphas)
    eps_scale = (1 - alphas) / torch.sqrt(1 - alphas_bar)
    sigma = torch.sqrt(betas)

    for step in range(T - 1, -1, -1):
        step_ids = torch.full((n_samples,), step)
        eps_hat = model(sample, step_ids)

        sample = inv_sqrt_alpha[step] * (sample - eps_scale[step] * eps_hat)
        if step > 0:
            # No noise is injected on the final (t == 0) step.
            sample = sample + sigma[step] * torch.randn_like(sample)
    return sample

# Draw five rows in the model's min-max-scaled space, then map them back to
# the original column units with the fitted scaler.
# NOTE(review): nothing here clips the samples or rounds the gender column,
# so values may fall outside the training range — confirm downstream handling.
synthetic_scaled = p_sample_loop(5).numpy()
synthetic = scaler.inverse_transform(synthetic_scaled)
