# Experiment 19: TabDDPM Direct Comparison

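**Purpose**: Run TabDDPM ourselves on the same data and the same hardware as MISATA, so that fit time, generation time, and downstream quality are directly comparable.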

**Requires**: GPU runtime (Colab/Kaggle with GPU)

In [None]:
# Install tab-ddpm
!pip install -q tab-ddpm numpy pandas scikit-learn torch

In [None]:
import numpy as np
import pandas as pd
import time
import torch
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

SEED = 42
# Seed both RNG sources used by this notebook. NumPy alone is not enough:
# the DDPM's weight initialisation, mini-batch indices and diffusion noise
# are all drawn from PyTorch's generator.
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Load Adult Census
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']
# skipinitialspace=True strips the blank that follows each comma, so missing
# entries reach the NA check as '?' rather than ' ?'. The NA marker must match
# the stripped token, otherwise dropna() below removes nothing and '?' gets
# label-encoded as if it were a real category.
df_raw = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
# Drop incomplete rows, then take a fixed 5000-row subsample.
df_raw = df_raw.dropna().reset_index(drop=True).sample(5000, random_state=SEED)
# Binary target: 1 for '>50K', 0 otherwise.
df_raw['income'] = (df_raw['income'] == '>50K').astype(int)

# Replace each categorical column with integer codes (one encoder per column).
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']:
    df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

train_df, test_df = train_test_split(df_raw, test_size=0.2, random_state=SEED)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

## TabDDPM Implementation

In [None]:
# Try to import TabDDPM (optional dependency); record the outcome in a flag.
import torch.nn as nn
import torch.nn.functional as F

TABDDPM_AVAILABLE = False

try:
    from tab_ddpm import GaussianMultinomialDiffusion
    from tab_ddpm.modules import MLPDiffusion
    TABDDPM_AVAILABLE = True
    print("✓ TabDDPM available")
except ImportError:
    print("TabDDPM not available, using simplified DDPM")


# SimpleDDPM (and the `F` import above) are defined unconditionally: the
# training and sampling cells below always use them, so gating the definition
# on the import result would raise NameError whenever tab_ddpm is installed.
class SimpleDDPM(nn.Module):
    """Simplified TabDDPM implementation.

    An MLP predicts the noise added to a sample at a given diffusion step,
    using a linear beta schedule from 1e-4 to 0.02.

    Args:
        input_dim: Number of features per row.
        hidden_dim: Width of the three hidden layers.
        n_steps: Number of diffusion steps.
    """

    def __init__(self, input_dim, hidden_dim=256, n_steps=1000):
        super().__init__()
        self.input_dim = input_dim
        self.n_steps = n_steps

        # Simple MLP denoiser
        self.net = nn.Sequential(
            nn.Linear(input_dim + 1, hidden_dim),  # +1 for time
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

        # Beta schedule. Registered as buffers so that .to(device)/.cuda()
        # move the schedule together with the weights; as plain attributes
        # they stayed on the CPU and could not be indexed with CUDA timestep
        # tensors. persistent=False keeps state_dict() limited to the weights.
        betas = torch.linspace(1e-4, 0.02, n_steps)
        alphas = 1 - betas
        self.register_buffer('betas', betas, persistent=False)
        self.register_buffer('alphas', alphas, persistent=False)
        self.register_buffer('alpha_bars', torch.cumprod(alphas, dim=0), persistent=False)

    def forward(self, x, t):
        """Predict the noise in `x` at timestep(s) `t`.

        Args:
            x: Noisy samples whose last dimension is `input_dim`.
            t: Integer timesteps, one per row of `x`.

        Returns:
            Tensor with the same shape as `x`.
        """
        # Timestep is scaled by n_steps and appended as one extra feature.
        t_embed = t.float().unsqueeze(-1) / self.n_steps
        x_t = torch.cat([x, t_embed], dim=-1)
        return self.net(x_t)

    @torch.no_grad()
    def sample(self, n_samples, device='cpu'):
        """Draw `n_samples` rows by running the reverse diffusion chain.

        Args:
            n_samples: Number of rows to generate.
            device: Device on which the samples are created; it must be the
                device the module itself lives on.

        Returns:
            Tensor of shape (n_samples, input_dim), in the same (scaled)
            space the model was trained on.
        """
        x = torch.randn(n_samples, self.input_dim, device=device)

        for t in reversed(range(self.n_steps)):
            t_tensor = torch.full((n_samples,), t, device=device)

            alpha_bar = self.alpha_bars[t]
            alpha = self.alphas[t]
            beta = self.betas[t]

            eps_pred = self(x, t_tensor)

            # Remove the predicted noise to get the mean of the previous step.
            x = (1 / alpha.sqrt()) * (x - (beta / (1 - alpha_bar).sqrt()) * eps_pred)

            # Fresh noise is added on every step except the final one (t == 0).
            if t > 0:
                x = x + beta.sqrt() * torch.randn_like(x)

        return x


print("Using SimpleDDPM implementation")

In [None]:
# Prepare data for DDPM: pick the device, standardise every column of the
# training frame, and move the result to that device as a float32 tensor.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_df.values)
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

print(f"Training on: {device}")

In [None]:
# Train DDPM
print("\nTraining TabDDPM...")

model = SimpleDDPM(input_dim=X_train_scaled.shape[1], hidden_dim=256, n_steps=500).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# NOTE: each iteration below draws a single random mini-batch, so n_epochs
# counts optimizer steps, not full passes over the training data.
n_epochs = 100
batch_size = 256

# The timesteps `t` live on `device`; the schedule must be on the same device
# before it is indexed, otherwise indexing raises on GPU.
alpha_bars_dev = model.alpha_bars.to(device)

start_fit = time.time()

model.train()
for epoch in range(n_epochs):
    # Random batch
    idx = torch.randint(0, len(X_tensor), (batch_size,), device=X_tensor.device)
    x_batch = X_tensor[idx]

    # Random timestep
    t = torch.randint(0, model.n_steps, (batch_size,), device=device)

    # Add noise: x_noisy = sqrt(alpha_bar) * x + sqrt(1 - alpha_bar) * noise
    alpha_bar = alpha_bars_dev[t].unsqueeze(-1)
    noise = torch.randn_like(x_batch)
    x_noisy = alpha_bar.sqrt() * x_batch + (1 - alpha_bar).sqrt() * noise

    # Predict noise
    eps_pred = model(x_noisy, t)
    loss = F.mse_loss(eps_pred, noise)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 20 == 0:
        print(f"  Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}")

# CUDA work is queued asynchronously; wait for it before stopping the clock so
# the reported time does not depend on an incidental loss.item() call.
if X_tensor.is_cuda:
    torch.cuda.synchronize()
ddpm_fit_time = time.time() - start_fit
print(f"\nDDPM Fit Time: {ddpm_fit_time:.2f}s")

In [None]:
# Generate samples
print("\nGenerating samples...")
model.eval()

start_gen = time.time()
with torch.no_grad():
    samples = model.sample(len(train_df), device=device)
# CUDA kernels are launched asynchronously: without this wait the timer would
# stop while sampling is still running and under-report generation time.
if samples.is_cuda:
    torch.cuda.synchronize()
ddpm_gen_time = time.time() - start_gen

# Inverse transform back to the original column scales
samples_np = samples.cpu().numpy()
samples_np = scaler.inverse_transform(samples_np)

df_ddpm = pd.DataFrame(samples_np, columns=train_df.columns)

# Fix categorical columns (round to integers) and clamp to the observed range.
# NOTE(review): "categorical" is decided by `nunique() < 20`, so any encoded
# column with 20 or more distinct training values stays continuous here.
for col in train_df.columns:
    if train_df[col].nunique() < 20:
        df_ddpm[col] = np.round(df_ddpm[col]).astype(int)
        df_ddpm[col] = np.clip(df_ddpm[col], train_df[col].min(), train_df[col].max())

print(f"DDPM Gen Time: {ddpm_gen_time:.2f}s")

In [None]:
# MISATA comparison: announce the MISATA benchmark; the synthesizer is
# defined, fitted and timed in the statements that follow.
print("\nBenchmarking MISATA...")

class MISATASynthesizer:
    """Gaussian-copula-style synthesizer with a model-generated target column.

    Feature columns are sampled from correlated normals mapped back through
    each column's empirical distribution; the target column is produced by a
    gradient-boosting classifier applied to the synthetic features.
    """

    def __init__(self, target_col='income', random_state=42):
        """Store the target column name and the seed used by fit and sample."""
        self.target_col = target_col
        self.random_state = random_state

    def fit(self, df):
        """Learn marginals, the dependence structure and the target model.

        Args:
            df: Training frame containing `target_col`.

        Returns:
            self, so calls can be chained.
        """
        n_rows = len(df)
        self.columns = list(df.columns)

        # Keep a private copy of each column's observed values.
        self.marginals = {}
        for name in self.columns:
            self.marginals[name] = {'values': df[name].values.copy()}

        # Ranks -> pseudo-observations in (0, 1) -> standard-normal scores.
        normal_scores = {}
        for name in self.columns:
            pseudo_obs = stats.rankdata(df[name]) / (n_rows + 1)
            normal_scores[name] = stats.norm.ppf(np.clip(pseudo_obs, 0.001, 0.999))
        normal_df = pd.DataFrame(normal_scores, index=df.index)

        # Correlation of the scores; undefined entries become 0, diagonal is 1.
        corr = np.nan_to_num(normal_df.corr().values, nan=0.0)
        np.fill_diagonal(corr, 1.0)

        # Floor the eigenvalues so the Cholesky factorisation cannot fail.
        spectrum, basis = np.linalg.eigh(corr)
        spectrum = np.maximum(spectrum, 1e-6)
        corr = basis @ np.diag(spectrum) @ basis.T
        self.cholesky = np.linalg.cholesky(corr)

        # Classifier that maps features to the target column.
        self.feature_cols = [name for name in self.columns if name != self.target_col]
        self.target_model = GradientBoostingClassifier(
            n_estimators=50, max_depth=4, random_state=self.random_state
        )
        self.target_model.fit(df[self.feature_cols], df[self.target_col])
        self.target_rate = df[self.target_col].mean()
        return self

    def sample(self, n_samples):
        """Generate `n_samples` synthetic rows with columns in training order.

        The generator is re-seeded with `random_state` on every call, so
        repeated calls with the same `n_samples` return identical frames.
        """
        rng = np.random.default_rng(self.random_state)
        z = rng.standard_normal((n_samples, len(self.columns)))
        uniform = np.clip(stats.norm.cdf(z @ self.cholesky.T), 0.001, 0.999)

        # Map uniforms through each feature's sorted observed values.
        synthetic_data = {}
        for position, name in enumerate(self.columns):
            if name != self.target_col:
                ordered = np.sort(self.marginals[name]['values'])
                grid = np.linspace(0, 1, len(ordered))
                synthetic_data[name] = np.interp(uniform[:, position], grid, ordered)

        # Label rows so the positive share follows the training target rate.
        X_synth = pd.DataFrame({name: synthetic_data[name] for name in self.feature_cols})
        probs = self.target_model.predict_proba(X_synth)[:, 1]
        cutoff = np.percentile(probs, (1 - self.target_rate) * 100)
        synthetic_data[self.target_col] = (probs >= cutoff).astype(int)

        return pd.DataFrame(synthetic_data)[self.columns]

# Time the fit; fit() returns the synthesizer itself, so construction and
# fitting are chained inside the timed region.
start_fit = time.time()
misata = MISATASynthesizer().fit(train_df)
misata_fit_time = time.time() - start_fit

# Time the generation of as many rows as the training split has.
n_synthetic_rows = len(train_df)
start_gen = time.time()
df_misata = misata.sample(n_synthetic_rows)
misata_gen_time = time.time() - start_gen

print(f"MISATA Fit: {misata_fit_time:.2f}s, Gen: {misata_gen_time:.3f}s")

In [None]:
# Evaluate both
def evaluate(synth_df, name):
    """Score one synthetic dataset against the real train/test split.

    Args:
        synth_df: Synthetic frame with the same columns as `train_df`.
        name: Label stored in the result.

    Returns:
        Dict with keys 'name', 'tstr' (ROC AUC on the real test set of a
        random forest trained on `synth_df`) and 'fidelity' (mean over all
        columns of 1 minus the two-sample KS statistic against `train_df`).
    """
    target = 'income'

    # TSTR: train on synthetic, test on real
    clf = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
    clf.fit(synth_df.drop(target, axis=1), synth_df[target])
    test_scores = clf.predict_proba(test_df.drop(target, axis=1))[:, 1]
    tstr = roc_auc_score(test_df[target], test_scores)

    # Marginal fidelity
    ks_scores = []
    for col in train_df.columns:
        ks_distance = stats.ks_2samp(train_df[col], synth_df[col])[0]
        ks_scores.append(1 - ks_distance)
    fidelity = np.mean(ks_scores)

    return {'name': name, 'tstr': tstr, 'fidelity': fidelity}

# TRTR baseline: the same random forest trained on the real training split
# and scored on the real test split; it is the reference for the TSTR ratios.
model_real = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1).fit(
    train_df.drop('income', axis=1), train_df['income']
)
real_test_scores = model_real.predict_proba(test_df.drop('income', axis=1))[:, 1]
trtr = roc_auc_score(test_df['income'], real_test_scores)

# Score both synthetic datasets with the shared evaluation routine.
ddpm_eval = evaluate(df_ddpm, 'TabDDPM')
misata_eval = evaluate(df_misata, 'MISATA')

# Print the side-by-side comparison table.
# Columns: fit time (s), generation time (s), their sum, TSTR ROC AUC, and
# TSTR as a percentage of the TRTR baseline.
print("\n" + "="*70)
print("DIRECT COMPARISON RESULTS")
print("="*70)
print(f"\nTRTR Baseline: {trtr:.4f}\n")
print(f"{'Method':<15} {'Fit Time':<12} {'Gen Time':<12} {'Total':<12} {'TSTR':<10} {'Ratio'}")
print("-"*70)
print(f"{'MISATA':<15} {misata_fit_time:<12.2f} {misata_gen_time:<12.3f} {misata_fit_time+misata_gen_time:<12.2f} {misata_eval['tstr']:<10.4f} {misata_eval['tstr']/trtr:.2%}")
# NOTE(review): the row labelled 'TabDDPM' reports the SimpleDDPM model
# trained in this notebook, not the reference tab_ddpm implementation.
print(f"{'TabDDPM':<15} {ddpm_fit_time:<12.2f} {ddpm_gen_time:<12.2f} {ddpm_fit_time+ddpm_gen_time:<12.2f} {ddpm_eval['tstr']:<10.4f} {ddpm_eval['tstr']/trtr:.2%}")
print("-"*70)
# Speedup = DDPM total time divided by MISATA total time.
print(f"\nSpeedup: {(ddpm_fit_time+ddpm_gen_time)/(misata_fit_time+misata_gen_time):.0f}x")

In [None]:
# Save results: one row per method, with timings, TSTR (absolute and relative
# to the TRTR baseline) and marginal fidelity, written to CSV.
results = pd.DataFrame([
    {
        'method': method,
        'fit_time': fit_seconds,
        'gen_time': gen_seconds,
        'total_time': fit_seconds + gen_seconds,
        'tstr': scores['tstr'],
        'tstr_ratio': scores['tstr']/trtr,
        'fidelity': scores['fidelity'],
    }
    for method, fit_seconds, gen_seconds, scores in (
        ('MISATA', misata_fit_time, misata_gen_time, misata_eval),
        ('TabDDPM', ddpm_fit_time, ddpm_gen_time, ddpm_eval),
    )
])

results.to_csv('tabddpm_direct_comparison.csv', index=False)
print("\n✓ Saved tabddpm_direct_comparison.csv")