# Experiment 1B: Fair Performance Benchmark

## Critical Fix Applied
**Issue**: Previous comparison was unfair - comparing CTGAN training+generation vs MISATA generation-only.

**Fix**: Separate and report:
1. **Fitting/Training Time** - Time to learn from data
2. **Generation Time** - Time to generate N samples from fitted model
3. **Total Time** - End-to-end time to go from data to synthetic output
4. **Generation Throughput** - Rows/second for generation only

This is the honest benchmark: every method is timed the same way, with fitting and generation measured and reported separately.

In [None]:
!pip install -q sdv faker numpy pandas scikit-learn matplotlib

In [None]:
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Reproducibility: a single seed shared by the notebook. It seeds NumPy's
# global RNG here and is reused below for the dataset sample, the MISATA
# synthesizer and Faker.
SEED = 42
np.random.seed(SEED)

print("Setup complete.")

## Load Dataset

In [None]:
# Load Adult Census; column names are supplied explicitly through `names=`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']

# skipinitialspace=True strips the blank that follows each comma, so a missing
# value reaches the NA check as a bare '?'. The marker must therefore be '?'
# (not ' ?'); otherwise nothing is recognised as missing and dropna() below
# silently keeps every incomplete row.
df_raw = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
df_raw = df_raw.dropna().reset_index(drop=True)

# Use 10K rows for fair comparison: every method is fitted on this same
# seeded sample.
df_sample = df_raw.sample(n=10000, random_state=SEED).reset_index(drop=True)

print(f"Dataset: {len(df_sample):,} rows, {len(df_sample.columns)} columns")

## Benchmark Functions

In [None]:
def benchmark_with_timing(name, fit_fn, generate_fn, df, n_generate=10000, n_runs=3):
    """
    Benchmark a synthetic data generator with SEPARATE fit and generation timing.

    Each run fits a fresh model with ``fit_fn(df)`` and then generates
    ``n_generate`` rows with ``generate_fn(model, n_generate)``; the two phases
    are timed independently so that training cost and sampling cost are never
    mixed.

    Args:
        name: Label stored in the result under ``'name'``.
        fit_fn: Callable ``fit_fn(df) -> model``.
        generate_fn: Callable ``generate_fn(model, n) -> synthetic data``.
            Its return value is discarded; only the elapsed time is kept.
        df: Training data handed unchanged to ``fit_fn``.
        n_generate: Number of rows requested from ``generate_fn`` per run.
        n_runs: Number of fit+generate repetitions (must be >= 1).

    Returns:
        dict with ``name``, ``fit_time_mean``, ``fit_time_std``,
        ``gen_time_mean``, ``gen_time_std`` (population std, ``np.std``),
        ``total_time`` (mean fit + mean generation), ``gen_throughput``
        (rows per second of generation), ``n_runs`` and ``n_generate``.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1, because means over zero
            runs would be NaN.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    fit_times = []
    gen_times = []

    for _ in range(n_runs):
        # perf_counter() is monotonic and high resolution; time.time() can
        # jump with system clock adjustments and is too coarse for the
        # sub-millisecond phases of the fast generators.
        start = time.perf_counter()
        model = fit_fn(df)
        fit_times.append(time.perf_counter() - start)

        start = time.perf_counter()
        generate_fn(model, n_generate)
        gen_times.append(time.perf_counter() - start)

    avg_fit = np.mean(fit_times)
    avg_gen = np.mean(gen_times)
    std_fit = np.std(fit_times)
    std_gen = np.std(gen_times)

    # A generation phase faster than the timer resolution would otherwise
    # divide by zero; report it explicitly as unbounded throughput.
    gen_throughput = n_generate / avg_gen if avg_gen > 0 else float('inf')

    return {
        'name': name,
        'fit_time_mean': avg_fit,
        'fit_time_std': std_fit,
        'gen_time_mean': avg_gen,
        'gen_time_std': std_gen,
        'total_time': avg_fit + avg_gen,
        'gen_throughput': gen_throughput,
        'n_runs': n_runs,
        'n_generate': n_generate
    }

print("Benchmark function defined.")

## Method 1: CTGAN

In [None]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Prepare metadata: detected once from the benchmark sample. The same object
# is passed to the CTGAN and GaussianCopula synthesizers, so detection is not
# part of either method's timed fit.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_sample)

def fit_ctgan(df):
    """Train a new CTGAN synthesizer on ``df`` and return it.

    Training is capped at 10 epochs to keep the benchmark quick, so the
    measured fit time reflects that reduced budget. The module-level
    ``metadata`` describes the table.
    """
    ctgan = CTGANSynthesizer(metadata, epochs=10, verbose=False)
    ctgan.fit(df)
    return ctgan

def generate_ctgan(model, n):
    """Return ``n`` synthetic rows sampled from the fitted CTGAN ``model``."""
    synthetic_rows = model.sample(n)
    return synthetic_rows

# Time CTGAN over two fit+generate runs (the other methods use more runs).
print("Benchmarking CTGAN (this may take a few minutes)...")
ctgan_results = benchmark_with_timing('CTGAN', fit_ctgan, generate_ctgan, df_sample, n_runs=2)
print(f"CTGAN: Fit={ctgan_results['fit_time_mean']:.1f}s, Gen={ctgan_results['gen_time_mean']:.1f}s")

## Method 2: GaussianCopula

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

def fit_copula(df):
    """Fit a new GaussianCopula synthesizer on ``df`` and return it.

    The module-level ``metadata`` describes the table; all other synthesizer
    settings are left at their defaults.
    """
    copula = GaussianCopulaSynthesizer(metadata)
    copula.fit(df)
    return copula

def generate_copula(model, n):
    """Return ``n`` synthetic rows sampled from the fitted copula ``model``."""
    sampled = model.sample(n)
    return sampled

# Time GaussianCopula over three fit+generate runs on the shared sample.
print("Benchmarking GaussianCopula...")
copula_results = benchmark_with_timing('GaussianCopula', fit_copula, generate_copula, df_sample, n_runs=3)
print(f"Copula: Fit={copula_results['fit_time_mean']:.1f}s, Gen={copula_results['gen_time_mean']:.1f}s")

## Method 3: MISATA-IPF

In [None]:
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier

class MISATAIPFSynthesizer:
    """Copula-style sampler with a gradient-boosted model for the target column.

    ``fit`` keeps every column's observed values, estimates the correlation of
    the rank-based normal scores of all columns and, when ``target_col`` is one
    of the columns, trains a ``GradientBoostingClassifier`` on the remaining
    columns. ``sample`` draws correlated normal noise, maps it onto the stored
    values by linear interpolation and labels the rows with the classifier.

    Input columns are expected to be numeric (the benchmark label-encodes
    text columns before calling ``fit``).
    """

    def __init__(self, target_col='income', random_state=42):
        # random_state seeds both the classifier and the sampling RNG.
        self.random_state = random_state
        self.target_col = target_col

    def fit(self, df):
        """Learn marginals, dependence structure and target model from ``df``.

        Returns ``self`` so calls can be chained.
        """
        self.columns = list(df.columns)
        self.train_data = df.copy()

        # Marginals are simply the raw observed values of each column.
        self.marginals = {name: {'all_values': df[name].values} for name in self.columns}

        self.cholesky = self._dependence_factor(df)

        if self.target_col in self.columns:
            self._fit_target_model(df)
        return self

    def _dependence_factor(self, df):
        """Return the Cholesky factor of the normal-score correlation matrix."""
        n_rows = len(df)

        # Ranks scaled into (0, 1), then pushed through the normal quantile
        # function; clipping keeps the quantiles finite.
        pseudo_obs = df.copy()
        for name in self.columns:
            pseudo_obs[name] = stats.rankdata(df[name]) / (n_rows + 1)
        normal_scores = pseudo_obs.apply(
            lambda series: stats.norm.ppf(np.clip(series, 0.001, 0.999))
        )

        # Undefined correlations (e.g. constant columns) are treated as zero.
        corr = np.nan_to_num(normal_scores.corr().values, nan=0.0)
        np.fill_diagonal(corr, 1.0)

        # Floor the eigenvalues so the matrix is positive definite before
        # factorising it.
        eigenvalues, eigenvectors = np.linalg.eigh(corr)
        floored = np.maximum(eigenvalues, 1e-6)
        repaired = eigenvectors @ np.diag(floored) @ eigenvectors.T
        return np.linalg.cholesky(repaired)

    def _fit_target_model(self, df):
        """Train the classifier that predicts ``target_col`` from the other columns."""
        predictors = [name for name in self.columns if name != self.target_col]

        # Encode categoricals
        features = df[predictors].copy()
        for name in features.columns:
            if features[name].dtype == 'object':
                features[name] = LabelEncoder().fit_transform(features[name].astype(str))

        labels = df[self.target_col]
        if labels.dtype == 'object':
            labels = LabelEncoder().fit_transform(labels.astype(str))

        self.causal_model = GradientBoostingClassifier(
            n_estimators=50, max_depth=4, random_state=self.random_state
        )
        self.causal_model.fit(features, labels)
        self.feature_cols = predictors
        # Mean of the labels; used by sample() to choose the decision cutoff.
        self.target_rate = labels.mean()

    def sample(self, n_samples):
        """Generate ``n_samples`` synthetic rows as a DataFrame in the fitted column order.

        The RNG is re-created from ``random_state`` on every call, so repeated
        calls with the same ``n_samples`` return identical data.
        """
        generator = np.random.default_rng(self.random_state)
        noise = generator.standard_normal((n_samples, len(self.columns)))
        quantiles = np.clip(stats.norm.cdf(noise @ self.cholesky.T), 0.001, 0.999)

        generated = {}
        for position, name in enumerate(self.columns):
            if name != self.target_col:
                # Empirical inverse CDF: interpolate between the sorted
                # observed values on an evenly spaced [0, 1] grid.
                observed = np.sort(self.marginals[name]['all_values'])
                grid = np.linspace(0, 1, len(observed))
                generated[name] = np.interp(quantiles[:, position], grid, observed)

        # Generate target
        if self.target_col in self.columns:
            generated[self.target_col] = self._predict_target(generated)

        return pd.DataFrame(generated)[self.columns]

    def _predict_target(self, generated):
        """Return 0/1 target labels for the generated feature columns."""
        model_input = pd.DataFrame({name: generated[name] for name in self.feature_cols})

        # NOTE(review): every feature column has an entry in self.marginals,
        # so this rounds ALL features to integers for the classifier, while
        # the returned feature values stay un-rounded.
        for name in model_input.columns:
            if name in self.marginals or model_input[name].dtype == 'object':
                model_input[name] = model_input[name].round().astype(int)

        positive_prob = self.causal_model.predict_proba(model_input)[:, 1]
        # Cut at the percentile that leaves a share of about target_rate of
        # the rows at or above the threshold.
        cutoff = np.percentile(positive_prob, (1 - self.target_rate) * 100)
        return (positive_prob >= cutoff).astype(int)


def fit_misata(df):
    """Label-encode non-numeric columns of ``df`` and fit a MISATA synthesizer.

    The synthesizer ranks, sorts and interpolates column values, so every
    column must be numeric before fitting. The encoding is done on a copy
    (``df`` is not modified) and is deliberately part of the timed fit.

    Returns:
        The fitted ``MISATAIPFSynthesizer`` with ``income`` as target column.
    """
    df_encoded = df.copy()
    for col in df_encoded.columns:
        # Test for "not numeric" rather than dtype == 'object': text columns
        # may use pandas' dedicated string dtype or be categorical, and would
        # otherwise reach the synthesizer unencoded.
        if not pd.api.types.is_numeric_dtype(df_encoded[col]):
            df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))

    synth = MISATAIPFSynthesizer(target_col='income', random_state=SEED)
    synth.fit(df_encoded)
    return synth

def generate_misata(model, n):
    """Return ``n`` synthetic rows sampled from the fitted MISATA ``model``."""
    generated = model.sample(n)
    return generated

# Time MISATA-IPF over five fit+generate runs; the fit timing includes the
# label encoding done inside fit_misata.
print("Benchmarking MISATA-IPF...")
misata_results = benchmark_with_timing('MISATA-IPF', fit_misata, generate_misata, df_sample, n_runs=5)
print(f"MISATA: Fit={misata_results['fit_time_mean']:.2f}s, Gen={misata_results['gen_time_mean']:.3f}s")

## Method 4: Faker (Baseline)

In [None]:
from faker import Faker

def fit_faker(df):
    """Return a minimal "model" for the Faker baseline.

    Nothing is learned from the data: only the column names and the row
    count of ``df`` are recorded.
    """
    column_names = [*df.columns]
    row_count = len(df)
    return dict(columns=column_names, n_rows=row_count)

def generate_faker(model, n):
    """Build ``n`` rows of random, schema-shaped data with Faker.

    ``model`` is accepted for interface parity with the other generators but
    is not used: the column list and value ranges are hard-coded below.
    Faker is re-seeded with ``SEED`` on every call.
    """
    fake = Faker()
    Faker.seed(SEED)

    def ints(low, high):
        # n random integers drawn with fake.random_int(low, high).
        return [fake.random_int(low, high) for _ in range(n)]

    def picks(options):
        # n random elements of ``options``.
        return [fake.random_element(options) for _ in range(n)]

    def clipped(provider):
        # n provider strings, each truncated to 20 characters.
        return [provider()[:20] for _ in range(n)]

    # Columns are generated one after another in this exact order, which
    # fixes the sequence of random draws for a given seed.
    column_values = [
        ('age', ints(17, 90)),
        ('workclass', picks(['Private', 'Self-emp', 'Gov', 'Other'])),
        ('fnlwgt', ints(10000, 1000000)),
        ('education', picks(['HS-grad', 'Some-college', 'Bachelors', 'Masters'])),
        ('education_num', ints(1, 16)),
        ('marital_status', picks(['Married', 'Single', 'Divorced'])),
        ('occupation', clipped(fake.job)),
        ('relationship', picks(['Husband', 'Wife', 'Own-child', 'Other'])),
        ('race', picks(['White', 'Black', 'Asian', 'Other'])),
        ('sex', picks(['Male', 'Female'])),
        ('capital_gain', ints(0, 100000)),
        ('capital_loss', ints(0, 5000)),
        ('hours_per_week', ints(1, 100)),
        ('native_country', clipped(fake.country)),
        ('income', picks(['<=50K', '>50K'])),
    ]
    return pd.DataFrame(dict(column_values))

# Time the Faker baseline over three runs; its "fit" only records the schema,
# so nearly all of its time is generation.
print("Benchmarking Faker...")
faker_results = benchmark_with_timing('Faker', fit_faker, generate_faker, df_sample, n_runs=3)
print(f"Faker: Fit={faker_results['fit_time_mean']:.4f}s, Gen={faker_results['gen_time_mean']:.2f}s")

## Results Summary

In [None]:
# Compile results: one row per method, columns are the keys of the dicts
# returned by benchmark_with_timing.
all_results = [ctgan_results, copula_results, misata_results, faker_results]
results_df = pd.DataFrame(all_results)

# Calculate speedups: ratio of CTGAN's total (fit + generate) time to each
# method's total time, so CTGAN itself is 1.0x.
ctgan_total = ctgan_results['total_time']
results_df['speedup_vs_ctgan'] = ctgan_total / results_df['total_time']

print("=" * 80)
print("FAIR PERFORMANCE BENCHMARK - SEPARATED TIMINGS")
print("=" * 80)
print(f"\nDataset: Adult Census, {len(df_sample):,} rows")
# NOTE(review): the sample count is hard-coded; it matches the default
# n_generate=10000 used by every benchmark call above. The run counts differ
# per method (2, 3, 5 and 3) and are available in results_df['n_runs'].
print(f"Generated: 10,000 samples")
print(f"Runs: Multiple (with std)")
print()

print("Detailed Results:")
print("-" * 80)
print(f"{'Method':<20} {'Fit Time':<15} {'Gen Time':<15} {'Total':<12} {'Gen Throughput':<15} {'Speedup'}")
print("-" * 80)

# One formatted line per method: mean ± std for the two timed phases, then
# total time, generation throughput and speedup relative to CTGAN.
for _, row in results_df.iterrows():
    fit_str = f"{row['fit_time_mean']:.2f}s ±{row['fit_time_std']:.2f}"
    gen_str = f"{row['gen_time_mean']:.3f}s ±{row['gen_time_std']:.3f}"
    total_str = f"{row['total_time']:.2f}s"
    throughput = f"{row['gen_throughput']:,.0f}/s"
    speedup = f"{row['speedup_vs_ctgan']:.1f}x"
    
    print(f"{row['name']:<20} {fit_str:<15} {gen_str:<15} {total_str:<12} {throughput:<15} {speedup}")

print("-" * 80)

In [None]:
import matplotlib.pyplot as plt

# Visualization: three side-by-side bar charts (fit time, generation time,
# total time), one bar per method in results_df order.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

methods = results_df['name'].tolist()
# One colour per method, in the same order as `methods`.
colors = ['#e74c3c', '#3498db', '#2ecc71', '#9b59b6']

# Plot 1: Fit Time (error bars show the std across runs)
ax1 = axes[0]
bars1 = ax1.bar(methods, results_df['fit_time_mean'], yerr=results_df['fit_time_std'], 
                color=colors, capsize=5, alpha=0.8)
ax1.set_ylabel('Time (seconds)', fontsize=11)
ax1.set_title('Fit/Training Time', fontsize=12, fontweight='bold')
ax1.tick_params(axis='x', rotation=45)
# Value labels sit a fixed 0.5 s above each bar.
for bar, val in zip(bars1, results_df['fit_time_mean']):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
             f'{val:.1f}s', ha='center', fontsize=9)

# Plot 2: Generation Time (error bars show the std across runs)
ax2 = axes[1]
bars2 = ax2.bar(methods, results_df['gen_time_mean'], yerr=results_df['gen_time_std'],
                color=colors, capsize=5, alpha=0.8)
ax2.set_ylabel('Time (seconds)', fontsize=11)
ax2.set_title('Generation Time (10K samples)', fontsize=12, fontweight='bold')
ax2.tick_params(axis='x', rotation=45)
# Smaller fixed label offset (0.01 s) than in the other two panels.
for bar, val in zip(bars2, results_df['gen_time_mean']):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{val:.2f}s', ha='center', fontsize=9)

# Plot 3: Total Time (mean fit + mean generation; no error bars)
ax3 = axes[2]
bars3 = ax3.bar(methods, results_df['total_time'], color=colors, alpha=0.8)
ax3.set_ylabel('Time (seconds)', fontsize=11)
ax3.set_title('Total Time (Fit + Generate)', fontsize=12, fontweight='bold')
ax3.tick_params(axis='x', rotation=45)
# Each label shows the total time and, in parentheses, the speedup vs CTGAN.
for bar, val, speedup in zip(bars3, results_df['total_time'], results_df['speedup_vs_ctgan']):
    ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
             f'{val:.1f}s\n({speedup:.0f}x)', ha='center', fontsize=9)

plt.tight_layout()
# Save before show() so the figure written to disk is the one displayed.
plt.savefig('fair_performance_benchmark.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved fair_performance_benchmark.png")

In [None]:
# Save results: the full per-method table, including the speedup column,
# is written to the working directory.
results_df.to_csv('fair_performance_results.csv', index=False)

print("\n" + "=" * 80)
print("EXPERIMENT COMPLETE - FAIR COMPARISON")
print("=" * 80)
print("\nKey Findings (HONEST):")
# Findings 1-2 report mean fit and generation time; finding 3 compares total
# (fit + generate) time and finding 4 compares generation time only.
print(f"  1. CTGAN takes {ctgan_results['fit_time_mean']:.0f}s to train, {ctgan_results['gen_time_mean']:.1f}s to generate")
print(f"  2. MISATA takes {misata_results['fit_time_mean']:.1f}s to fit, {misata_results['gen_time_mean']:.3f}s to generate")
print(f"  3. Total time speedup: {ctgan_total / misata_results['total_time']:.0f}x")
print(f"  4. Generation-only speedup: {ctgan_results['gen_time_mean'] / misata_results['gen_time_mean']:.0f}x")
print("\nThis is a FAIR comparison with all timings separated.")
print("\nFiles saved:")
print("  - fair_performance_benchmark.png")
print("  - fair_performance_results.csv")