# Experiment 23: Additional Baselines (TVAE, Higher-Order Tests)

Addresses reviewer concerns:
1. Missing TVAE baseline
2. Higher-order dependency tests
3. Heavy-tail distribution analysis

In [None]:
!pip install -q sdv

In [None]:
import numpy as np
import pandas as pd
import time
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides numerical RuntimeWarnings raised by NumPy/SciPy.
warnings.filterwarnings('ignore')

# Single seed reused below for row sampling, the train/test split and the
# evaluation classifiers.
SEED = 42
# Seeds NumPy's legacy global random state.
np.random.seed(SEED)
print("Setup complete.")

In [None]:
# Load Adult Census
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']
# skipinitialspace=True strips the blank that follows each comma before a field
# is compared with na_values, so the missing-value marker has to be '?'.
# With ' ?' nothing matched: dropna() was a no-op and '?' was label-encoded
# below as if it were a genuine category.
df_raw = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
# Drop incomplete rows, then keep a reproducible 5000-row subsample.
df_raw = df_raw.dropna().reset_index(drop=True).sample(5000, random_state=SEED)
# Binary target: 1 for '>50K', 0 otherwise.
df_raw['income'] = (df_raw['income'] == '>50K').astype(int)

# Integer-encode the string-valued columns (one independent encoder per column).
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']:
    df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

# 80/20 split; the test part is used only for scoring downstream classifiers.
train_df, test_df = train_test_split(df_raw, test_size=0.2, random_state=SEED)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

## Part 1: TVAE Baseline

In [None]:
from sdv.single_table import TVAESynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

# Build the SDV table metadata by auto-detecting column types from the
# training split; the same metadata object is passed to every SDV synthesizer.
# NOTE(review): the categorical columns were label-encoded to integers in the
# loading cell, so detection presumably types them as numerical -- confirm, and
# consider declaring them categorical so the SDV baselines are not handicapped.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_df)

print("SDV models ready.")

In [None]:
# MISATA
class MISATASynthesizer:
    """Copula-style tabular synthesizer whose target column comes from a classifier.

    ``fit`` keeps every column's observed values, estimates a correlation
    matrix on rank-based normal scores and trains a gradient-boosting
    classifier for ``target_col``. ``sample`` draws correlated normals, maps
    them back through the stored values and labels the rows by thresholding
    the classifier's predicted probabilities.
    """

    def __init__(self, target_col='income', random_state=42):
        """Store the name of the target column and the seed used by fit/sample."""
        self.target_col = target_col
        self.random_state = random_state

    def fit(self, df):
        """Learn marginals, dependence structure and the target model from ``df``.

        Returns ``self`` so that calls can be chained.
        """
        self.columns = list(df.columns)

        # Keep a private copy of each column's raw values for the inverse mapping.
        self.marginals = {}
        for name in self.columns:
            self.marginals[name] = {'values': df[name].values.copy()}

        # Ranks scaled by n + 1 so that every value lies strictly inside (0, 1).
        denominator = len(df) + 1
        rank_uniform = df.copy()
        for name in self.columns:
            rank_uniform[name] = stats.rankdata(df[name]) / denominator

        def _to_normal_scores(series):
            # Clip before the probit so the scores stay finite.
            return stats.norm.ppf(np.clip(series, 0.001, 0.999))

        normal_scores = rank_uniform.apply(_to_normal_scores)

        # Constant columns give NaN correlations; treat them as uncorrelated.
        corr = np.nan_to_num(normal_scores.corr().values, nan=0.0)
        np.fill_diagonal(corr, 1.0)

        # Floor the eigenvalues so the Cholesky factorisation cannot fail.
        eigenvalues, eigenvectors = np.linalg.eigh(corr)
        eigenvalues = np.maximum(eigenvalues, 1e-6)
        repaired = eigenvectors @ np.diag(eigenvalues) @ eigenvectors.T
        self.cholesky = np.linalg.cholesky(repaired)

        # Classifier that will label synthetic rows from their features.
        predictors = [name for name in self.columns if name != self.target_col]
        self.target_model = GradientBoostingClassifier(
            n_estimators=50,
            max_depth=4,
            random_state=self.random_state,
        )
        self.target_model.fit(df[predictors], df[self.target_col])
        self.feature_cols = predictors
        self.target_rate = df[self.target_col].mean()
        return self

    def sample(self, n_samples):
        """Generate ``n_samples`` synthetic rows with the columns seen in ``fit``.

        The generator is re-created from ``random_state`` on every call, so
        repeated calls with the same ``n_samples`` return the same rows.
        """
        rng = np.random.default_rng(self.random_state)
        latent = rng.standard_normal((n_samples, len(self.columns)))
        # Correlate the draws, then map them to clipped uniform scores.
        u = np.clip(stats.norm.cdf(latent @ self.cholesky.T), 0.001, 0.999)

        generated = {}
        for idx, name in enumerate(self.columns):
            if name != self.target_col:
                # Inverse empirical CDF by linear interpolation between the
                # sorted observed values.
                # NOTE(review): this also interpolates between integer codes
                # of label-encoded columns -- confirm that is intended.
                observed = np.sort(self.marginals[name]['values'])
                grid = np.linspace(0, 1, len(observed))
                generated[name] = np.interp(u[:, idx], grid, observed)

        features = pd.DataFrame({name: generated[name] for name in self.feature_cols})
        positive_prob = self.target_model.predict_proba(features)[:, 1]
        # Cut at the (1 - target_rate) percentile of the predicted probabilities.
        cutoff = np.percentile(positive_prob, (1 - self.target_rate) * 100)
        generated[self.target_col] = (positive_prob >= cutoff).astype(int)
        return pd.DataFrame(generated)[self.columns]

In [None]:
# Benchmark all methods
# Reset the results list; it is rebuilt by the evaluation cell.
results = []

# Every timing below covers construction, fitting and sampling of one method.
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# durations cannot be distorted by system clock adjustments (time.time() can).
# NOTE(review): no seed is passed to the SDV synthesizers, so their samples
# may differ between runs -- confirm whether that is acceptable.

# MISATA
print("MISATA...")
start = time.perf_counter()
misata = MISATASynthesizer()
misata.fit(train_df)
df_misata = misata.sample(len(train_df))
misata_time = time.perf_counter() - start

# TVAE
print("TVAE...")
start = time.perf_counter()
tvae = TVAESynthesizer(metadata, epochs=100)
tvae.fit(train_df)
df_tvae = tvae.sample(len(train_df))
tvae_time = time.perf_counter() - start

# GaussianCopula
print("GaussianCopula...")
start = time.perf_counter()
gc = GaussianCopulaSynthesizer(metadata)
gc.fit(train_df)
df_gc = gc.sample(len(train_df))
gc_time = time.perf_counter() - start

print("All methods complete.")

In [None]:
# Evaluate
def evaluate(syn_df, name, total_time):
    """Score one synthetic dataset on downstream utility and marginal fidelity.

    A random forest is trained on ``syn_df`` and scored by ROC AUC on the
    module-level ``test_df`` (train-synthetic, test-real). Fidelity is the
    mean of ``1 - KS statistic`` between ``train_df`` and ``syn_df`` over all
    columns of ``train_df``.

    Returns a dict with the keys 'method', 'time', 'tstr' and 'fidelity'.
    """
    # Train on synthetic, test on real.
    classifier = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
    classifier.fit(syn_df.drop('income', axis=1), syn_df['income'])
    positive_scores = classifier.predict_proba(test_df.drop('income', axis=1))[:, 1]
    tstr = roc_auc_score(test_df['income'], positive_scores)

    # Marginal fidelity: one two-sample KS comparison per column.
    ks_complements = []
    for col in train_df.columns:
        ks_statistic = stats.ks_2samp(train_df[col], syn_df[col])[0]
        ks_complements.append(1 - ks_statistic)
    fidelity = np.mean(ks_complements)

    return {
        'method': name,
        'time': total_time,
        'tstr': tstr,
        'fidelity': fidelity,
    }

# TRTR: train on real, test on real. This AUC is the reference for the ratios.
model_real = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
model_real.fit(train_df.drop('income', axis=1), train_df['income'])
trtr = roc_auc_score(
    test_df['income'],
    model_real.predict_proba(test_df.drop('income', axis=1))[:, 1],
)

# One result record per synthesizer: MISATA, TVAE, GaussianCopula.
results = [
    evaluate(syn_df, label, elapsed)
    for syn_df, label, elapsed in (
        (df_misata, 'MISATA', misata_time),
        (df_tvae, 'TVAE', tvae_time),
        (df_gc, 'GaussianCopula', gc_time),
    )
]

# Express each synthetic-data AUC relative to the real-data AUC.
for r in results:
    r.update(tstr_ratio=r['tstr'] / trtr)

In [None]:
# Banner, real-data reference AUC, then the per-method table.
print("\n" + "="*60, "BASELINE COMPARISON (incl. TVAE)", "="*60, sep="\n")
print(f"\nTRTR: {trtr:.4f}\n")

# results_df is also written to disk by the final cell.
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

## Part 2: Higher-Order Dependency Tests

In [None]:
def _bin_on_real_edges(real_col, syn_col, n_bins=5, max_levels=10):
    """Discretise one column of both tables on a grid derived from the real data.

    If the real column has at most ``max_levels`` distinct values, both columns
    are returned unchanged. Otherwise quantile edges are computed on the real
    column and the same edges are applied to both, so a bin label denotes the
    same value range in either table.
    """
    if real_col.nunique() <= max_levels:
        return real_col, syn_col
    _, edges = pd.qcut(real_col, n_bins, retbins=True, duplicates='drop')
    edges = np.array(edges, dtype=float)
    # Open the outer bins so synthetic values beyond the real range are still
    # assigned to a bin instead of becoming NaN (and vanishing from groupby).
    edges[0], edges[-1] = -np.inf, np.inf
    return (
        pd.cut(real_col, bins=edges, labels=False),
        pd.cut(syn_col, bins=edges, labels=False),
    )


def higher_order_test(real_df, syn_df, cols):
    """
    Test 3-way interaction preservation.

    Compares the joint distribution P(a, b, c) of the three columns in
    ``cols`` between the real and the synthetic table. Columns with more than
    10 distinct real values are cut into up to 5 quantile bins whose edges
    come from the real data and are shared by both tables; other columns are
    compared on their raw values, so synthetic values that do not occur in the
    real data count as discrepancy.

    Returns ``1 - TVD`` (total variation distance), a value in [0, 1] where
    higher is better. Neither input frame is modified.
    """
    a, b, c = cols
    keys = [a + '_bin', b + '_bin', c + '_bin']

    # Bin continuous variables on a common grid.
    real_parts, syn_parts = {}, {}
    for col, key in zip((a, b, c), keys):
        real_parts[key], syn_parts[key] = _bin_on_real_edges(real_df[col], syn_df[col])
    real_binned = pd.DataFrame(real_parts)
    syn_binned = pd.DataFrame(syn_parts)

    # Compute joint distributions
    real_joint = real_binned.groupby(keys).size() / len(real_df)
    syn_joint = syn_binned.groupby(keys).size() / len(syn_df)

    # Align indices: a cell missing from one table has probability 0 there.
    all_idx = real_joint.index.union(syn_joint.index)
    real_joint = real_joint.reindex(all_idx, fill_value=0)
    syn_joint = syn_joint.reindex(all_idx, fill_value=0)

    # Total variation distance
    tvd = np.abs(real_joint - syn_joint).sum() / 2

    return 1 - tvd  # Higher = better

# Column triplets whose joint distribution is compared, always paired with income.
triplets = [
    ['age', 'education_num', 'income'],
    ['hours_per_week', 'capital_gain', 'income'],
    ['age', 'sex', 'income']
]

print("\n" + "="*60, "HIGHER-ORDER DEPENDENCY PRESERVATION", "="*60, sep="\n")

for triplet in triplets:
    # Score the three synthetic tables against the real training data.
    misata_score, tvae_score, gc_score = (
        higher_order_test(train_df, syn_df, triplet)
        for syn_df in (df_misata, df_tvae, df_gc)
    )

    print(
        f"\n{triplet}:",
        f"  MISATA: {misata_score:.3f}",
        f"  TVAE: {tvae_score:.3f}",
        f"  GaussianCopula: {gc_score:.3f}",
        sep="\n",
    )

## Part 3: Tail Distribution Analysis

In [None]:
def _magnitude_ratio(x, y):
    """Return a similarity in [0, 1] between two percentile values.

    Equal values (including 0 and 0) score 1.0. Values of opposite sign, or
    exactly one zero, score 0.0. Otherwise the result is the smaller magnitude
    divided by the larger one.
    """
    if x == y:
        return 1.0
    if x == 0 or y == 0 or (x > 0) != (y > 0):
        return 0.0
    smaller, larger = sorted((abs(x), abs(y)))
    return smaller / larger


def tail_preservation(real_df, syn_df, col, percentile=95):
    """Test if extreme values are preserved.

    Compares the ``percentile``-th and ``(100 - percentile)``-th percentiles
    of ``col`` in the real and synthetic tables.

    Returns a score in [0, 1]: the mean of the upper- and lower-tail ratios
    when both lower percentiles are positive, otherwise the upper-tail ratio
    alone.
    """
    real_upper = np.percentile(real_df[col], percentile)
    syn_upper = np.percentile(syn_df[col], percentile)

    real_lower = np.percentile(real_df[col], 100 - percentile)
    syn_lower = np.percentile(syn_df[col], 100 - percentile)

    # A plain min/max quotient is NaN when both percentiles are 0 and leaves
    # [0, 1] for negative percentiles; the helper handles both cases.
    upper_ratio = _magnitude_ratio(syn_upper, real_upper)

    # The lower tail only contributes when it is strictly positive in both
    # tables (e.g. not for columns whose low percentile is 0).
    if min(syn_lower, real_lower) > 0:
        lower_ratio = _magnitude_ratio(syn_lower, real_lower)
        return (upper_ratio + lower_ratio) / 2
    return upper_ratio

print("\n" + "="*60, "TAIL DISTRIBUTION PRESERVATION (95th percentile)", "="*60, sep="\n")

# Numeric columns whose tails are compared against the real training data.
tail_cols = ['age', 'capital_gain', 'hours_per_week']

for col in tail_cols:
    # One score per synthetic table, using the default percentile.
    misata_tail, tvae_tail, gc_tail = (
        tail_preservation(train_df, syn_df, col)
        for syn_df in (df_misata, df_tvae, df_gc)
    )

    print(
        f"\n{col}:",
        f"  MISATA: {misata_tail:.3f}",
        f"  TVAE: {tvae_tail:.3f}",
        f"  GaussianCopula: {gc_tail:.3f}",
        sep="\n",
    )

In [None]:
# Persist the comparison table, then print the closing summary.
results_df.to_csv('additional_baselines_results.csv', index=False)

print(
    "\n" + "="*60,
    "EXPERIMENT 23 COMPLETE",
    "="*60,
    "\nKey Findings:",
    "  - MISATA vs TVAE: Direct comparison available",
    "  - Higher-order dependencies tested",
    "  - Tail preservation analyzed",
    "\nFile saved: additional_baselines_results.csv",
    sep="\n",
)