# Experiment 16: MISATA v2 - Improved Synthesizer

## Issues from Experiment 15 to Fix

| Issue | Problem | Solution |
|-------|---------|----------|
| Kendall τ = 0.003 | Metric bug (computed sample-wise instead of on the correlation matrices) | Fix computation |
| MIA AUC = 0.868 | Synthetic too distinguishable | Add noise injection |
| Cover Type 89.8% | Struggles on high-dimensional data | Better marginal sampling |

## Improvements Implemented
1. **Noise injection** to improve privacy
2. **Bootstrap marginal sampling** for diversity
3. **Fixed correlation metrics**

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Single seed reused below for data sub-sampling, train/test splitting and model training.
SEED = 42
# Seeds NumPy's legacy global RNG; the synthesizer creates its own default_rng generators.
np.random.seed(SEED)
print("Setup complete.")

## MISATA v2: Privacy-Enhanced Synthesizer

In [None]:
class MISATAv2Synthesizer:
    """
    MISATA v2 with privacy enhancements:
    1. Noise injection for privacy
    2. Bootstrap marginal sampling for diversity
    3. Laplace noise on correlations

    Columns are coupled through a Gaussian copula (ranks -> normal scores ->
    correlation matrix -> Cholesky factor). Feature values are produced by
    interpolating the empirical quantiles of each column, and the optional
    target column is predicted from the synthetic features by a
    gradient-boosting model fitted on the training data.

    Parameters
    ----------
    target_col : str or None
        Column generated by the target model instead of by the copula.
    task : str
        'classification' fits a GradientBoostingClassifier; any other value
        fits a GradientBoostingRegressor.
    noise_scale : float
        Privacy knob. Scales the Laplace noise added to the correlation
        matrix, the Gaussian noise added to sampled features, and the
        regression target noise. 0 disables the first two.
    random_state : int
        Seed for the correlation noise, the target model, and the default
        sampling seed.
    """

    def __init__(self, target_col=None, task='classification',
                 noise_scale=0.1, random_state=42):
        self.target_col = target_col
        self.task = task
        self.noise_scale = noise_scale  # Privacy parameter
        self.random_state = random_state

    @staticmethod
    def _repair_correlation(corr_matrix, min_eig=1e-6):
        """Return a symmetric, positive-definite, unit-diagonal version of corr_matrix."""
        corr = np.array(corr_matrix, dtype=float)
        np.fill_diagonal(corr, 1.0)
        corr = np.clip(corr, -0.99, 0.99)
        np.fill_diagonal(corr, 1.0)

        # Lift non-positive eigenvalues so the matrix becomes positive definite.
        eigvals, eigvecs = np.linalg.eigh(corr)
        eigvals = np.maximum(eigvals, min_eig)
        corr = eigvecs @ np.diag(eigvals) @ eigvecs.T
        corr = (corr + corr.T) / 2

        # Lifting eigenvalues inflates the diagonal. Overwriting it with 1.0
        # could make the matrix indefinite again, whereas rescaling by
        # D^-1/2 C D^-1/2 restores the unit diagonal and keeps it positive definite.
        scale = np.sqrt(np.diag(corr))
        corr = corr / np.outer(scale, scale)
        corr = (corr + corr.T) / 2
        np.fill_diagonal(corr, 1.0)  # only removes floating-point residue now
        return corr

    def fit(self, df):
        """
        Learn marginals, the copula correlation and (optionally) the target model.

        Parameters
        ----------
        df : pandas.DataFrame
            Training table. min/max/std and ranking are applied to every
            column, so columns are presumably all numeric (categoricals
            encoded beforehand) -- confirm against the caller.

        Returns
        -------
        self
        """
        self.columns = list(df.columns)
        self.n_train = len(df)

        # Store the empirical marginal of every column plus the statistics
        # used later to scale and clip the injected noise.
        self.marginals = {}
        for col in self.columns:
            values = df[col].values.copy()
            self.marginals[col] = {
                'values': values,
                'min': values.min(),
                'max': values.max(),
                'std': values.std()
            }

        # Learn correlation via copula: ranks -> (0, 1) -> normal scores.
        uniform_df = df.copy()
        for col in self.columns:
            uniform_df[col] = stats.rankdata(df[col]) / (len(df) + 1)

        normal_df = uniform_df.apply(lambda x: stats.norm.ppf(np.clip(x, 0.001, 0.999)))
        corr_matrix = normal_df.corr().values
        # Constant columns yield NaN correlations; treat them as uncorrelated.
        corr_matrix = np.nan_to_num(corr_matrix, nan=0.0)

        # Perturb the correlations with symmetric Laplace noise.
        # NOTE(review): the scale is a fixed heuristic (noise_scale * 0.1) with
        # no sensitivity/epsilon calibration visible here, so this should not
        # be read as a formal differential-privacy guarantee.
        if self.noise_scale > 0:
            rng = np.random.default_rng(self.random_state)
            noise = rng.laplace(0, self.noise_scale * 0.1, corr_matrix.shape)
            noise = (noise + noise.T) / 2  # Symmetric
            corr_matrix = corr_matrix + noise

        # Ensure a valid correlation matrix before factorising it.
        corr_matrix = self._repair_correlation(corr_matrix)
        try:
            self.cholesky = np.linalg.cholesky(corr_matrix)
        except np.linalg.LinAlgError:
            # Last resort for borderline rounding cases: shrink slightly toward
            # the identity, which keeps the unit diagonal and raises every
            # eigenvalue by 0.01.
            dim = corr_matrix.shape[0]
            corr_matrix = 0.99 * corr_matrix + 0.01 * np.eye(dim)
            self.cholesky = np.linalg.cholesky(corr_matrix)

        # Target model
        if self.target_col and self.target_col in self.columns:
            feature_cols = [c for c in self.columns if c != self.target_col]
            if self.task == 'classification':
                self.target_model = GradientBoostingClassifier(
                    n_estimators=50, max_depth=4, random_state=self.random_state
                )
            else:
                from sklearn.ensemble import GradientBoostingRegressor
                self.target_model = GradientBoostingRegressor(
                    n_estimators=50, max_depth=4, random_state=self.random_state
                )
            self.target_model.fit(df[feature_cols], df[self.target_col])
            self.feature_cols = feature_cols
            # The mean is used as the positive-class rate, which assumes a
            # binary 0/1 target -- TODO confirm for other label encodings.
            self.target_rate = df[self.target_col].mean() if self.task == 'classification' else None

        return self

    def sample(self, n_samples, seed=None):
        """
        Draw a synthetic table with the same columns as the training data.

        Parameters
        ----------
        n_samples : int
            Number of synthetic rows.
        seed : int or None
            Seed for this draw; falls back to ``random_state`` when None, so
            repeated calls without a seed return the same rows.

        Returns
        -------
        pandas.DataFrame
            Columns in the same order as the frame passed to ``fit``.
        """
        if seed is None:
            seed = self.random_state
        rng = np.random.default_rng(seed)

        # Correlated uniform sampling
        z = rng.standard_normal((n_samples, len(self.columns)))
        uniform = stats.norm.cdf(z @ self.cholesky.T)
        uniform = np.clip(uniform, 0.001, 0.999)

        synthetic_data = {}
        for i, col in enumerate(self.columns):
            if col == self.target_col:
                continue  # generated by the target model below

            marginal = self.marginals[col]
            values = marginal['values']

            # Bootstrap the stored marginal so every draw interpolates a
            # slightly different empirical quantile function (adds diversity).
            bootstrap_idx = rng.choice(len(values), size=len(values), replace=True)
            sorted_vals = np.sort(values[bootstrap_idx])
            positions = np.linspace(0, 1, len(sorted_vals))

            synth_vals = np.interp(uniform[:, i], positions, sorted_vals)

            # Add small noise for privacy, then clip to the observed range
            # widened by one standard deviation on each side.
            if self.noise_scale > 0:
                std = marginal['std']
                noise = rng.normal(0, std * self.noise_scale * 0.1, n_samples)
                synth_vals = np.clip(synth_vals + noise,
                                     marginal['min'] - std,
                                     marginal['max'] + std)

            synthetic_data[col] = synth_vals

        # Generate target with noise
        if self.target_col and self.target_col in self.columns:
            X_synth = pd.DataFrame({c: synthetic_data[c] for c in self.feature_cols})
            if self.task == 'classification':
                # Column 1 of predict_proba is read as the positive class
                # (binary target assumed).
                probs = self.target_model.predict_proba(X_synth)[:, 1]
                # Jitter the scores, then threshold at the quantile that
                # reproduces the training positive rate.
                noisy_probs = probs + rng.normal(0, 0.05, len(probs))
                threshold = np.percentile(noisy_probs, (1 - self.target_rate) * 100)
                synthetic_data[self.target_col] = (noisy_probs >= threshold).astype(int)
            else:
                pred = self.target_model.predict(X_synth)
                # Add regression noise
                noise = rng.normal(0, np.std(pred) * self.noise_scale, len(pred))
                synthetic_data[self.target_col] = pred + noise

        return pd.DataFrame(synthetic_data)[self.columns]

print("MISATA v2 Synthesizer defined.")

## Fixed Correlation Metrics

In [None]:
def fixed_correlation_metrics(real_df, synth_df):
    """
    FIXED: Compute correlation similarity correctly.
    Compare correlation MATRICES, not individual samples.

    The synthetic frame is aligned to the real frame's columns by name, so
    the result does not depend on column order. Each unordered column pair
    is counted once (upper triangle); pairs whose correlation is NaN in
    either frame (e.g. constant columns) are ignored.

    Parameters
    ----------
    real_df, synth_df : pandas.DataFrame
        Tables to compare. ``synth_df`` must contain every column of
        ``real_df``.

    Returns
    -------
    dict
        'pearson_matrix_corr'  : Pearson r between the two Pearson matrices
        'spearman_matrix_corr' : Pearson r between the two Spearman matrices
        'kendall_matrix_tau'   : Kendall tau between the two Pearson matrices
        'corr_mae'             : mean absolute difference of Pearson entries
        Values are NaN when fewer than two valid column pairs exist.

    Raises
    ------
    ValueError
        If ``synth_df`` lacks a column present in ``real_df``.
    """
    missing = [c for c in real_df.columns if c not in synth_df.columns]
    if missing:
        raise ValueError(f"synth_df is missing columns: {missing}")
    # Positional comparison is only meaningful when both matrices share the
    # same column order.
    synth_df = synth_df[list(real_df.columns)]

    # Strict upper triangle: every column pair once, diagonal excluded.
    upper = np.triu_indices(real_df.shape[1], k=1)

    def _paired_entries(method):
        """Return matching finite correlation entries of both frames."""
        real_flat = real_df.corr(method=method).values[upper]
        synth_flat = synth_df.corr(method=method).values[upper]
        valid = ~(np.isnan(real_flat) | np.isnan(synth_flat))
        return real_flat[valid], synth_flat[valid]

    def _similarity(a, b):
        """Pearson r between two flattened matrices, NaN if undefined."""
        if a.size < 2:
            return np.nan
        return np.corrcoef(a, b)[0, 1]

    metrics = {}

    # 1. Pearson correlation matrix similarity
    real_flat, synth_flat = _paired_entries('pearson')
    metrics['pearson_matrix_corr'] = _similarity(real_flat, synth_flat)

    # 2. Spearman correlation matrix similarity
    real_flat_s, synth_flat_s = _paired_entries('spearman')
    metrics['spearman_matrix_corr'] = _similarity(real_flat_s, synth_flat_s)

    # 3. Kendall tau between the flattened Pearson matrices
    if real_flat.size < 2:
        metrics['kendall_matrix_tau'] = np.nan
    else:
        tau, _ = stats.kendalltau(real_flat, synth_flat)
        metrics['kendall_matrix_tau'] = tau

    # 4. Mean Absolute Difference in correlations
    metrics['corr_mae'] = (
        np.mean(np.abs(real_flat - synth_flat)) if real_flat.size else np.nan
    )

    return metrics

print("Fixed correlation metrics defined.")

## Privacy Metrics

In [None]:
def compute_privacy_metrics(real_df, synth_df):
    """
    Compute distance-based and classifier-based privacy metrics.

    Returns a dict with:
      'dcr_mean'      : mean distance from each synthetic row to its nearest
                        real row, measured in the real data's standardized space
      'dcr_5th'       : 5th percentile of those distances
      'mia_auc'       : held-out ROC AUC of a random forest trained to tell
                        real rows (label 1) from synthetic rows (label 0)
      'mia_advantage' : 2 * (mia_auc - 0.5)
    """
    # Standardize both tables with statistics estimated on the real data only.
    standardizer = StandardScaler()
    real_std = standardizer.fit_transform(real_df)
    synth_std = standardizer.transform(synth_df)

    # Distance to closest record (DCR): nearest real neighbour of every synthetic row.
    neighbor_index = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
    neighbor_index.fit(real_std)
    nearest_dist, _ = neighbor_index.kneighbors(synth_std)

    metrics = {}
    metrics['dcr_mean'] = np.mean(nearest_dist)
    metrics['dcr_5th'] = np.percentile(nearest_dist, 5)

    # Balanced real-vs-synthetic classification task, capped at 1000 rows per side.
    n_per_side = min(1000, len(real_df), len(synth_df))
    real_part = real_df.sample(n_per_side, random_state=42)
    synth_part = synth_df.sample(n_per_side, random_state=42)
    features = pd.concat([real_part, synth_part], ignore_index=True)
    labels = np.array([1] * n_per_side + [0] * n_per_side)

    feat_fit, feat_eval, lab_fit, lab_eval = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    discriminator = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    discriminator.fit(feat_fit, lab_fit)
    real_scores = discriminator.predict_proba(feat_eval)[:, 1]

    auc = roc_auc_score(lab_eval, real_scores)
    metrics['mia_auc'] = auc
    metrics['mia_advantage'] = 2 * (auc - 0.5)

    return metrics

print("Privacy metrics defined.")

## Compare v1 vs v2

In [None]:
# Load Adult Census
print("Loading Adult Census...")
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']
# skipinitialspace=True strips the blank that follows every comma, so the
# missing-value marker reaches the NA check as a bare '?'. The previous
# na_values=' ?' never matched, which left those rows in after dropna() and
# let '?' be label-encoded as a regular category.
df_raw = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
# Drop incomplete rows, then work on a fixed 5000-row subsample.
df_raw = df_raw.dropna().reset_index(drop=True).sample(5000, random_state=SEED)
# Binary target: 1 for income '>50K', 0 otherwise.
df_raw['income'] = (df_raw['income'] == '>50K').astype(int)

# Integer-encode the categorical columns (encoders are fitted before the split).
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']:
    df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

train_df, test_df = train_test_split(df_raw, test_size=0.2, random_state=SEED)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

In [None]:
# Test different noise scales
noise_scales = [0.0, 0.1, 0.2, 0.3, 0.5]
results = []

print("\nComparing noise scales for privacy/utility tradeoff...\n")

# The held-out split and the train-on-real baseline (TRTR) do not depend on
# the noise scale, so they are computed once instead of on every iteration.
X_test = test_df.drop('income', axis=1)
y_test = test_df['income']

model_real = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
model_real.fit(train_df.drop('income', axis=1), train_df['income'])
trtr_auc = roc_auc_score(y_test, model_real.predict_proba(X_test)[:, 1])

for noise in noise_scales:
    print(f"Noise scale: {noise}")

    # Fit the synthesizer on the real training split and draw an equally sized table.
    synth = MISATAv2Synthesizer(target_col='income', noise_scale=noise, random_state=SEED)
    synth.fit(train_df)
    df_synth = synth.sample(len(train_df))

    # Correlation metrics
    corr = fixed_correlation_metrics(train_df, df_synth)

    # Privacy metrics
    privacy = compute_privacy_metrics(train_df, df_synth)

    # TSTR: train on synthetic, test on real
    X_synth = df_synth.drop('income', axis=1)
    y_synth = df_synth['income']

    model = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
    model.fit(X_synth, y_synth)
    tstr_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    result = {
        'noise_scale': noise,
        'pearson_corr': corr['pearson_matrix_corr'],
        'kendall_tau': corr['kendall_matrix_tau'],
        'corr_mae': corr['corr_mae'],
        'mia_auc': privacy['mia_auc'],
        'mia_advantage': privacy['mia_advantage'],
        'dcr_5th': privacy['dcr_5th'],
        'tstr_auc': tstr_auc,
        # Utility relative to the real-data baseline (1.0 = parity).
        'tstr_ratio': tstr_auc / trtr_auc
    }
    results.append(result)

    print(f"  Kendall τ: {corr['kendall_matrix_tau']:.4f}, MIA: {privacy['mia_auc']:.4f}, TSTR: {tstr_auc/trtr_auc:.2%}")

In [None]:
# One row per noise scale, columns as collected in the sweep above.
results_df = pd.DataFrame(results)

print("\n" + "="*80)
print("PRIVACY-UTILITY TRADEOFF ANALYSIS")
print("="*80)
print(results_df.to_string(index=False))

# Find optimal noise scale (best privacy while maintaining >90% TSTR)
viable = results_df[results_df['tstr_ratio'] > 0.90]
if len(viable) > 0:
    # Lowest discriminator AUC among the configurations that keep utility.
    best = viable.loc[viable['mia_auc'].idxmin()]
    print(f"\nOptimal Configuration:")
    print(f"  Noise Scale: {best['noise_scale']}")
    print(f"  MIA AUC: {best['mia_auc']:.4f} (lower = better privacy)")
    print(f"  TSTR Ratio: {best['tstr_ratio']:.2%}")
    print(f"  Kendall τ: {best['kendall_tau']:.4f}")
else:
    # Previously this case printed nothing, which was indistinguishable
    # from the cell not having run.
    print("\nNo configuration kept the TSTR ratio above 90%; no optimal noise scale selected.")

In [None]:
# Visualization: three side-by-side panels summarising the noise sweep.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes
noise_axis = results_df['noise_scale']


def _finish_panel(ax, ylabel, title, with_grid=True):
    """Apply the shared labels, title, legend and optional grid to one panel."""
    ax.set_xlabel('Noise Scale', fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend()
    if with_grid:
        ax.grid(True, alpha=0.3)


def _advantage_color(value):
    """Traffic-light colour for an MIA advantage value."""
    if value > 0.5:
        return 'red'
    if value > 0.2:
        return 'orange'
    return 'green'


# Panel 1: privacy (discriminator AUC) against utility (TSTR ratio),
# with dashed reference lines at chance-level AUC and the utility target.
ax1.plot(noise_axis, results_df['mia_auc'], 'b-o', label='MIA AUC (lower=better)', linewidth=2)
ax1.plot(noise_axis, results_df['tstr_ratio'], 'g-s', label='TSTR Ratio', linewidth=2)
ax1.axhline(y=0.5, color='blue', linestyle='--', alpha=0.5, label='Perfect Privacy (0.5)')
ax1.axhline(y=0.9, color='green', linestyle='--', alpha=0.5, label='90% Utility Target')
_finish_panel(ax1, 'Score', 'Privacy-Utility Tradeoff')

# Panel 2: how well the correlation structure survives the noise.
ax2.plot(noise_axis, results_df['pearson_corr'], 'r-o', label='Pearson', linewidth=2)
ax2.plot(noise_axis, results_df['kendall_tau'], 'b-s', label='Kendall τ', linewidth=2)
_finish_panel(ax2, 'Correlation Similarity', 'Correlation Preservation')

# Panel 3: MIA advantage per noise scale, colour-coded by severity.
colors = [_advantage_color(value) for value in results_df['mia_advantage']]
bars = ax3.bar(noise_axis.astype(str), results_df['mia_advantage'], color=colors, alpha=0.8)
ax3.axhline(y=0, color='green', linestyle='--', linewidth=2, label='Perfect Privacy (0)')
_finish_panel(ax3, 'MIA Advantage (0=best)', 'Privacy Level', with_grid=False)

plt.tight_layout()
plt.savefig('misata_v2_privacy_utility.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved misata_v2_privacy_utility.png")

In [None]:
# Persist the sweep table, then print the closing summary.
results_df.to_csv('misata_v2_results.csv', index=False)

banner = "="*80
summary_lines = [
    "\n" + banner,
    "EXPERIMENT 16 COMPLETE",
    banner,
    "\nKey Improvements:",
    "  1. Fixed Kendall τ calculation (matrix-based, not sample-based)",
    "  2. Added noise injection for privacy",
    "  3. Demonstrated privacy-utility tradeoff",
    "\nFiles saved:",
    "  - misata_v2_privacy_utility.png",
    "  - misata_v2_results.csv",
]
# One print call per entry keeps the output identical to separate print statements.
for line in summary_lines:
    print(line)