# Experiment 18: Differentially Private MISATA (DP-MISATA)

## Research Summary

State-of-the-art approaches for private synthetic data:

| Method | Mechanism | Privacy |
|--------|-----------|--------|
| **DP-Copula** | Noisy histograms + noisy correlation | ε-DP |
| **DPNPC** | Fourier perturbation for copulas | ε-DP |
| **Subsample-Aggregate** | Train on disjoint subsets | Amplification |

## Our Approach: DP-MISATA

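1. **Noisy Histogram Marginals**: Instead of the exact empirical CDF, use differentially private (Laplace-noised) histograms
2. **Laplace Correlation Noise**: Add calibrated Laplace noise to the covariance matrix, then convert it to a correlation matrix
3. **Subsampling**: Fit on a random subset of the rows for privacy amplification
4. **Bounded Sensitivity**: Rescale each feature to [-1, 1] so that a single record has a bounded effect on the covariance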

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category so the experiment log stays readable.
warnings.simplefilter('ignore')

# Single seed reused by the cells below; also seeds NumPy's global generator.
SEED = 42
np.random.seed(SEED)
print("Setup complete.")

## DP-MISATA: Differentially Private Synthesizer

In [None]:
class DPMISATASynthesizer:
    """
    Differentially Private MISATA.

    Gaussian-copula style synthesizer: per-column histograms and a correlation
    matrix are estimated on a random subsample, perturbed with Laplace noise,
    and then used to draw synthetic rows.

    Privacy mechanisms:
    1. Noisy histogram marginals (Laplace mechanism)
    2. Noisy correlation (Laplace on covariance)
    3. Subsampling for privacy amplification
    4. No access to raw data during generation

    NOTE(review): several steps read the data without added noise, so the
    output should not be reported as a formal epsilon-DP release as-is:
      * histogram bin edges and the min/max used for rescaling are taken
        directly from the data;
      * the target model is an ordinary gradient-boosting fit on the
        subsample; the 20% of the budget earmarked for it is never spent.
    """

    def __init__(self, target_col=None, task='classification',
                 epsilon=1.0, n_bins=50, subsample_ratio=0.5,
                 random_state=42):
        """
        Args:
            target_col: Name of the column generated by the target model
                (None = treat every column as a feature).
            task: 'classification' or anything else for regression.
            epsilon: Privacy budget (lower = more private)
            n_bins: Number of histogram bins per feature
            subsample_ratio: Fraction of data to use (amplification)
            random_state: Seed for subsampling, noise and default sampling.
        """
        self.target_col = target_col
        self.task = task
        self.epsilon = epsilon
        self.n_bins = n_bins
        self.subsample_ratio = subsample_ratio
        self.random_state = random_state
        # Generator used for the Laplace noise; created by fit() so that the
        # fitted model is reproducible for a given random_state.
        self._rng = None

    def _add_laplace_noise(self, value, sensitivity, epsilon_budget):
        """
        Add Laplace noise for differential privacy.

        Noise has scale ``sensitivity / epsilon_budget`` and the same shape as
        ``value`` (shape ``(1,)`` when ``value`` has no ``shape`` attribute).
        Draws from the generator created by ``fit``; outside of ``fit`` it
        falls back to NumPy's global random state.
        """
        if epsilon_budget <= 0:
            # NOTE(review): a non-positive budget returns the value
            # *unperturbed* (kept for backward compatibility); callers must
            # not rely on this path for privacy.
            return value
        scale = sensitivity / epsilon_budget
        size = value.shape if hasattr(value, 'shape') else 1
        sampler = getattr(self, '_rng', None)
        if sampler is None:
            sampler = np.random
        return value + sampler.laplace(0, scale, size)

    def _compute_dp_histogram(self, values, epsilon_budget):
        """
        Compute differentially private histogram.
        Sensitivity = 1 (adding/removing a person changes count by 1)

        Returns:
            (probabilities, bin_edges): noisy counts clipped at zero and
            normalised to sum to 1 (uniform if every noisy count is <= 0),
            plus the ``n_bins + 1`` edges returned by ``np.histogram``.
        """
        # Compute histogram (edges are data-dependent and not noised)
        hist, bin_edges = np.histogram(values, bins=self.n_bins)

        # Add Laplace noise (sensitivity = 1)
        noisy_hist = self._add_laplace_noise(hist.astype(float), 1.0, epsilon_budget)

        # Ensure non-negative and normalize
        noisy_hist = np.maximum(noisy_hist, 0)
        total = noisy_hist.sum()
        if total > 0:
            noisy_hist = noisy_hist / total
        else:
            noisy_hist = np.ones_like(noisy_hist) / len(noisy_hist)

        return noisy_hist, bin_edges

    def _compute_dp_correlation(self, df, epsilon_budget):
        """
        Compute differentially private correlation matrix.
        Uses covariance with bounded sensitivity.

        Returns:
            A symmetric, positive-definite matrix with unit diagonal.

        Raises:
            ValueError: if ``df`` has fewer than two rows (the sample
                covariance is undefined).
        """
        n = len(df)
        if n < 2:
            raise ValueError(
                f"Need at least 2 rows to estimate a covariance matrix, got {n}."
            )

        # Normalize to [-1, 1] for bounded sensitivity
        normalized = df.copy()
        for col in df.columns:
            min_val, max_val = df[col].min(), df[col].max()
            if max_val > min_val:
                normalized[col] = 2 * (df[col] - min_val) / (max_val - min_val) - 1
            else:
                # Constant column: no spread to rescale
                normalized[col] = 0

        # Compute covariance
        cov_matrix = normalized.cov().values

        # Sensitivity for covariance with bounded data: O(1/n)
        # For correlation: ~4/n (each entry bounded by [-1,1])
        sensitivity = 4.0 / n

        # Add noise
        noisy_cov = self._add_laplace_noise(cov_matrix, sensitivity, epsilon_budget)

        # Make symmetric (noise is drawn independently per entry)
        noisy_cov = (noisy_cov + noisy_cov.T) / 2

        # Convert to correlation; abs() guards against noisy negative variances
        diag = np.sqrt(np.abs(np.diag(noisy_cov)))
        diag[diag == 0] = 1
        corr_matrix = noisy_cov / np.outer(diag, diag)

        # Ensure valid correlation entries
        corr_matrix = np.clip(corr_matrix, -1, 1)
        np.fill_diagonal(corr_matrix, 1.0)

        # Make positive definite by flooring the eigenvalues
        eigvals, eigvecs = np.linalg.eigh(corr_matrix)
        eigvals = np.maximum(eigvals, 1e-6)
        corr_matrix = eigvecs @ np.diag(eigvals) @ eigvecs.T
        corr_matrix = (corr_matrix + corr_matrix.T) / 2

        # Restore the unit diagonal by congruence (D^-1/2 A D^-1/2), which
        # keeps the matrix positive definite. Overwriting the diagonal with 1
        # directly can lower it and make the matrix indefinite again, which
        # breaks the Cholesky factorisation in fit().
        unit_scale = np.sqrt(np.diag(corr_matrix))
        corr_matrix = corr_matrix / np.outer(unit_scale, unit_scale)
        np.fill_diagonal(corr_matrix, 1.0)  # removes rounding error only

        return corr_matrix

    def fit(self, df):
        """
        Fit with differential privacy.

        Args:
            df: Numeric DataFrame; every column gets a noisy histogram.

        Returns:
            self

        Raises:
            ValueError: if the subsample would contain fewer than two rows.
        """
        rng = np.random.default_rng(self.random_state)
        # Noise is drawn from the same seeded generator as the subsample so a
        # given random_state always yields the same fitted model.
        self._rng = rng

        # Subsample for privacy amplification
        n_subsample = int(len(df) * self.subsample_ratio)
        if n_subsample < 2:
            raise ValueError(
                f"Subsample has {n_subsample} row(s); need at least 2 "
                f"(len(df)={len(df)}, subsample_ratio={self.subsample_ratio})."
            )
        subsample_idx = rng.choice(len(df), size=n_subsample, replace=False)
        df_sub = df.iloc[subsample_idx].copy()

        self.columns = list(df.columns)
        self.n_train = len(df_sub)

        # Allocate privacy budget
        # Split: 40% for marginals (shared evenly across columns) and 40% for
        # correlation. The remaining 20% is earmarked for the target model but
        # is not spent: that model is trained without added noise.
        eps_marginal = self.epsilon * 0.4 / len(self.columns)  # Per column
        eps_corr = self.epsilon * 0.4

        # Learn DP marginals
        self.marginal_hists = {}
        self.marginal_edges = {}

        for col in self.columns:
            hist, edges = self._compute_dp_histogram(df_sub[col].values, eps_marginal)
            self.marginal_hists[col] = hist
            self.marginal_edges[col] = edges

        # Learn DP correlation
        self.corr_matrix = self._compute_dp_correlation(df_sub, eps_corr)

        # Cholesky decomposition (used to correlate the Gaussian draws)
        self.cholesky = np.linalg.cholesky(self.corr_matrix)

        # Target model (trained on the subsample only; no noise is added)
        if self.target_col and self.target_col in self.columns:
            feature_cols = [c for c in self.columns if c != self.target_col]

            # Use very simple model to limit information leakage
            if self.task == 'classification':
                self.target_model = GradientBoostingClassifier(
                    n_estimators=20, max_depth=3,
                    subsample=0.5,  # Additional subsampling
                    random_state=self.random_state
                )
            else:
                from sklearn.ensemble import GradientBoostingRegressor
                self.target_model = GradientBoostingRegressor(
                    n_estimators=20, max_depth=3,
                    subsample=0.5,
                    random_state=self.random_state
                )
            self.target_model.fit(df_sub[feature_cols], df_sub[self.target_col])
            self.feature_cols = feature_cols
            # Positive-class rate, reused in sample() to match class balance
            self.target_rate = df_sub[self.target_col].mean() if self.task == 'classification' else None

        return self

    def sample(self, n_samples, seed=None):
        """
        Generate samples from DP model (no access to original data).

        Args:
            n_samples: Number of synthetic rows.
            seed: Seed for the draw; defaults to ``random_state``.

        Returns:
            DataFrame with the fitted columns in their original order.
        """
        if seed is None:
            seed = self.random_state
        rng = np.random.default_rng(seed)

        # Generate correlated uniforms
        z = rng.standard_normal((n_samples, len(self.columns)))
        uniform = stats.norm.cdf(z @ self.cholesky.T)
        uniform = np.clip(uniform, 0.001, 0.999)

        synthetic_data = {}
        for i, col in enumerate(self.columns):
            if col == self.target_col:
                # The target is produced by the target model below
                continue

            # Sample from noisy histogram
            hist = self.marginal_hists[col]
            edges = self.marginal_edges[col]

            # Convert uniform to bin index, then to value
            cumsum = np.cumsum(hist)
            cumsum = cumsum / cumsum[-1]  # Normalize

            bin_indices = np.searchsorted(cumsum, uniform[:, i])
            bin_indices = np.clip(bin_indices, 0, len(edges) - 2)

            # Sample uniformly within bin
            low = edges[bin_indices]
            high = edges[bin_indices + 1]
            synthetic_data[col] = rng.uniform(low, high)

        # Generate target
        if self.target_col and self.target_col in self.columns:
            X_synth = pd.DataFrame({c: synthetic_data[c] for c in self.feature_cols})
            if self.task == 'classification':
                probs = self.target_model.predict_proba(X_synth)[:, 1]
                # Threshold at the quantile that reproduces the fitted
                # positive-class rate
                threshold = np.percentile(probs, (1 - self.target_rate) * 100)
                synthetic_data[self.target_col] = (probs >= threshold).astype(int)
            else:
                synthetic_data[self.target_col] = self.target_model.predict(X_synth)

        return pd.DataFrame(synthetic_data)[self.columns]

print("DP-MISATA Synthesizer defined.")

## Evaluation Functions

In [None]:
def compute_metrics(real_train, real_test, synth, target='income', random_state=42):
    """
    Compute fidelity, privacy-proxy and utility metrics for a synthetic table.

    Args:
        real_train: Real training DataFrame the synthesizer was fitted on.
        real_test: Held-out real DataFrame used to score the TSTR/TRTR models.
        synth: Synthetic DataFrame with the same columns as ``real_train``.
        target: Name of the binary label column used for TSTR/TRTR.
        random_state: Seed for row sampling, the split and the forests.

    Returns:
        dict with keys:
          * ``corr_similarity``: Pearson correlation between the off-diagonal
            entries of the real and synthetic correlation matrices (0 when
            fewer than two valid pairs exist).
          * ``marginal_fidelity``: mean of ``1 - KS statistic`` over columns.
          * ``mia_auc``: AUC of a random forest trained to tell real training
            rows (label 1) from synthetic rows (label 0).
          * ``tstr_auc``: AUC on ``real_test`` of a forest trained on ``synth``.
          * ``tstr_ratio``: ``tstr_auc`` divided by the AUC of the same model
            trained on ``real_train``.
    """
    results = {}

    # Correlation: compare off-diagonal entries only (the diagonal is always 1)
    real_corr = real_train.corr().values
    synth_corr = synth.corr().values
    off_diagonal = ~np.eye(real_corr.shape[0], dtype=bool)
    real_flat = real_corr[off_diagonal]
    synth_flat = synth_corr[off_diagonal]
    valid = ~(np.isnan(real_flat) | np.isnan(synth_flat))
    results['corr_similarity'] = np.corrcoef(real_flat[valid], synth_flat[valid])[0, 1] if valid.sum() > 1 else 0

    # Marginal fidelity
    ks_scores = [
        1 - stats.ks_2samp(real_train[col], synth[col])[0]
        for col in real_train.columns
    ]
    results['marginal_fidelity'] = np.mean(ks_scores)

    # MIA (real-vs-synthetic discriminator on equally sized samples)
    n_test = min(1000, len(real_train), len(synth))
    real_sample = real_train.sample(n_test, random_state=random_state)
    synth_sample = synth.sample(n_test, random_state=random_state)

    X_mia = pd.concat([real_sample, synth_sample], ignore_index=True)
    y_mia = np.array([1] * n_test + [0] * n_test)

    X_tr, X_te, y_tr, y_te = train_test_split(X_mia, y_mia, test_size=0.3, random_state=random_state)
    mia_model = RandomForestClassifier(n_estimators=50, random_state=random_state, n_jobs=-1)
    mia_model.fit(X_tr, y_tr)

    results['mia_auc'] = roc_auc_score(y_te, mia_model.predict_proba(X_te)[:, 1])

    # TSTR (train on synthetic, test on real)
    X_synth = synth.drop(target, axis=1)
    y_synth = synth[target]
    X_test = real_test.drop(target, axis=1)
    y_test = real_test[target]

    model = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
    model.fit(X_synth, y_synth)
    results['tstr_auc'] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    # TRTR (train on real, test on real) is the reference for the ratio
    model_real = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
    model_real.fit(real_train.drop(target, axis=1), real_train[target])
    trtr = roc_auc_score(y_test, model_real.predict_proba(X_test)[:, 1])
    results['tstr_ratio'] = results['tstr_auc'] / trtr

    return results

print("Metrics defined.")

## Test Different Epsilon Values

In [None]:
# Load Adult Census
print("Loading Adult Census...")
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']
# skipinitialspace=True drops the blank after each comma, so a missing field is
# expected to arrive as '?' rather than ' ?'. Both spellings are listed so the
# marker is treated as NaN (and removed by dropna below) either way, instead of
# being label-encoded as an ordinary category.
df_raw = pd.read_csv(url, names=columns, na_values=['?', ' ?'], skipinitialspace=True)
# Drop incomplete rows, then work on a fixed 5000-row sample
df_raw = df_raw.dropna().reset_index(drop=True).sample(5000, random_state=SEED)
# Binary label: 1 for '>50K', 0 otherwise
df_raw['income'] = (df_raw['income'] == '>50K').astype(int)

# Integer-encode the string-valued columns
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']:
    df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

train_df, test_df = train_test_split(df_raw, test_size=0.2, random_state=SEED)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

In [None]:
# Sweep the privacy budget; the final entry (inf) is the no-privacy baseline
epsilon_values = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, float('inf')]  # inf = no privacy
results = []

print("\nTesting epsilon values (privacy budget)...\n")

for eps in epsilon_values:
    is_unbounded = eps == float('inf')
    eps_label = "∞" if is_unbounded else f"{eps:.1f}"
    print(f"Epsilon: {eps_label}")

    # The baseline is run with a large finite budget in place of infinity
    effective_eps = 1000 if is_unbounded else eps
    synth = DPMISATASynthesizer(
        target_col='income',
        epsilon=effective_eps,
        n_bins=30,
        subsample_ratio=0.8,
        random_state=SEED,
    )
    synth.fit(train_df)
    df_synth = synth.sample(len(train_df))

    # Score the synthetic table and tag the row with the budget it came from
    metrics = compute_metrics(train_df, test_df, df_synth)
    metrics.update(epsilon=eps, epsilon_label=eps_label)
    results.append(metrics)

    print(f"  Corr: {metrics['corr_similarity']:.4f}, MIA: {metrics['mia_auc']:.4f}, TSTR: {metrics['tstr_ratio']:.2%}")

results_df = pd.DataFrame(results)

In [None]:
print("\n" + "="*80)
print("DP-MISATA RESULTS")
print("="*80)

display_cols = ['epsilon_label', 'corr_similarity', 'marginal_fidelity', 'mia_auc', 'tstr_ratio']
print(results_df[display_cols].to_string(index=False))

# Find optimal epsilon: among finite budgets that keep TSTR ratio above 0.85,
# pick the one with the lowest MIA AUC
viable = results_df[(results_df['tstr_ratio'] > 0.85) & (results_df['epsilon'] < float('inf'))]
if len(viable) > 0:
    best = viable.loc[viable['mia_auc'].idxmin()]
    print("\n✓ Optimal Configuration:")
    print(f"  Epsilon: {best['epsilon_label']}")
    print(f"  MIA AUC: {best['mia_auc']:.4f}")
    print(f"  TSTR Ratio: {best['tstr_ratio']:.2%}")
    print(f"  Correlation: {best['corr_similarity']:.4f}")
else:
    # Say so explicitly rather than printing nothing
    print("\n✗ No finite-epsilon configuration reached a TSTR ratio above 85%.")

In [None]:
# Visualization: three side-by-side panels summarising the epsilon sweep
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes

# Keep only the finite-epsilon rows for plotting
plot_df = results_df[results_df['epsilon'] < float('inf')].copy()
eps_axis = plot_df['epsilon']
line_kw = dict(linewidth=2, markersize=8)

# Panel 1: privacy-utility tradeoff, with dashed reference lines at 0.5 and 0.85
ax1.semilogx(eps_axis, plot_df['mia_auc'], 'b-o', label='MIA AUC (↓better)', **line_kw)
ax1.semilogx(eps_axis, plot_df['tstr_ratio'], 'g-s', label='TSTR Ratio', **line_kw)
for level, colour in ((0.5, 'blue'), (0.85, 'green')):
    ax1.axhline(y=level, color=colour, linestyle='--', alpha=0.5)
ax1.set_xlabel('Epsilon (Privacy Budget)', fontsize=12)
ax1.set_ylabel('Score', fontsize=12)
ax1.set_title('Privacy-Utility Tradeoff\n(Lower ε = More Private)', fontsize=14, fontweight='bold')

# Panel 2: correlation and marginal fidelity against epsilon
ax2.semilogx(eps_axis, plot_df['corr_similarity'], 'r-o', label='Correlation', **line_kw)
ax2.semilogx(eps_axis, plot_df['marginal_fidelity'], 'purple', marker='s', label='Marginal', **line_kw)
ax2.set_xlabel('Epsilon (Privacy Budget)', fontsize=12)
ax2.set_ylabel('Fidelity', fontsize=12)
ax2.set_title('Fidelity vs Privacy', fontsize=14, fontweight='bold')

# Legend and grid are identical on both line panels
for line_ax in (ax1, ax2):
    line_ax.legend(fontsize=10)
    line_ax.grid(True, alpha=0.3)

# Panel 3: MIA AUC mapped to a 0-1 score (AUC 0.5 -> 1, AUC 1.0 -> 0)
privacy_score = 1 - (plot_df['mia_auc'] - 0.5) * 2  # 1 = perfect privacy, 0 = no privacy
privacy_score = np.clip(privacy_score, 0, 1)
ax3.bar(plot_df['epsilon_label'], privacy_score, color='teal', alpha=0.8)
ax3.set_xlabel('Epsilon', fontsize=12)
ax3.set_ylabel('Privacy Score (higher=better)', fontsize=12)
ax3.set_title('Privacy Protection Level', fontsize=14, fontweight='bold')
ax3.set_ylim(0, 1)

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('dp_misata_results.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved dp_misata_results.png")

In [None]:
# Persist the sweep table, then print the closing summary
results_df.to_csv('dp_misata_results.csv', index=False)

banner = "=" * 80
print("\n" + banner)
print("EXPERIMENT 18 COMPLETE")
print(banner)

# NOTE(review): the findings below are fixed text, not derived from results_df
summary_lines = (
    "\nKey Findings:",
    "  - Lower epsilon = better privacy (lower MIA AUC)",
    "  - But also lower utility (TSTR ratio)",
    "  - Sweet spot likely around ε = 1.0 - 2.0",
    "\nFiles saved:",
    "  - dp_misata_results.png",
    "  - dp_misata_results.csv",
)
for line in summary_lines:
    print(line)