In [1]:
from google.colab import drive
drive.mount('/content/drive')
# then, for convenience
%cd /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/

Mounted at /content/drive
/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main


In [1]:
!nvidia-smi

Mon Dec  8 05:46:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             43W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# ALM / PSyGen settings
alm:
  P_min: 0.7          # minimum acceptable privacy score
  lambda_init: 0.0
  mu_init: 0.1
  lambda_lr: 1e-3     # step size for λ
  mu_growth: 1.5      # µ ← mu_growth * µ when violations persist
  alm_weight: 1.0     # how strongly to weight ALM term vs adv loss

  # quality metric weights (sum to 1)
  w_quality:
    D: 0.3   # distribution similarity
    A: 0.1   # anomaly / rare events
    C: 0.3   # inter-feature associations
    V: 0.2   # diversity / coverage
    T: 0.1   # temporal consistency (0 if not temporal data)

  # privacy metric weights (sum to 1)
  w_privacy:
    DCR: 0.4    # distance to closest real
    Qdelta: 0.3 # quantile difference
    I: 0.3      # duplicate score


SyntaxError: invalid syntax (ipython-input-4072824666.py, line 2)

In [2]:
# PSyGen-style ALM Tabular GAN demo for Google Colab
# ---------------------------------------------------
# This script:
# 1) Loads a numeric CSV dataset (or generates a toy one if not found)
# 2) Trains a simple WGAN-GP GAN on the table
# 3) Wraps the generator loss with the Augmented Lagrangian Method (ALM)
#    using PSyGen-style quality Q and privacy P metrics.
#
# Paste this whole cell into a Colab notebook and run it.
# Edit DATA_PATH to point to your real CSV.

!pip install -q pandas scikit-learn tqdm

import os
import math
import random
from dataclasses import dataclass

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

# ------------------
# Config
# ------------------

DATA_PATH = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv"  # change to your CSV path
BATCH_SIZE = 256
EPOCHS = 50
LATENT_DIM = 64
LR_G = 1e-4
LR_D = 1e-4
N_CRITIC = 5        # WGAN-GP critic steps per G step
LAMBDA_GP = 10.0    # gradient penalty weight

# ALM config
P_MIN = 0.7         # target minimum privacy-preservability
ALM_WEIGHT = 1.0    # how strongly to weight ALM term vs adversarial
LAMBDA_INIT = 0.0
MU_INIT = 0.1
LAMBDA_LR = 1e-3
MU_GROWTH = 1.5

# Quality metric weights (sum ≈ 1)
WQ = dict(D=0.3, A=0.1, C=0.3, V=0.3, T=0.0)

# Privacy metric weights (sum ≈ 1)
WP = dict(DCR=0.4, Qdelta=0.3, I=0.3)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# ------------------
# Data
# ------------------

class TabularDataset(Dataset):
    def __init__(self, x):
        self.x = x.astype(np.float32)

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return self.x[idx]


def load_or_make_data():
    """
    Load a numeric CSV from DATA_PATH, or create a toy dataset if not found.
    Assumes all columns are numeric; encode categoricals beforehand if needed.
    """
    if not os.path.exists(DATA_PATH):
        print(f"{DATA_PATH} not found, generating a toy dataset instead.")
        from sklearn.datasets import make_classification
        X, y = make_classification(
            n_samples=5000,
            n_features=8,
            n_informative=5,
            n_redundant=0,
            n_clusters_per_class=1,
            random_state=42,
        )
        df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
        df["label"] = y
        df.to_csv(DATA_PATH, index=False)
    else:
        print(f"Loading data from {DATA_PATH}")

    df = pd.read_csv(DATA_PATH)
    print("Columns:", df.columns.tolist())
    # Enforce numeric
    for c in df.columns:
        if not pd.api.types.is_numeric_dtype(df[c]):
            raise ValueError(
                f"Column {c} is not numeric. "
                "Please encode categorical columns before using this script."
            )

    scaler = StandardScaler()
    X = scaler.fit_transform(df.values.astype(np.float32))

    return X, df.columns.tolist(), scaler


X_real, column_names, scaler = load_or_make_data()
dim = X_real.shape[1]
print("Data shape:", X_real.shape)

dataset = TabularDataset(X_real)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# ------------------
# Models
# ------------------

class Generator(nn.Module):
    def __init__(self, latent_dim, data_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(True),
            nn.Linear(256, 256),
            nn.ReLU(True),
            nn.Linear(256, data_dim),
        )

    def forward(self, z):
        return self.net(z)


class Discriminator(nn.Module):
    def __init__(self, data_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(data_dim, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        return self.net(x).view(-1)

G = Generator(LATENT_DIM, dim).to(DEVICE)
D = Discriminator(dim).to(DEVICE)

opt_G = torch.optim.Adam(G.parameters(), lr=LR_G, betas=(0.5, 0.9))
opt_D = torch.optim.Adam(D.parameters(), lr=LR_D, betas=(0.5, 0.9))

# ------------------
# WGAN-GP helpers
# ------------------

def gradient_penalty(D, real, fake):
    batch_size = real.size(0)
    alpha = torch.rand(batch_size, 1, device=real.device)
    alpha = alpha.expand_as(real)

    interpolated = alpha * real + (1 - alpha) * fake
    interpolated.requires_grad_(True)

    d_interpolated = D(interpolated)

    gradients = torch.autograd.grad(
        outputs=d_interpolated,
        inputs=interpolated,
        grad_outputs=torch.ones_like(d_interpolated),
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )[0]

    gradients = gradients.view(batch_size, -1)
    grad_norm = gradients.norm(2, dim=1)
    gp = ((grad_norm - 1) ** 2).mean()
    return gp


def sample_noise(batch_size, latent_dim, device):
    return torch.randn(batch_size, latent_dim, device=device)

# ------------------
# PSyGen-style Q and P metrics
# ------------------

class QualityPrivacyMetrics:
    """
    Differentiable-ish proxies for PSyGen quality Q and privacy-preservability P.
    Operate on scaled numeric data (real_batch, fake_batch) of shape [B, dim].
    """

    def __init__(self, w_quality, w_privacy, device="cuda"):
        self.wq = w_quality
        self.wp = w_privacy
        self.device = device

    def _distribution_similarity(self, real, fake):
        """
        D: use RBF-kernel MMD and map to [0,1] via exp(-mmd2).
        """
        def rbf(x, y, gamma=1.0):
            x = x.view(x.size(0), -1)
            y = y.view(y.size(0), -1)
            xx = torch.cdist(x, x, p=2)
            yy = torch.cdist(y, y, p=2)
            xy = torch.cdist(x, y, p=2)
            kxx = torch.exp(-gamma * xx ** 2).mean()
            kyy = torch.exp(-gamma * yy ** 2).mean()
            kxy = torch.exp(-gamma * xy ** 2).mean()
            return kxx + kyy - 2 * kxy

        mmd2 = rbf(real, fake)
        return torch.exp(-mmd2).clamp(0.0, 1.0)

    def _anomaly_preservation(self, real, fake):
        """
        A: based on IQR anomalies.
        Not perfectly differentiable but okay as an auxiliary signal.
        """
        q1_real = torch.quantile(real, 0.25, dim=0)
        q3_real = torch.quantile(real, 0.75, dim=0)
        iqr_real = q3_real - q1_real + 1e-6
        hi = q3_real + 1.5 * iqr_real
        lo = q1_real - 1.5 * iqr_real

        rare_real = ((real > hi) | (real < lo)).float().mean()
        rare_fake = ((fake > hi) | (fake < lo)).float().mean()
        return (1.0 - torch.abs(rare_real - rare_fake)).clamp(0.0, 1.0)

    def _association_preservation(self, real, fake):
        """
        C: compare correlation matrices.
        """
        def corr(x):
            x = (x - x.mean(0, keepdim=True)) / (x.std(0, keepdim=True) + 1e-6)
            return (x.T @ x) / (x.size(0) - 1)

        c_real = corr(real)
        c_fake = corr(fake)
        diff = torch.mean(torch.abs(c_real - c_fake))
        return torch.exp(-diff).clamp(0.0, 1.0)

    def _diversity(self, fake):
        """
        V: encourage diversity via 1 - avg cosine similarity.
        """
        x = fake.view(fake.size(0), -1)
        x = x - x.mean(0, keepdim=True)
        x = x / (x.norm(p=2, dim=1, keepdim=True) + 1e-6)
        sim = torch.mm(x, x.T)
        mask = ~torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
        avg_sim = sim[mask].mean()
        return (1.0 - avg_sim).clamp(0.0, 1.0)

    def _temporal_consistency(self, real, fake):
        """
        T: placeholder for temporal metrics (DTW etc.).
        For static tables we set to 0.
        """
        return torch.tensor(0.0, device=self.device)

    # Privacy helpers
    def _distance_to_closest_record(self, real, fake):
        """
        DCR: mean min distance between fake and real.
        Larger distances = better privacy.
        Normalize via tanh.
        """
        real = real.view(real.size(0), -1)
        fake = fake.view(fake.size(0), -1)
        dists = torch.cdist(fake, real, p=2)
        min_dist, _ = dists.min(dim=1)
        dcr_raw = min_dist.mean()
        dcr_norm = torch.tanh(dcr_raw)  # in (0,1)
        return dcr_norm

    def _quantile_diff(self, real, fake):
        """
        Qδ: mean absolute difference between quantiles.
        Smaller is better for privacy (less overfitting).
        We'll return a normalized version in [0,1].
        """
        qs = torch.tensor([0.1, 0.25, 0.5, 0.75, 0.9], device=real.device)
        real_q = torch.quantile(real, qs, dim=0)
        fake_q = torch.quantile(fake, qs, dim=0)
        diff = torch.abs(real_q - fake_q).mean()
        qdelta_raw = diff
        qdelta_norm = qdelta_raw / (qdelta_raw + 1.0)  # in (0,1)
        return qdelta_norm

    def _duplicate_score(self, real, fake):
        """
        I: duplicates within fake and between fake and real.
        High duplicate rate = bad for privacy.
        We'll approximate via very small distances.
        """
        x = torch.cat([real, fake], dim=0)
        x = x.view(x.size(0), -1)
        dists = torch.cdist(x, x, p=2)
        dup_mask = (dists < 1e-3).float()
        dup_mask = dup_mask - torch.eye(dists.size(0), device=dists.device)
        dup_rate = dup_mask.clamp(min=0).mean()  # in [0,1] roughly
        i_norm = dup_rate
        return i_norm

    def __call__(self, real_batch, fake_batch):
        real = real_batch.float()
        fake = fake_batch.float()

        D = self._distribution_similarity(real, fake)
        A = self._anomaly_preservation(real, fake)
        C = self._association_preservation(real, fake)
        V = self._diversity(fake)
        T = self._temporal_consistency(real, fake)

        Q = (
            self.wq["D"] * D +
            self.wq["A"] * A +
            self.wq["C"] * C +
            self.wq["V"] * V +
            self.wq["T"] * T
        )

        DCR = self._distance_to_closest_record(real, fake)
        Qdelta = self._quantile_diff(real, fake)
        I = self._duplicate_score(real, fake)

        # P = w1*normalize(DCR) - w2*normalize(Qδ) - w3*normalize(I)
        P = (
            self.wp["DCR"] * DCR -
            self.wp["Qdelta"] * Qdelta -
            self.wp["I"] * I
        )

        return Q, P


class ALMController:
    """
    Implements the ALM update:
    L_AL(θ, λ) = (1 − Q(θ)) + λ(P_min − P(θ)) + µ/2 (P_min − P(θ))^2
    and two-time-scale updates of λ, µ.
    """

    def __init__(self, P_min, lambda_init, mu_init, lambda_lr, mu_growth, device="cuda"):
        self.P_min = torch.tensor(P_min, device=device)
        self.lambda_lr = lambda_lr
        self.mu_growth = mu_growth
        self.device = device

        self.lmbda = torch.tensor(lambda_init, device=device)
        self.mu = torch.tensor(mu_init, device=device)

        self.residual_ma = None

    def compute_loss(self, Q, P):
        """
        Q, P are scalar tensors.
        """
        residual = self.P_min - P
        violation = torch.relu(residual)  # only if P < P_min
        L = (1.0 - Q) + self.lmbda * violation + 0.5 * self.mu * violation ** 2
        return L, residual.detach()

    def update_multipliers(self, residual_epoch):
        """
        residual_epoch: scalar tensor or float, mean over the epoch.
        """
        r = residual_epoch if torch.is_tensor(residual_epoch) else torch.tensor(residual_epoch, device=self.device)

        if self.residual_ma is None:
            self.residual_ma = r
        else:
            self.residual_ma = 0.9 * self.residual_ma + 0.1 * r

        # λ_{k+1} = max(0, λ_k + η * residual)
        self.lmbda = (self.lmbda + self.lambda_lr * self.residual_ma).clamp(min=0.0)

        # If privacy violations persist, increase µ
        if self.residual_ma > 0:
            self.mu = self.mu * self.mu_growth


qp_metrics = QualityPrivacyMetrics(WQ, WP, device=DEVICE)
alm = ALMController(
    P_min=P_MIN,
    lambda_init=LAMBDA_INIT,
    mu_init=MU_INIT,
    lambda_lr=LAMBDA_LR,
    mu_growth=MU_GROWTH,
    device=DEVICE,
)

# ------------------
# Training loop
# ------------------

G.train()
D.train()

global_step = 0

for epoch in range(1, EPOCHS + 1):
    d_losses = []
    g_losses = []
    q_scores = []
    p_scores = []
    residuals = []

    pbar = tqdm(dataloader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False)
    for real_batch in pbar:
        real_batch = real_batch.to(DEVICE)

        # ------------------
        # Train Discriminator / Critic
        # ------------------
        for _ in range(N_CRITIC):
            z = sample_noise(real_batch.size(0), LATENT_DIM, DEVICE)
            fake_batch = G(z).detach()

            d_real = D(real_batch)
            d_fake = D(fake_batch)

            gp = gradient_penalty(D, real_batch, fake_batch)
            d_loss = -(d_real.mean() - d_fake.mean()) + LAMBDA_GP * gp

            opt_D.zero_grad()
            d_loss.backward()
            opt_D.step()

        # ------------------
        # Train Generator with ALM
        # ------------------
        z = sample_noise(real_batch.size(0), LATENT_DIM, DEVICE)
        fake_batch = G(z)

        adv_loss = -D(fake_batch).mean()  # WGAN generator loss

        Q, P = qp_metrics(real_batch, fake_batch)
        L_AL, residual = alm.compute_loss(Q, P)

        g_loss = adv_loss + ALM_WEIGHT * L_AL

        opt_G.zero_grad()
        g_loss.backward()
        opt_G.step()

        d_losses.append(d_loss.item())
        g_losses.append(g_loss.item())
        q_scores.append(Q.item())
        p_scores.append(P.item())
        residuals.append(residual.item())

        global_step += 1
        pbar.set_postfix(
            d_loss=f"{np.mean(d_losses):.3f}",
            g_loss=f"{np.mean(g_losses):.3f}",
            Q=f"{np.mean(q_scores):.3f}",
            P=f"{np.mean(p_scores):.3f}",
        )

    mean_residual = float(np.mean(residuals))
    alm.update_multipliers(mean_residual)

    print(
        f"Epoch {epoch:03d} | "
        f"D_loss={np.mean(d_losses):.3f} | "
        f"G_loss={np.mean(g_losses):.3f} | "
        f"Q={np.mean(q_scores):.3f} | "
        f"P={np.mean(p_scores):.3f} | "
        f"residual={mean_residual:.3f} | "
        f"lambda={alm.lmbda.item():.3f} | "
        f"mu={alm.mu.item():.3f}"
    )

print("Training finished.")

# ------------------
# Sample synthetic data
# ------------------

G.eval()
with torch.no_grad():
    N_SYN = 1000
    z = sample_noise(N_SYN, LATENT_DIM, DEVICE)
    fake = G(z).cpu().numpy()

# Inverse scaling back to original numeric space
fake_unscaled = scaler.inverse_transform(fake)
synthetic_df = pd.DataFrame(fake_unscaled, columns=column_names)

print("Synthetic sample head:")
print(synthetic_df.head())

# Save to CSV
out_path = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/synthetic_data_alm.csv"
synthetic_df.to_csv(out_path, index=False)
print(f"Synthetic data saved to {out_path}")


Using device: cuda
Loading data from /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv
Columns: ['ID', 'CT', 'UCSi', 'UCSh', 'Madh', 'SECS', 'BN', 'BC', 'NN', 'Mi', 'Class']
Data shape: (683, 11)


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 001 | D_loss=6.970 | G_loss=0.304 | Q=0.704 | P=0.243 | residual=0.457 | lambda=0.000 | mu=0.150




Epoch 002 | D_loss=4.991 | G_loss=0.288 | Q=0.703 | P=0.246 | residual=0.454 | lambda=0.001 | mu=0.225




Epoch 003 | D_loss=3.205 | G_loss=0.252 | Q=0.705 | P=0.245 | residual=0.455 | lambda=0.001 | mu=0.338




Epoch 004 | D_loss=1.770 | G_loss=0.194 | Q=0.706 | P=0.243 | residual=0.457 | lambda=0.002 | mu=0.506




Epoch 005 | D_loss=0.820 | G_loss=0.112 | Q=0.710 | P=0.249 | residual=0.451 | lambda=0.002 | mu=0.759




Epoch 006 | D_loss=0.294 | G_loss=0.031 | Q=0.711 | P=0.248 | residual=0.452 | lambda=0.003 | mu=1.139




Epoch 007 | D_loss=0.110 | G_loss=-0.037 | Q=0.705 | P=0.251 | residual=0.449 | lambda=0.003 | mu=1.709




Epoch 008 | D_loss=0.060 | G_loss=-0.088 | Q=0.710 | P=0.249 | residual=0.451 | lambda=0.004 | mu=2.563




Epoch 009 | D_loss=-0.035 | G_loss=-0.060 | Q=0.706 | P=0.246 | residual=0.454 | lambda=0.004 | mu=3.844




Epoch 010 | D_loss=-0.130 | G_loss=0.002 | Q=0.708 | P=0.249 | residual=0.451 | lambda=0.005 | mu=5.767




Epoch 011 | D_loss=-0.217 | G_loss=0.133 | Q=0.704 | P=0.254 | residual=0.446 | lambda=0.005 | mu=8.650




Epoch 012 | D_loss=-0.402 | G_loss=0.402 | Q=0.707 | P=0.250 | residual=0.450 | lambda=0.005 | mu=12.975




Epoch 013 | D_loss=-0.508 | G_loss=0.787 | Q=0.709 | P=0.254 | residual=0.446 | lambda=0.006 | mu=19.462




Epoch 014 | D_loss=-0.721 | G_loss=1.453 | Q=0.708 | P=0.250 | residual=0.450 | lambda=0.006 | mu=29.193




Epoch 015 | D_loss=-0.809 | G_loss=2.375 | Q=0.710 | P=0.256 | residual=0.444 | lambda=0.007 | mu=43.789




Epoch 016 | D_loss=-1.120 | G_loss=3.920 | Q=0.711 | P=0.252 | residual=0.448 | lambda=0.007 | mu=65.684




Epoch 017 | D_loss=-1.320 | G_loss=6.122 | Q=0.710 | P=0.253 | residual=0.447 | lambda=0.008 | mu=98.526




Epoch 018 | D_loss=-1.431 | G_loss=9.373 | Q=0.712 | P=0.256 | residual=0.444 | lambda=0.008 | mu=147.789




Epoch 019 | D_loss=-1.682 | G_loss=14.470 | Q=0.714 | P=0.253 | residual=0.447 | lambda=0.009 | mu=221.684




Epoch 020 | D_loss=-1.865 | G_loss=21.952 | Q=0.710 | P=0.254 | residual=0.446 | lambda=0.009 | mu=332.526




Epoch 021 | D_loss=-1.975 | G_loss=33.002 | Q=0.711 | P=0.254 | residual=0.446 | lambda=0.010 | mu=498.789




Epoch 022 | D_loss=-2.151 | G_loss=49.084 | Q=0.719 | P=0.257 | residual=0.443 | lambda=0.010 | mu=748.183




Epoch 023 | D_loss=-2.407 | G_loss=73.784 | Q=0.717 | P=0.257 | residual=0.443 | lambda=0.010 | mu=1122.274




Epoch 024 | D_loss=-2.550 | G_loss=111.425 | Q=0.714 | P=0.255 | residual=0.445 | lambda=0.011 | mu=1683.411




Epoch 025 | D_loss=-2.734 | G_loss=166.396 | Q=0.718 | P=0.256 | residual=0.444 | lambda=0.011 | mu=2525.117




Epoch 026 | D_loss=-2.831 | G_loss=248.782 | Q=0.716 | P=0.257 | residual=0.443 | lambda=0.012 | mu=3787.675




Epoch 027 | D_loss=-2.895 | G_loss=372.460 | Q=0.721 | P=0.257 | residual=0.443 | lambda=0.012 | mu=5681.513




Epoch 028 | D_loss=-2.966 | G_loss=558.450 | Q=0.721 | P=0.257 | residual=0.443 | lambda=0.013 | mu=8522.270




Epoch 029 | D_loss=-2.940 | G_loss=829.983 | Q=0.717 | P=0.259 | residual=0.441 | lambda=0.013 | mu=12783.404




Epoch 030 | D_loss=-2.989 | G_loss=1250.530 | Q=0.724 | P=0.258 | residual=0.442 | lambda=0.014 | mu=19175.105




Epoch 031 | D_loss=-2.906 | G_loss=1834.045 | Q=0.725 | P=0.263 | residual=0.437 | lambda=0.014 | mu=28762.658




Epoch 032 | D_loss=-3.020 | G_loss=2779.438 | Q=0.719 | P=0.260 | residual=0.440 | lambda=0.014 | mu=43143.988




Epoch 033 | D_loss=-3.031 | G_loss=4172.299 | Q=0.724 | P=0.260 | residual=0.440 | lambda=0.015 | mu=64715.984




Epoch 034 | D_loss=-3.020 | G_loss=6166.721 | Q=0.725 | P=0.263 | residual=0.437 | lambda=0.015 | mu=97073.977




Epoch 035 | D_loss=-2.986 | G_loss=9282.730 | Q=0.717 | P=0.263 | residual=0.437 | lambda=0.016 | mu=145610.969




Epoch 036 | D_loss=-3.091 | G_loss=14029.398 | Q=0.724 | P=0.261 | residual=0.439 | lambda=0.016 | mu=218416.453




Epoch 037 | D_loss=-3.046 | G_loss=21020.895 | Q=0.722 | P=0.261 | residual=0.439 | lambda=0.017 | mu=327624.688




Epoch 038 | D_loss=-3.040 | G_loss=31215.221 | Q=0.721 | P=0.263 | residual=0.437 | lambda=0.017 | mu=491437.031




Epoch 039 | D_loss=-3.135 | G_loss=46469.994 | Q=0.718 | P=0.265 | residual=0.435 | lambda=0.018 | mu=737155.562




Epoch 040 | D_loss=-3.144 | G_loss=70084.250 | Q=0.726 | P=0.264 | residual=0.436 | lambda=0.018 | mu=1105733.375




Epoch 041 | D_loss=-3.233 | G_loss=105838.172 | Q=0.726 | P=0.262 | residual=0.438 | lambda=0.018 | mu=1658600.000




Epoch 042 | D_loss=-3.250 | G_loss=158456.945 | Q=0.729 | P=0.263 | residual=0.437 | lambda=0.019 | mu=2487900.000




Epoch 043 | D_loss=-3.203 | G_loss=236569.766 | Q=0.725 | P=0.264 | residual=0.436 | lambda=0.019 | mu=3731850.000




Epoch 044 | D_loss=-3.235 | G_loss=353253.656 | Q=0.729 | P=0.265 | residual=0.435 | lambda=0.020 | mu=5597775.000




Epoch 045 | D_loss=-3.196 | G_loss=528894.641 | Q=0.730 | P=0.265 | residual=0.435 | lambda=0.020 | mu=8396662.000




Epoch 046 | D_loss=-3.230 | G_loss=794396.375 | Q=0.723 | P=0.265 | residual=0.435 | lambda=0.021 | mu=12594993.000




Epoch 047 | D_loss=-3.267 | G_loss=1189262.562 | Q=0.731 | P=0.265 | residual=0.435 | lambda=0.021 | mu=18892490.000




Epoch 048 | D_loss=-3.241 | G_loss=1777344.062 | Q=0.727 | P=0.266 | residual=0.434 | lambda=0.021 | mu=28338736.000




Epoch 049 | D_loss=-3.277 | G_loss=2700718.375 | Q=0.725 | P=0.263 | residual=0.437 | lambda=0.022 | mu=42508104.000




Epoch 050 | D_loss=-3.310 | G_loss=4016132.250 | Q=0.730 | P=0.265 | residual=0.435 | lambda=0.022 | mu=63762156.000
Training finished.
Synthetic sample head:
             ID        CT      UCSi      UCSh      Madh      SECS        BN  \
0  9.802956e+05  3.494788  4.893621  1.101263  2.211968  1.836290  8.120708   
1  9.846604e+05  1.747692  5.169800  1.946902  3.232622  2.012821  7.650734   
2  1.047130e+06  2.965686  5.934578  1.142742  2.804155  1.559520  9.980186   
3  1.274527e+06  2.480464  4.493620  1.830411  2.117072  1.691466  6.905674   
4  1.187850e+06  3.587873  4.591006  2.031217  1.882766  1.571781  7.535800   

         BC        NN        Mi     Class  
0  2.221941  2.962717  0.944879  0.511762  
1  4.179457  2.985354  1.013454  0.594624  
2  3.424134  3.791218  0.893769  0.545127  
3  3.024504  3.311614  0.926361  0.588189  
4  2.612856  3.146319  1.173965  0.466975  
Synthetic data saved to /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/synthetic_da

In [7]:
# -----------------------------------------------------------
# PSyGen-style Quality (Q) and Privacy (P) metrics
# for real_data.csv vs synthetic_data_alm.csv
# -----------------------------------------------------------
# Assumes:
#   - /content/real_data.csv      (used for training)
#   - /content/synthetic_data_alm.csv  (produced by the GAN cell)
#
# Metrics follow the paper's structure:
#   Quality Q = w1*D + w2*A + w3*C + w4*V + w5*T
#   Privacy P = w1*normalize(DCR) - w2*normalize(Qδ) - w3*normalize(I)
#
# This implementation is for NUMERIC-ONLY data.
# -----------------------------------------------------------

!pip install -q scipy

import numpy as np
import pandas as pd

from scipy.stats import ks_2samp, entropy
from scipy.spatial.distance import jensenshannon

# ------------------
# Config: weights
# ------------------

WQ = dict(D=0.3, A=0.1, C=0.3, V=0.3, T=0.0)  # quality weights (sum to 1)
WP = dict(DCR=0.4, Qdelta=0.3, I=0.3)         # privacy weights (sum to 1)

REAL_PATH = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv"
SYN_PATH  = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast_cancer_CTABGAN_DP_QP_report.csv"

# ------------------
# Utility helpers
# ------------------

def align_real_synth(real_df, syn_df, random_state=42):
    """
    Align columns and sample same number of rows from real and synthetic.
    Assumes the same schema (same column names / order).
    """
    # Keep only common columns, in real's order
    common_cols = [c for c in real_df.columns if c in syn_df.columns]
    real_df = real_df[common_cols].copy()
    syn_df  = syn_df[common_cols].copy()

    n = min(len(real_df), len(syn_df))
    rng = np.random.default_rng(random_state)
    real_idx = rng.choice(len(real_df), n, replace=False)
    syn_idx  = rng.choice(len(syn_df), n, replace=False)

    real_df = real_df.iloc[real_idx].reset_index(drop=True)
    syn_df  = syn_df.iloc[syn_idx].reset_index(drop=True)
    return real_df, syn_df

def hist_match(real_col, syn_col, bins=20):
    """
    Make histograms for real and synthetic with same bin edges.
    Returns normalized histograms p, q (summing to 1).
    """
    real_col = np.asarray(real_col)
    syn_col  = np.asarray(syn_col)

    # Use real-data range
    r_min, r_max = np.nanmin(real_col), np.nanmax(real_col)
    if r_min == r_max:
        # Degenerate column: all values identical
        p = np.array([1.0])
        q = np.array([1.0])
        return p, q

    hist_r, bin_edges = np.histogram(real_col, bins=bins, range=(r_min, r_max), density=False)
    hist_s, _        = np.histogram(syn_col,  bins=bin_edges, density=False)

    # Add small epsilon, then normalize to probability distributions
    eps = 1e-12
    p = hist_r.astype(float) + eps
    q = hist_s.astype(float) + eps
    p /= p.sum()
    q /= q.sum()
    return p, q

def safe_entropy(p, q):
    """
    KL divergence KL(p || q) with safeguards.
    """
    return entropy(p, qk=q)  # scipy.stats.entropy

def js_divergence(p, q):
    """
    JS divergence between p and q.
    scipy.spatial.distance.jensenshannon returns sqrt(JS), so we square it.
    """
    return jensenshannon(p, q, base=2)**2

# ------------------
# Quality metrics
# ------------------

def compute_D(real_df, syn_df, bins=20):
    """
    D: data distribution similarity.
    Uses per-column KS, KL, JSD + Range Coverage (RC),
    aggregated into a [0,1] similarity per column and then averaged.
    """
    scores = []
    for col in real_df.columns:
        r = real_df[col].dropna().values
        s = syn_df[col].dropna().values

        if len(r) < 10 or len(s) < 10:
            continue

        # KS: 1 - KS statistic
        ks_stat, _ = ks_2samp(r, s)
        ks_sim = 1.0 - ks_stat  # in [0,1] if ks_stat <= 1

        # Hist-based divergences
        p, q = hist_match(r, s, bins=bins)

        # KL (two directions, average) → similarity
        kl_pq = safe_entropy(p, q)
        kl_qp = safe_entropy(q, p)
        kl_sym = 0.5 * (kl_pq + kl_qp)
        kl_sim = 1.0 / (1.0 + kl_sym)   # [0,1], larger KL → smaller similarity

        # JS divergence → similarity
        jsd = js_divergence(p, q)       # [0,1]
        js_sim = 1.0 - jsd              # [0,1]

        # Range coverage (RC): fraction of synthetic values within real min–max
        r_min, r_max = np.min(r), np.max(r)
        in_range = np.logical_and(s >= r_min, s <= r_max).mean()
        rc = float(in_range)            # already in [0,1]

        col_score = np.mean([ks_sim, kl_sim, js_sim, rc])
        scores.append(col_score)

    if not scores:
        return 0.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))

def compute_A(real_df, syn_df):
    """
    A: anomaly & rare-event pattern preservation using IQR.
    For each column:
      - define anomaly via Tukey IQR rule on real data
      - compare anomaly rates in real vs synthetic
      - score_j = 1 - |rate_real - rate_syn|
    Then average over columns.
    """
    scores = []
    n_real = len(real_df)
    n_syn  = len(syn_df)
    for col in real_df.columns:
        r = real_df[col].dropna().values
        s = syn_df[col].dropna().values

        if len(r) < 10 or len(s) < 10:
            continue

        q1, q3 = np.percentile(r, [25, 75])
        iqr = q3 - q1
        if iqr == 0:
            continue
        lo = q1 - 1.5 * iqr
        hi = q3 + 1.5 * iqr

        rare_r = np.logical_or(r < lo, r > hi).mean()
        rare_s = np.logical_or(s < lo, s > hi).mean()

        diff = abs(rare_r - rare_s)
        score = 1.0 - diff  # if rates match exactly → 1.0
        scores.append(np.clip(score, 0.0, 1.0))
    if not scores:
        return 0.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))

def compute_C(real_df, syn_df):
    """
    C: association preservation via correlation similarity.
    Compute Pearson correlation matrices for real and synthetic,
    then 1 - mean absolute difference across (i,j) pairs.
    """
    if real_df.shape[1] < 2:
        return 0.0

    # Use numpy correlation (columns = variables)
    corr_real = np.corrcoef(real_df.values, rowvar=False)
    corr_syn  = np.corrcoef(syn_df.values,  rowvar=False)

    # Ignore NaNs from constant columns
    mask = np.isfinite(corr_real) & np.isfinite(corr_syn)
    if not mask.any():
        return 0.0

    diff = np.abs(corr_real - corr_syn)[mask]
    mean_diff = diff.mean()   # max possible difference per entry is 2.0

    # Normalize: if mean_diff = 0 → 1, if mean_diff >= 2 → 0
    score = 1.0 - (mean_diff / 2.0)
    return float(np.clip(score, 0.0, 1.0))

def compute_V(real_df, syn_df, bins=10):
    """
    V: diversity / support coverage for numeric columns.
    For each column:
      - build histogram on real
      - consider only bins that are non-empty in real
      - measure fraction of those bins that are also non-empty in synthetic
    Average across columns.
    """
    scores = []
    for col in real_df.columns:
        r = real_df[col].dropna().values
        s = syn_df[col].dropna().values

        if len(r) < 10 or len(s) < 10:
            continue

        r_min, r_max = np.min(r), np.max(r)
        if r_min == r_max:
            continue

        hist_r, bin_edges = np.histogram(r, bins=bins, range=(r_min, r_max))
        hist_s, _        = np.histogram(s, bins=bin_edges)

        real_support = hist_r > 0
        if not real_support.any():
            continue

        syn_support  = hist_s > 0
        overlap = np.logical_and(real_support, syn_support).sum()
        coverage = overlap / real_support.sum()
        scores.append(float(np.clip(coverage, 0.0, 1.0)))
    if not scores:
        return 0.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))

def compute_T(real_df, syn_df):
    """
    T: temporal consistency placeholder.
    For purely static tabular data, this is set to 0.
    (For real temporal data, you would add DTW/auto-correlation-based scores.)
    """
    return 0.0

def compute_quality(real_df, syn_df, w_quality=WQ):
    """
    Compute D, A, C, V, T and aggregate into Q.
    All sub-metrics are in [0,1].
    """
    D = compute_D(real_df, syn_df)
    A = compute_A(real_df, syn_df)
    C = compute_C(real_df, syn_df)
    V = compute_V(real_df, syn_df)
    T = compute_T(real_df, syn_df)

    Q = (
        w_quality["D"] * D +
        w_quality["A"] * A +
        w_quality["C"] * C +
        w_quality["V"] * V +
        w_quality["T"] * T
    )

    return {
        "Q": float(Q),
        "D": float(D),
        "A": float(A),
        "C": float(C),
        "V": float(V),
        "T": float(T),
    }

# ------------------
# Privacy metrics
# ------------------

def compute_DCR(real_df, syn_df):
    """
    DCR: Distance to Closest Record.
    For each synthetic record, compute distance to closest real record.
    Return mean, std, and a normalized [0,1] version where larger is better.
    """
    R = real_df.values.astype(float)
    S = syn_df.values.astype(float)

    # Compute pairwise distances in batches to be safer on memory
    dists_min = []
    batch_size = 512
    for start in range(0, len(S), batch_size):
        end = min(len(S), start + batch_size)
        S_batch = S[start:end]
        # Euclidean distance to all real points
        diff = S_batch[:, None, :] - R[None, :, :]
        dist = np.linalg.norm(diff, axis=-1)
        d_min = dist.min(axis=1)
        dists_min.append(d_min)
    dists_min = np.concatenate(dists_min, axis=0)

    mean_d = float(dists_min.mean())
    std_d  = float(dists_min.std())
    # Aggregate mean+std as overall DCR magnitude
    dcr_raw = mean_d + std_d

    # Normalize with tanh to [0,1). Larger distance → closer to 1.
    dcr_norm = float(np.tanh(dcr_raw))

    return {
        "DCR_mean": mean_d,
        "DCR_std": std_d,
        "DCR_raw": dcr_raw,
        "DCR_norm": dcr_norm,
    }

def compute_Qdelta(real_df, syn_df, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """
    Qδ: Quantile Difference statistic over numeric columns.
    For each column:
      - compute quantiles in real and synthetic at positions qs
      - take mean absolute difference
    Then average over columns.
    Normalize as qdelta_norm = qdelta_raw / (1 + qdelta_raw) ∈ [0,1).
    """
    diffs = []
    qs = np.array(qs)
    for col in real_df.columns:
        r = real_df[col].dropna().values
        s = syn_df[col].dropna().values

        if len(r) < 10 or len(s) < 10:
            continue

        qr = np.quantile(r, qs)
        qs_syn = np.quantile(s, qs)
        diff = np.abs(qr - qs_syn).mean()
        diffs.append(diff)

    if not diffs:
        qdelta_raw = 0.0
    else:
        qdelta_raw = float(np.mean(diffs))

    qdelta_norm = float(qdelta_raw / (1.0 + qdelta_raw))  # (0,1)
    return {"Qdelta_raw": qdelta_raw, "Qdelta_norm": qdelta_norm}

def compute_duplicates(real_df, syn_df, round_decimals=3):
    """
    I: Duplicate analysis.
    - Within synthetic: fraction of duplicate rows
    - Between synthetic and real: fraction of synthetic rows that exactly
      match a real row.
    All comparisons are done after rounding to 'round_decimals' to make
    exact matching meaningful in floating point.
    """
    def df_to_tuple(df):
        arr = np.round(df.values.astype(float), round_decimals)
        return [tuple(row) for row in arr]

    syn_tuples  = df_to_tuple(syn_df)
    real_tuples = df_to_tuple(real_df)

    n_syn = len(syn_tuples)

    # Within synthetic duplicates
    from collections import Counter
    counts = Counter(syn_tuples)
    dup_within = sum(c for c in counts.values() if c > 1)
    dup_within_rate = dup_within / n_syn

    # Cross duplicates
    real_set = set(real_tuples)
    dup_cross = sum(1 for t in syn_tuples if t in real_set)
    dup_cross_rate = dup_cross / n_syn

    # Combine (simple average). This is already in [0,1].
    I_raw = float((dup_within_rate + dup_cross_rate) / 2.0)
    I_norm = float(np.clip(I_raw, 0.0, 1.0))

    return {
        "dup_within_rate": float(dup_within_rate),
        "dup_cross_rate": float(dup_cross_rate),
        "I_raw": I_raw,
        "I_norm": I_norm,
    }

def compute_privacy(real_df, syn_df, w_privacy=WP):
    """
    Compute DCR, Qδ, I and aggregate into P:
        P = w1*normalize(DCR) - w2*normalize(Qδ) - w3*normalize(I)
    All normalized metrics are in [0,1].
    """
    dcr = compute_DCR(real_df, syn_df)
    qd  = compute_Qdelta(real_df, syn_df)
    dup = compute_duplicates(real_df, syn_df)

    DCR_norm = dcr["DCR_norm"]
    Qdelta_norm = qd["Qdelta_norm"]
    I_norm = dup["I_norm"]

    P = (
        w_privacy["DCR"]   * DCR_norm -
        w_privacy["Qdelta"] * Qdelta_norm -
        w_privacy["I"]      * I_norm
    )

    out = {}
    out.update(dcr)
    out.update(qd)
    out.update(dup)
    out["P"] = float(P)
    return out

# ------------------
# Main evaluation
# ------------------

real_df = pd.read_csv(REAL_PATH)
syn_df  = pd.read_csv(SYN_PATH)

# Numeric-only assumption (same as training cell)
for c in real_df.columns:
    if not pd.api.types.is_numeric_dtype(real_df[c]):
        raise ValueError(f"Non-numeric column in real data: {c}")
for c in syn_df.columns:
    if not pd.api.types.is_numeric_dtype(syn_df[c]):
        raise ValueError(f"Non-numeric column in synthetic data: {c}")

real_df, syn_df = align_real_synth(real_df, syn_df)

quality_metrics = compute_quality(real_df, syn_df, WQ)
privacy_metrics = compute_privacy(real_df, syn_df, WP)

print("=== QUALITY METRICS (Q) ===")
for k, v in quality_metrics.items():
    print(f"{k:>3}: {v:.4f}")

print("\n=== PRIVACY METRICS (P) ===")
for k, v in privacy_metrics.items():
    if isinstance(v, float):
        print(f"{k:>15}: {v:.6f}")
    else:
        print(f"{k:>15}: {v}")


ValueError: Non-numeric column in synthetic data: dataset

In [None]:
# ============================================
# Tabular synthetic data metrics (Quality + Privacy)
# ============================================
# Metrics:
#   Q_overall
#   Q_num(1-KS)
#   Q_cat(1-TVD)
#   Q_range
#   Q_anomaly
#   Q_corr
#   Q_diversity
#   P_overall
#   P_DCR
#   P_Qdelta
#   P_noDup
#   n_real, n_syn, num_cols_used, cat_cols_used
#
# Works for numeric + categorical columns.
# Categorical detection:
#   - auto: dtype == object/category
#   - or you can pass cat_cols=['col1', 'col2', ...]
# ============================================

!pip install -q scipy

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from collections import Counter


# -------------------------
# Helpers: types & alignment
# -------------------------

def _detect_column_types(df, cat_cols=None):
    cols = df.columns.tolist()
    if cat_cols is None:
        # auto-detect categoricals
        cat_cols = [c for c in cols
                    if df[c].dtype == "object" or str(df[c].dtype).startswith("category")]
    else:
        cat_cols = [c for c in cat_cols if c in cols]

    num_cols = [c for c in cols if c not in cat_cols]
    return num_cols, cat_cols


def _align_real_synth(real_df, syn_df):
    # keep common columns, same order as real
    common = [c for c in real_df.columns if c in syn_df.columns]
    real_df = real_df[common].copy()
    syn_df  = syn_df[common].copy()
    # same number of rows (truncate to min)
    n = min(len(real_df), len(syn_df))
    real_df = real_df.iloc[:n].reset_index(drop=True)
    syn_df  = syn_df.iloc[:n].reset_index(drop=True)
    return real_df, syn_df


# -------------------------
# Quality metrics
# -------------------------

def _q_num_1_minus_ks(real_df, syn_df, num_cols):
    """Numeric quality: mean(1 - KS-statistic) across numeric columns."""
    if not num_cols:
        return 1.0, 0

    scores = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 2 or len(s) < 2:
            continue
        stat, _ = ks_2samp(r, s)
        scores.append(1.0 - stat)

    if not scores:
        return 1.0, 0
    return float(np.clip(np.mean(scores), 0.0, 1.0)), len(scores)


def _q_cat_1_minus_tvd(real_df, syn_df, cat_cols):
    """Categorical quality: mean(1 - TVD) over categorical columns."""
    if not cat_cols:
        return 1.0, 0

    scores = []
    for c in cat_cols:
        r = real_df[c].astype("object").fillna("nan").values
        s = syn_df[c].astype("object").fillna("nan").values
        if len(r) == 0 or len(s) == 0:
            continue

        vals = sorted(set(r) | set(s))
        cr = Counter(r)
        cs = Counter(s)
        pr = np.array([cr[v] for v in vals], dtype=float)
        ps = np.array([cs[v] for v in vals], dtype=float)
        pr /= pr.sum()
        ps /= ps.sum()
        tvd = 0.5 * np.abs(pr - ps).sum()
        scores.append(1.0 - tvd)

    if not scores:
        return 1.0, 0
    return float(np.clip(np.mean(scores), 0.0, 1.0)), len(scores)


def _q_range(real_df, syn_df, num_cols):
    """Q_range: fraction of synthetic values inside real min–max, averaged across numeric columns."""
    if not num_cols:
        return 1.0

    covs = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) == 0 or len(s) == 0:
            continue
        lo, hi = np.min(r), np.max(r)
        if hi == lo:
            continue
        in_range = np.logical_and(s >= lo, s <= hi).mean()
        covs.append(in_range)

    if not covs:
        return 1.0
    return float(np.clip(np.mean(covs), 0.0, 1.0))


def _q_anomaly(real_df, syn_df, num_cols):
    """
    Q_anomaly: anomaly-rate preservation using IQR rule.
    For each numeric column:
      - define anomalies on real via Tukey rule
      - compare anomaly rates (real vs synthetic)
      - score_j = 1 - |rate_real - rate_syn|
    """
    if not num_cols:
        return 1.0

    scores = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue

        q1, q3 = np.percentile(r, [25, 75])
        iqr = q3 - q1
        if iqr == 0:
            continue

        lo = q1 - 1.5 * iqr
        hi = q3 + 1.5 * iqr
        rare_r = np.logical_or(r < lo, r > hi).mean()
        rare_s = np.logical_or(s < lo, s > hi).mean()
        score = 1.0 - abs(rare_r - rare_s)
        scores.append(score)

    if not scores:
        return 1.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))


def _q_corr(real_df, syn_df, num_cols):
    """
    Q_corr: correlation structure similarity across numeric features.
    1 - mean |corr_real - corr_syn| / 2.
    """
    if len(num_cols) < 2:
        return 1.0

    r_num = real_df[num_cols].astype(float).values
    s_num = syn_df[num_cols].astype(float).values

    cr = np.corrcoef(r_num, rowvar=False)
    cs = np.corrcoef(s_num, rowvar=False)

    mask = np.isfinite(cr) & np.isfinite(cs)
    if not mask.any():
        return 1.0

    diff = np.abs(cr - cs)[mask]
    mean_diff = diff.mean()     # max diff per entry ~2
    score = 1.0 - (mean_diff / 2.0)
    return float(np.clip(score, 0.0, 1.0))


def _q_diversity(real_df, syn_df, num_cols, cat_cols, num_bins=10):
    """
    Q_diversity:
      - numeric: support coverage via hist bins
      - categorical: category coverage
    """
    scores = []

    # numeric
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue
        lo, hi = np.min(r), np.max(r)
        if hi == lo:
            continue

        hr, edges = np.histogram(r, bins=num_bins, range=(lo, hi))
        hs, _ = np.histogram(s, bins=edges)
        real_support = hr > 0
        if not real_support.any():
            continue
        syn_support = hs > 0
        overlap = np.logical_and(real_support, syn_support).sum()
        cov = overlap / real_support.sum()
        scores.append(cov)

    # categorical
    for c in cat_cols:
        r = real_df[c].astype("object").fillna("nan").values
        s = syn_df[c].astype("object").fillna("nan").values
        if len(r) == 0 or len(s) == 0:
            continue
        cats_r = set(r)
        cats_s = set(s)
        if not cats_r:
            continue
        cov = len(cats_r & cats_s) / len(cats_r)
        scores.append(cov)

    if not scores:
        return 1.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))


# -------------------------
# Privacy metrics
# -------------------------

def _p_dcr(real_df, syn_df, num_cols, cat_cols):
    """
    P_DCR: Distance to Closest Record (higher = more private).
    We:
      - map categoricals to integer codes
      - z-score all columns using real
      - compute mean min-distance(syn row, real dataset)
      - squash with tanh to [0,1)
    """
    df_real = real_df.copy()
    df_syn  = syn_df.copy()

    # encode categoricals
    for c in cat_cols:
        cats = sorted(
            set(df_real[c].dropna().astype("object").unique()) |
            set(df_syn[c].dropna().astype("object").unique())
        )
        mapping = {v: i for i, v in enumerate(cats)}
        df_real[c] = df_real[c].astype("object").map(mapping).fillna(-1).astype(float)
        df_syn[c]  = df_syn[c].astype("object").map(mapping).fillna(-1).astype(float)

    arr_r = df_real.values.astype(float)
    arr_s = df_syn.values.astype(float)

    means = np.nanmean(arr_r, axis=0, keepdims=True)
    stds = np.nanstd(arr_r, axis=0, keepdims=True)
    stds[stds == 0] = 1.0
    arr_r = (arr_r - means) / stds
    arr_s = (arr_s - means) / stds

    d_min_all = []
    batch = 256
    for start in range(0, len(arr_s), batch):
        s_batch = arr_s[start:start+batch]
        diff = s_batch[:, None, :] - arr_r[None, :, :]
        dists = np.linalg.norm(diff, axis=-1)
        d_min = dists.min(axis=1)
        d_min_all.append(d_min)

    d_min_all = np.concatenate(d_min_all, axis=0)
    mean_d = float(d_min_all.mean())

    P_DCR = float(np.tanh(mean_d))   # in (0,1)
    return P_DCR


def _p_qdelta(real_df, syn_df, num_cols, quantiles=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """
    P_Qdelta: privacy from quantile differences on numeric columns.
    Smaller quantile mismatch => larger P_Qdelta.
    """
    if not num_cols:
        return 1.0

    diffs = []
    qs = np.array(quantiles)
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue
        qr = np.quantile(r, qs)
        qs_syn = np.quantile(s, qs)
        diffs.append(np.abs(qr - qs_syn).mean())

    if not diffs:
        return 1.0

    qdelta_raw = float(np.mean(diffs))
    P_Qdelta = 1.0 / (1.0 + qdelta_raw)  # (0,1]
    return float(np.clip(P_Qdelta, 0.0, 1.0))


def _p_no_dup(real_df, syn_df, round_decimals=3):
    """
    P_noDup: 1 - duplicate rate
      - within synthetic
      - and synthetic rows that exactly match a real row
    """
    def df_to_tuples(df):
        vals = df.copy()
        for c in vals.columns:
            if vals[c].dtype == "object":
                vals[c] = vals[c].fillna("nan")
            else:
                vals[c] = np.round(vals[c].astype(float), round_decimals)
        return [tuple(row) for row in vals.values]

    syn_t  = df_to_tuples(syn_df)
    real_t = set(df_to_tuples(real_df))
    n_syn = len(syn_t)

    from collections import Counter
    counts = Counter(syn_t)

    dup_within = sum(c for c in counts.values() if c > 1)
    dup_within_rate = dup_within / n_syn

    dup_cross = sum(1 for t in syn_t if t in real_t)
    dup_cross_rate = dup_cross / n_syn

    dup_rate = (dup_within_rate + dup_cross_rate) / 2.0
    P_noDup = 1.0 - dup_rate
    return float(np.clip(P_noDup, 0.0, 1.0))


# -------------------------
# Main entry
# -------------------------

def evaluate_pair(
    real_path: str,
    syn_path: str,
    dataset_name: str = None,
    synthetic_name: str = None,
    cat_cols=None,  # optional list of categorical columns
):
    """
    Compute all metrics for one (real, synthetic) pair.

    Returns a 1-row pandas.DataFrame with columns:
      dataset, synthetic,
      Q_overall, Q_num(1-KS), Q_cat(1-TVD), Q_range, Q_anomaly, Q_corr, Q_diversity,
      P_overall, P_DCR, P_Qdelta, P_noDup,
      n_real, n_syn, num_cols_used, cat_cols_used
    """
    real_df = pd.read_csv(real_path)
    syn_df  = pd.read_csv(syn_path)

    real_df, syn_df = _align_real_synth(real_df, syn_df)

    num_cols, cat_cols_detected = _detect_column_types(real_df, cat_cols)
    cat_cols_used = cat_cols_detected
    num_cols_used = num_cols

    # Quality
    Q_num, _ = _q_num_1_minus_ks(real_df, syn_df, num_cols_used)
    Q_cat, _ = _q_cat_1_minus_tvd(real_df, syn_df, cat_cols_used)
    Q_range = _q_range(real_df, syn_df, num_cols_used)
    Q_anomaly = _q_anomaly(real_df, syn_df, num_cols_used)
    Q_corr = _q_corr(real_df, syn_df, num_cols_used)
    Q_diversity = _q_diversity(real_df, syn_df, num_cols_used, cat_cols_used)

    Q_components = [Q_num, Q_cat, Q_range, Q_anomaly, Q_corr, Q_diversity]
    Q_overall = float(np.mean(Q_components))

    # Privacy
    P_DCR = _p_dcr(real_df, syn_df, num_cols_used, cat_cols_used)
    P_Qdelta = _p_qdelta(real_df, syn_df, num_cols_used)
    P_noDup = _p_no_dup(real_df, syn_df)

    P_overall = float(np.mean([P_DCR, P_Qdelta, P_noDup]))

    result = {
        "dataset": dataset_name if dataset_name is not None else real_path,
        "synthetic": synthetic_name if synthetic_name is not None else syn_path,
        "Q_overall": Q_overall,
        "Q_num(1-KS)": Q_num,
        "Q_cat(1-TVD)": Q_cat,
        "Q_range": Q_range,
        "Q_anomaly": Q_anomaly,
        "Q_corr": Q_corr,
        "Q_diversity": Q_diversity,
        "P_overall": P_overall,
        "P_DCR": P_DCR,
        "P_Qdelta": P_Qdelta,
        "P_noDup": P_noDup,
        "n_real": len(real_df),
        "n_syn": len(syn_df),
        "num_cols_used": len(num_cols_used),
        "cat_cols_used": len(cat_cols_used),
    }
    return pd.DataFrame([result])


# -------------------------
# Example usage in Colab
# -------------------------
# metrics = evaluate_pair(
#     real_path="breast-cancer-wisconsin.csv",
#     syn_path="breast-cancer-wisconsin_CTABGAN_DP.csv",
#     dataset_name="breast-cancer-wisconsin.csv",
#     synthetic_name="breast-cancer-wisconsin_CTABGAN_DP.csv",
#     cat_cols=None,  # or ['some_categorical_col']
# )
# print(metrics)


In [5]:
# ============================================
# Tabular synthetic data metrics (Quality + Privacy)
# ============================================
# Metrics:
#   Q_overall
#   Q_num(1-KS)
#   Q_cat(1-TVD)
#   Q_range
#   Q_anomaly
#   Q_corr
#   Q_diversity
#   P_overall
#   P_DCR
#   P_Qdelta
#   P_noDup
#   n_real, n_syn, num_cols_used, cat_cols_used
# ============================================

!pip install -q scipy

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from collections import Counter


# -------------------------
# Helpers: types & alignment
# -------------------------

def _detect_column_types(df, cat_cols=None):
    cols = df.columns.tolist()
    if cat_cols is None:
        # auto-detect categoricals
        cat_cols = [c for c in cols
                    if df[c].dtype == "object" or str(df[c].dtype).startswith("category")]
    else:
        cat_cols = [c for c in cat_cols if c in cols]

    num_cols = [c for c in cols if c not in cat_cols]
    return num_cols, cat_cols


def _align_real_synth(real_df, syn_df):
    # keep common columns, same order as real
    common = [c for c in real_df.columns if c in syn_df.columns]
    real_df = real_df[common].copy()
    syn_df  = syn_df[common].copy()
    # same number of rows (truncate to min)
    n = min(len(real_df), len(syn_df))
    real_df = real_df.iloc[:n].reset_index(drop=True)
    syn_df  = syn_df.iloc[:n].reset_index(drop=True)
    return real_df, syn_df


# -------------------------
# Quality metrics
# -------------------------

def _q_num_1_minus_ks(real_df, syn_df, num_cols):
    """Numeric quality: mean(1 - KS-statistic) across numeric columns."""
    if not num_cols:
        return 1.0, 0

    scores = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 2 or len(s) < 2:
            continue
        stat, _ = ks_2samp(r, s)
        scores.append(1.0 - stat)

    if not scores:
        return 1.0, 0
    return float(np.clip(np.mean(scores), 0.0, 1.0)), len(scores)


def _q_cat_1_minus_tvd(real_df, syn_df, cat_cols):
    """Categorical quality: mean(1 - TVD) over categorical columns."""
    if not cat_cols:
        return 1.0, 0

    scores = []
    for c in cat_cols:
        r = real_df[c].astype("object").fillna("nan").values
        s = syn_df[c].astype("object").fillna("nan").values
        if len(r) == 0 or len(s) == 0:
            continue

        vals = sorted(set(r) | set(s))
        cr = Counter(r)
        cs = Counter(s)
        pr = np.array([cr[v] for v in vals], dtype=float)
        ps = np.array([cs[v] for v in vals], dtype=float)
        pr /= pr.sum()
        ps /= ps.sum()
        tvd = 0.5 * np.abs(pr - ps).sum()
        scores.append(1.0 - tvd)

    if not scores:
        return 1.0, 0
    return float(np.clip(np.mean(scores), 0.0, 1.0)), len(scores)


def _q_range(real_df, syn_df, num_cols):
    """Q_range: fraction of synthetic values inside real min–max, averaged across numeric columns."""
    if not num_cols:
        return 1.0

    covs = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) == 0 or len(s) == 0:
            continue
        lo, hi = np.min(r), np.max(r)
        if hi == lo:
            continue
        in_range = np.logical_and(s >= lo, s <= hi).mean()
        covs.append(in_range)

    if not covs:
        return 1.0
    return float(np.clip(np.mean(covs), 0.0, 1.0))


def _q_anomaly(real_df, syn_df, num_cols):
    """
    Q_anomaly: anomaly-rate preservation using IQR rule.
    For each numeric column:
      - define anomalies on real via Tukey rule
      - compare anomaly rates (real vs synthetic)
      - score_j = 1 - |rate_real - rate_syn|
    """
    if not num_cols:
        return 1.0

    scores = []
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue

        q1, q3 = np.percentile(r, [25, 75])
        iqr = q3 - q1
        if iqr == 0:
            continue

        lo = q1 - 1.5 * iqr
        hi = q3 + 1.5 * iqr
        rare_r = np.logical_or(r < lo, r > hi).mean()
        rare_s = np.logical_or(s < lo, s > hi).mean()
        score = 1.0 - abs(rare_r - rare_s)
        scores.append(score)

    if not scores:
        return 1.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))


def _q_corr(real_df, syn_df, num_cols):
    """
    Q_corr: correlation structure similarity across numeric features.
    1 - mean |corr_real - corr_syn| / 2.
    """
    if len(num_cols) < 2:
        return 1.0

    r_num = real_df[num_cols].astype(float).values
    s_num = syn_df[num_cols].astype(float).values

    cr = np.corrcoef(r_num, rowvar=False)
    cs = np.corrcoef(s_num, rowvar=False)

    mask = np.isfinite(cr) & np.isfinite(cs)
    if not mask.any():
        return 1.0

    diff = np.abs(cr - cs)[mask]
    mean_diff = diff.mean()     # max diff per entry ~2
    score = 1.0 - (mean_diff / 2.0)
    return float(np.clip(score, 0.0, 1.0))


def _q_diversity(real_df, syn_df, num_cols, cat_cols, num_bins=10):
    """
    Q_diversity:
      - numeric: support coverage via hist bins
      - categorical: category coverage
    """
    scores = []

    # numeric
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue
        lo, hi = np.min(r), np.max(r)
        if hi == lo:
            continue

        hr, edges = np.histogram(r, bins=num_bins, range=(lo, hi))
        hs, _ = np.histogram(s, bins=edges)
        real_support = hr > 0
        if not real_support.any():
            continue
        syn_support = hs > 0
        overlap = np.logical_and(real_support, syn_support).sum()
        cov = overlap / real_support.sum()
        scores.append(cov)

    # categorical
    for c in cat_cols:
        r = real_df[c].astype("object").fillna("nan").values
        s = syn_df[c].astype("object").fillna("nan").values
        if len(r) == 0 or len(s) == 0:
            continue
        cats_r = set(r)
        cats_s = set(s)
        if not cats_r:
            continue
        cov = len(cats_r & cats_s) / len(cats_r)
        scores.append(cov)

    if not scores:
        return 1.0
    return float(np.clip(np.mean(scores), 0.0, 1.0))


# -------------------------
# Privacy metrics
# -------------------------

def _p_dcr(real_df, syn_df, num_cols, cat_cols):
    """
    P_DCR: Distance to Closest Record (higher = more private).
    We:
      - map categoricals to integer codes
      - z-score all columns using real
      - compute mean min-distance(syn row, real dataset)
      - squash with tanh to [0,1)
    """
    df_real = real_df.copy()
    df_syn  = syn_df.copy()

    # encode categoricals
    for c in cat_cols:
        cats = sorted(
            set(df_real[c].dropna().astype("object").unique()) |
            set(df_syn[c].dropna().astype("object").unique())
        )
        mapping = {v: i for i, v in enumerate(cats)}
        df_real[c] = df_real[c].astype("object").map(mapping).fillna(-1).astype(float)
        df_syn[c]  = df_syn[c].astype("object").map(mapping).fillna(-1).astype(float)

    arr_r = df_real.values.astype(float)
    arr_s = df_syn.values.astype(float)

    means = np.nanmean(arr_r, axis=0, keepdims=True)
    stds = np.nanstd(arr_r, axis=0, keepdims=True)
    stds[stds == 0] = 1.0
    arr_r = (arr_r - means) / stds
    arr_s = (arr_s - means) / stds

    d_min_all = []
    batch = 256
    for start in range(0, len(arr_s), batch):
        s_batch = arr_s[start:start+batch]
        diff = s_batch[:, None, :] - arr_r[None, :, :]
        dists = np.linalg.norm(diff, axis=-1)
        d_min = dists.min(axis=1)
        d_min_all.append(d_min)

    d_min_all = np.concatenate(d_min_all, axis=0)
    mean_d = float(d_min_all.mean())

    P_DCR = float(np.tanh(mean_d))   # in (0,1)
    return P_DCR


def _p_qdelta(real_df, syn_df, num_cols, quantiles=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """
    P_Qdelta: privacy from quantile differences on numeric columns.
    Smaller quantile mismatch => larger P_Qdelta.
    """
    if not num_cols:
        return 1.0

    diffs = []
    qs = np.array(quantiles)
    for c in num_cols:
        r = real_df[c].dropna().values
        s = syn_df[c].dropna().values
        if len(r) < 10 or len(s) < 10:
            continue
        qr = np.quantile(r, qs)
        qs_syn = np.quantile(s, qs)
        diffs.append(np.abs(qr - qs_syn).mean())

    if not diffs:
        return 1.0

    qdelta_raw = float(np.mean(diffs))
    P_Qdelta = 1.0 / (1.0 + qdelta_raw)  # (0,1]
    return float(np.clip(P_Qdelta, 0.0, 1.0))


def _p_no_dup(real_df, syn_df, round_decimals=3):
    """
    P_noDup: 1 - duplicate rate
      - within synthetic
      - and synthetic rows that exactly match a real row
    """
    def df_to_tuples(df):
        vals = df.copy()
        for c in vals.columns:
            if vals[c].dtype == "object":
                vals[c] = vals[c].fillna("nan")
            else:
                vals[c] = np.round(vals[c].astype(float), round_decimals)
        return [tuple(row) for row in vals.values]

    syn_t  = df_to_tuples(syn_df)
    real_t = set(df_to_tuples(real_df))
    n_syn = len(syn_t)

    from collections import Counter
    counts = Counter(syn_t)

    dup_within = sum(c for c in counts.values() if c > 1)
    dup_within_rate = dup_within / n_syn

    dup_cross = sum(1 for t in syn_t if t in real_t)
    dup_cross_rate = dup_cross / n_syn

    dup_rate = (dup_within_rate + dup_cross_rate) / 2.0
    P_noDup = 1.0 - dup_rate
    return float(np.clip(P_noDup, 0.0, 1.0))


# -------------------------
# Main entry
# -------------------------

def evaluate_pair(
    real_path: str,
    syn_path: str,
    dataset_name: str = None,
    synthetic_name: str = None,
    cat_cols=None,  # optional list of categorical columns
):
    """
    Compute all metrics for one (real, synthetic) pair.

    Returns a 1-row pandas.DataFrame with columns:
      dataset, synthetic,
      Q_overall, Q_num(1-KS), Q_cat(1-TVD), Q_range, Q_anomaly, Q_corr, Q_diversity,
      P_overall, P_DCR, P_Qdelta, P_noDup,
      n_real, n_syn, num_cols_used, cat_cols_used
    """
    real_df = pd.read_csv(real_path)
    syn_df  = pd.read_csv(syn_path)

    real_df, syn_df = _align_real_synth(real_df, syn_df)

    num_cols, cat_cols_detected = _detect_column_types(real_df, cat_cols)
    cat_cols_used = cat_cols_detected
    num_cols_used = num_cols

    # Quality
    Q_num, _ = _q_num_1_minus_ks(real_df, syn_df, num_cols_used)
    Q_cat, _ = _q_cat_1_minus_tvd(real_df, syn_df, cat_cols_used)
    Q_range = _q_range(real_df, syn_df, num_cols_used)
    Q_anomaly = _q_anomaly(real_df, syn_df, num_cols_used)
    Q_corr = _q_corr(real_df, syn_df, num_cols_used)
    Q_diversity = _q_diversity(real_df, syn_df, num_cols_used, cat_cols_used)

    Q_components = [Q_num, Q_cat, Q_range, Q_anomaly, Q_corr, Q_diversity]
    Q_overall = float(np.mean(Q_components))

    # Privacy
    P_DCR = _p_dcr(real_df, syn_df, num_cols_used, cat_cols_used)
    P_Qdelta = _p_qdelta(real_df, syn_df, num_cols_used)
    P_noDup = _p_no_dup(real_df, syn_df)

    P_overall = float(np.mean([P_DCR, P_Qdelta, P_noDup]))

    result = {
        "dataset": dataset_name if dataset_name is not None else real_path,
        "synthetic": synthetic_name if synthetic_name is not None else syn_path,
        "Q_overall": Q_overall,
        "Q_num(1-KS)": Q_num,
        "Q_cat(1-TVD)": Q_cat,
        "Q_range": Q_range,
        "Q_anomaly": Q_anomaly,
        "Q_corr": Q_corr,
        "Q_diversity": Q_diversity,
        "P_overall": P_overall,
        "P_DCR": P_DCR,
        "P_Qdelta": P_Qdelta,
        "P_noDup": P_noDup,
        "n_real": len(real_df),
        "n_syn": len(syn_df),
        "num_cols_used": len(num_cols_used),
        "cat_cols_used": len(cat_cols_used),
    }
    return pd.DataFrame([result])


# -------------------------
# Interactive usage
# -------------------------
if __name__ == "__main__":
    # In Colab this will ask in the cell output area:
    real_path = input("Enter path to REAL data CSV (e.g. breast-cancer-wisconsin.csv): ").strip()
    syn_path  = input("Enter path to SYNTHETIC data CSV (e.g. breast-cancer-wisconsin_CTABGAN_DP.csv): ").strip()

    metrics = evaluate_pair(
        real_path=real_path,
        syn_path=syn_path,
        dataset_name=real_path,
        synthetic_name=syn_path,
        cat_cols=None,  # or e.g. ['diagnosis', 'gender'] if you want to force categoricals
    )

    # Wide (original) view – optional
    print("\n==== Metrics (wide, 1 row) ====\n")
    print(metrics.to_string(index=False))

    # Vertical view: key–value pairs
    print("\n==== Metrics (vertical) ====\n")
    vertical = metrics.T.reset_index()
    vertical.columns = ["Metric", "Value"]
    print(vertical.to_string(index=False))



Enter path to REAL data CSV (e.g. breast-cancer-wisconsin.csv): /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv
Enter path to SYNTHETIC data CSV (e.g. breast-cancer-wisconsin_CTABGAN_DP.csv): /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/synthetic_data_alm.csv

==== Metrics (wide, 1 row) ====

                                                                                      dataset                                                                                synthetic  Q_overall  Q_num(1-KS)  Q_cat(1-TVD)  Q_range  Q_anomaly   Q_corr  Q_diversity  P_overall    P_DCR  P_Qdelta  P_noDup  n_real  n_syn  num_cols_used  cat_cols_used
/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv /content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/synthetic_data_alm.csv   0.741425     0.408625           1.0 0.999601   0.953294 0.791574     0.295455   0.656152 0.968378  0.000077

In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from copy import deepcopy

# ---------------------------------------------------
# 1) Paths to REAL and SYNTHETIC data
# ---------------------------------------------------
real_path = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/breast-cancer-wisconsin.csv"
syn_path  = "/content/drive/MyDrive/CTAB-GAN-Plus-main/CTAB-GAN-Plus-main/data/synthetic_data_alm.csv"

# Set your label column name here:
TARGET_COL = "Class"   # <-- change this if your target is called something else

# ---------------------------------------------------
# 2) Load and align data
# ---------------------------------------------------
real_df = pd.read_csv(real_path)
syn_df  = pd.read_csv(syn_path)

if TARGET_COL not in real_df.columns:
    raise ValueError(f"TARGET_COL='{TARGET_COL}' not in REAL columns: {real_df.columns.tolist()}")
if TARGET_COL not in syn_df.columns:
    raise ValueError(f"TARGET_COL='{TARGET_COL}' not in SYN columns: {syn_df.columns.tolist()}")

# Keep only common columns in the same order
common_cols = [c for c in real_df.columns if c in syn_df.columns]
real_df = real_df[common_cols].copy()
syn_df  = syn_df[common_cols].copy()

# ---------------------------------------------------
# 3) Split into features (X) and label (y)
# ---------------------------------------------------
X_real = real_df.drop(columns=[TARGET_COL])
y_real = real_df[TARGET_COL]

X_syn = syn_df.drop(columns=[TARGET_COL])
y_syn = syn_df[TARGET_COL]

# ---------------------------------------------------
# 4) Train/test split on REAL data (always test on REAL)
# ---------------------------------------------------
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X_real, y_real, test_size=0.3, random_state=42, stratify=y_real
)

# ---------------------------------------------------
# 5) Preprocessing + model
# ---------------------------------------------------
numeric_features = X_real.select_dtypes(include=["int64","float64","int32","float32"]).columns.tolist()
categorical_features = [c for c in X_real.columns if c not in numeric_features]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop",
)

clf = LogisticRegression(max_iter=1000, n_jobs=-1)

def fit_and_eval(X_train, y_train, X_test, y_test, desc=""):
    pipe = Pipeline([
        ("preprocess", deepcopy(preprocess)),
        ("clf", deepcopy(clf)),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    classes = np.unique(y_train)
    binary = len(classes) == 2

    acc = accuracy_score(y_test, y_pred)
    if binary:
        f1 = f1_score(y_test, y_pred, average="binary", pos_label=classes[1])
    else:
        f1 = f1_score(y_test, y_pred, average="macro")

    try:
        y_proba = pipe.predict_proba(X_test)
        if binary:
            pos_idx = list(pipe.named_steps["clf"].classes_).index(classes[1])
            y_score = y_proba[:, pos_idx]
            auc = roc_auc_score(y_test, y_score)
        else:
            auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
    except Exception:
        auc = np.nan

    return {
        "setting": desc,
        "ACC": acc,
        "F1": f1,
        "AUC": auc,
    }

# ---------------------------------------------------
# 6) Evaluate:
#     - Train on REAL, test on REAL (baseline)
#     - Train on SYN,  test on REAL (utility of synthetic)
# ---------------------------------------------------
metrics_real = fit_and_eval(
    X_train_real, y_train_real, X_test_real, y_test_real,
    desc="Train on REAL, test on REAL"
)
metrics_syn  = fit_and_eval(
    X_syn, y_syn, X_test_real, y_test_real,
    desc="Train on SYN,  test on REAL"
)

results_df = pd.DataFrame([metrics_real, metrics_syn])

# ---------------------------------------------------
# 7) Tabular output (metrics as rows, settings as columns)
# ---------------------------------------------------
# Reformat so rows = ACC/F1/AUC and columns = each setting
settings = results_df["setting"].tolist()
metric_names = ["ACC", "F1", "AUC"]

vertical_table = results_df.set_index("setting")[metric_names].T.reset_index()
vertical_table.rename(columns={"index": "Metric"}, inplace=True)

print("\n==== Classification metrics (tabular) ====\n")
print(vertical_table.to_string(index=False))


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [8]:
# save as build_poster_slide.py and run:  python build_poster_slide.py
from pptx import Presentation
from pptx.util import Inches
from PIL import Image, ImageDraw, ImageFont

# 1) Draw the flowchart to a PNG (pure Pillow for portability)
W,H = 3200, 1400
img = Image.new("RGB", (W,H), "white")
d = ImageDraw.Draw(img)
def rounded(x,y,w,h,r,fill): d.rounded_rectangle([x,y,x+w,y+h], r, fill=fill)

# colors
title="#b6c23d"; c1="#f0c808"; c2="#d99a00"; c3="#9bbf6a"; c4="#b9b2ad"; c5="#e38b42"; foot="#e3d66e"; txt="#111111"

# title
rounded(40,40,W-80,150,20,title); d.text((60,95),"IDEAL: Interactive Detection & Explanation of Anomalies (LSTM Autoencoder)", fill=txt, anchor="lm")

# boxes
boxes=[(c1,"Data Preparation",["Clean / encode","Set window size"]),
       (c2,"Constraint Discovery",["LSTM-AE learns","Long-term constraints"]),
       (c3,"Anomaly Detection",["Detect suspicious","Records & sequences"]),
       (c4,"Anomaly Interpretation",["Explain violations","Feature importances"]),
       (c5,"Anomaly Inspection",["Expert review","Retrain with feedback"])]
x0=120; y0=380; bw=520; bh=420; gap=80
for i,(c,title_txt,bul) in enumerate(boxes):
    x = x0 + i*(bw+gap)
    rounded(x,y0,bw,bh,28,c)
    d.text((x+bw/2,y0+60), title_txt, fill=txt, anchor="mm")
    d.text((x+30,y0+140), f"• {bul[0]}\n• {bul[1]}", fill=txt, anchor="la")

    # arrow to next
    if i<4:
        ax = x+bw+10; ay = y0+bh/2
        d.line([(ax,ay),(ax+gap-20,ay)], fill="#6b6b6b", width=8)
        d.polygon([(ax+gap-20,ay-14),(ax+gap,ay),(ax+gap-20,ay+14)], fill="#6b6b6b")

# footer
rounded(40,H-170,W-80,130,20,foot)
d.text((W/2,H-105), "Suspicious Sequence Detected from Johns Hopkins COVID-19 Data", fill=txt, anchor="mm")

png_path = "flowchart_ctabgan_dp.png"
img.save(png_path, dpi=(300,300))

# 2) Drop the PNG into a PPTX slide
prs = Presentation()
prs.slide_width = Inches(13.333)   # 16:9
prs.slide_height = Inches(7.5)
slide = prs.slides.add_slide(prs.slide_layouts[6])
slide.shapes.add_picture(png_path, Inches(0.25), Inches(0.25), width=prs.slide_width-Inches(0.5))
prs.save("CTABGAN_DP_Flowchart.pptx")
print("Wrote CTABGAN_DP_Flowchart.pptx")


ModuleNotFoundError: No module named 'pptx'