In [None]:
# ===========================
# Conditional GQE + QSVM TSTR (MerLin, Quandela)
# For the Big Quantum Hackathon (Team 8)
# Motivation:
#   Mitigate customer privacy risks by training classifiers on
#   quantum-generated synthetic credit profiles instead of raw customer data.
#
# What this script does:
#   1) Train a QSVM on REAL train data and evaluate on REAL test data
#      → baseline performance (utility reference).
#   2) Train a Conditional Generative Quantum Model (GQE) using class-wise
#      MMD with a fidelity-based quantum kernel to learn p(x | y).
#   3) Generate synthetic labelled customers (both non-default and default).
#   4) TSTR evaluation (Train on Synthetic, Test on Real):
#        - Train a QSVM only on SYNTHETIC data.
#        - Test this QSVM on REAL test data.
#      → If performance is close to the real-trained QSVM, synthetic data
#        captures the credit-risk structure while reducing direct exposure
#        of real customer records.
#   5)  Check privacy:
#        - Nearest-neighbor distances (synthetic → real) to detect memorization.

# ===========================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score,
)

import merlin as ML
from merlin.algorithms.kernels import FidelityKernel

torch.manual_seed(0)
np.random.seed(0)

# ===========================
# 1. Load train/test data
# ===========================
train_path = "data/credit_train.csv"
test_path  = "data/credit_test.csv"

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

# Assume:
# - col 0 = ID (applicant_id)
# - cols 1:-1 = features
# - last col = label (default_label, 0/1)
X_train = df_train.iloc[:, 1:-1].values.astype("float32")
y_train = df_train.iloc[:, -1].values.astype("int64")

X_test  = df_test.iloc[:, 1:-1].values.astype("float32")
y_test  = df_test.iloc[:, -1].values.astype("int64")

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train default rate:", np.mean(y_train))
print("Test  default rate:", np.mean(y_test))

# ===========================
# 2. Scale features (fit on train, apply to test)
# ===========================
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

x_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
x_test_t  = torch.tensor(X_test_scaled,  dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_test_t  = torch.tensor(y_test,  dtype=torch.long)

# ===========================
# 3. Quantum kernel + baseline QSVM (Train REAL, Test REAL)
# ===========================
kernel = FidelityKernel.simple(
    input_size=6,
    n_modes=6,
    shots=0,
    no_bunching=False,
    dtype=torch.float32,
    device=torch.device("cpu"),
)

K_train = kernel(x_train_t)
K_test  = kernel(x_test_t, x_train_t)

K_train_np = K_train.detach().numpy()
K_test_np  = K_test.detach().numpy()

'''qsvc_real = SVC(
    kernel="precomputed",
    class_weight="balanced",
    probability=False,
)
qsvc_real.fit(K_train_np, y_train)

y_pred_real = qsvc_real.predict(K_test_np)
scores_real = qsvc_real.decision_function(K_test_np)

print("\n=== QSVM baseline (Train REAL, Test REAL) ===")
print("Confusion matrix (real test):")
print(confusion_matrix(y_test, y_pred_real))
print("\nClassification report (real test):")
print(classification_report(y_test, y_pred_real, digits=3))
print("Balanced accuracy (real test):", balanced_accuracy_score(y_test, y_pred_real))
print("ROC-AUC (real test):", roc_auc_score(y_test, scores_real))
print("F1 (default=1, real test):", f1_score(y_test, y_pred_real, pos_label=1))
'''
# ===========================
# 4. Conditional Quantum Generator (cGQE)
# ===========================
class ConditionalQuantumGenerator(nn.Module):
    def __init__(self, latent_dim: int = 8, n_classes: int = 2, out_dim: int = 6):
        super().__init__()
        self.latent_dim = latent_dim
        self.n_classes = n_classes

        input_size = latent_dim + n_classes  # z + one-hot(y)

        self.quantum_core = ML.QuantumLayer.simple(
            input_size=input_size,
            n_params=50,   # requested budget; may be exceeded internally
        )

        # Determine quantum output dimension dynamically
        with torch.no_grad():
            dummy_z = torch.randn(1, latent_dim)
            dummy_y = torch.tensor([0])
            dummy_onehot = F.one_hot(dummy_y, num_classes=n_classes).float()
            dummy_in = torch.cat([dummy_z, dummy_onehot], dim=1)
            q_out = self.quantum_core(dummy_in)
            q_dim = q_out.shape[1]

        self.head = nn.Sequential(
            nn.Linear(q_dim, 32),
            nn.ReLU(),
            nn.Linear(32, out_dim),
            nn.Sigmoid(),  # because we scaled features to [0,1]
        )

    def forward(self, z: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        y_onehot = F.one_hot(y, num_classes=self.n_classes).float()
        inp = torch.cat([z, y_onehot], dim=1)
        q_feat = self.quantum_core(inp)
        x_hat = self.head(q_feat)
        return x_hat


def classwise_mmd(real_x, real_y, fake_x, fake_y, kernel):
    """
    Sum MMD^2 over classes: MMD(real|c, fake|c) for c in {0,1}.
    Encourages generator to match p(x|y=c) for each class.
    """
    loss = 0.0
    classes = torch.unique(real_y)
    for c in classes:
        c = c.item()
        real_c = real_x[real_y == c]
        fake_c = fake_x[fake_y == c]
        if real_c.size(0) < 2 or fake_c.size(0) < 2:
            continue
        K_rr = kernel(real_c)
        K_ff = kernel(fake_c)
        K_rf = kernel(real_c, fake_c)
        mmd2 = K_rr.mean() + K_ff.mean() - 2.0 * K_rf.mean()
        loss += mmd2
    return loss

latent_dim = 8
generator = ConditionalQuantumGenerator(latent_dim=latent_dim, n_classes=2, out_dim=6)
optimizer = torch.optim.Adam(generator.parameters(), lr=1e-3)

batch_size = 64
n_epochs = 50

print("\n=== Training Conditional GQE (class-wise MMD) ===")
for epoch in range(1, n_epochs + 1):
    perm = torch.randperm(x_train_t.size(0))
    epoch_loss = 0.0
    n_batches = 0

    for start in range(0, x_train_t.size(0), batch_size):
        end = min(start + batch_size, x_train_t.size(0))
        idx = perm[start:end]
        real_x = x_train_t[idx]
        real_y = y_train_t[idx]

        b = real_x.size(0)
        if b < 4:
            continue

        z = torch.randn(b, latent_dim)
        fake_y = real_y.clone()          # preserve class distribution per batch
        fake_x = generator(z, fake_y)

        loss = classwise_mmd(real_x, real_y, fake_x, fake_y, kernel)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    if n_batches > 0 and (epoch % 5 == 0 or epoch == 1):
        print(f"Epoch {epoch:03d} | classwise MMD^2 loss: {epoch_loss / n_batches:.6f}")

# ===========================
# 5. Generate synthetic data for BOTH classes (0 and 1)
# ===========================
generator.eval()
with torch.no_grad():
    n_synth_per_class = X_train.shape[0] // 2   # you can tune this
    z0 = torch.randn(n_synth_per_class, latent_dim)
    y0 = torch.zeros(n_synth_per_class, dtype=torch.long)
    x0_scaled = generator(z0, y0)

    z1 = torch.randn(n_synth_per_class, latent_dim)
    y1 = torch.ones(n_synth_per_class, dtype=torch.long)
    x1_scaled = generator(z1, y1)

x_synth_scaled = torch.cat([x0_scaled, x1_scaled], dim=0)
y_synth_true   = torch.cat([y0, y1], dim=0)

print("\nSynthetic true label distribution:", np.bincount(y_synth_true.numpy()))

# Optional: view some synthetic samples in original scale
x_synth = scaler.inverse_transform(x_synth_scaled.cpu().numpy())
synth_df = pd.DataFrame(x_synth, columns=df_train.columns[1:-1])
print("\nSample synthetic customers (first 5 rows):")
print(synth_df.head())

# ===========================
# 6. TSTR: Train QSVM on SYNTHETIC, test on REAL
# ===========================
# Build Gram matrices for QSVM trained on synthetic data
with torch.no_grad():
    K_synth_train = kernel(x_synth_scaled)             # [N_synth, N_synth]
    K_test_TSTR  = kernel(x_test_t, x_synth_scaled)    # [N_test, N_synth]

K_synth_train_np = K_synth_train.detach().numpy()
K_test_TSTR_np   = K_test_TSTR.detach().numpy()

# QSVM trained ONLY on synthetic data (labels = y_synth_true)
qsvc_TSTR = SVC(
    kernel="precomputed",
    class_weight="balanced",
    probability=False,
)
qsvc_TSTR.fit(K_synth_train_np, y_synth_true.numpy())

# Evaluate TSTR model on REAL test set
y_pred_TSTR = qsvc_TSTR.predict(K_test_TSTR_np)
scores_TSTR = qsvc_TSTR.decision_function(K_test_TSTR_np)

print("\n=== TSTR: QSVM trained on SYNTHETIC, tested on REAL ===")
print("Confusion matrix (real test, TSTR model):")
print(confusion_matrix(y_test, y_pred_TSTR))

print("\nClassification report (real test, TSTR model):")
print(classification_report(y_test, y_pred_TSTR, digits=3))
print("Balanced accuracy (TSTR, real test):", balanced_accuracy_score(y_test, y_pred_TSTR))
print("ROC-AUC (TSTR, real test):", roc_auc_score(y_test, scores_TSTR))
print("F1 (default=1, TSTR, real test):", f1_score(y_test, y_pred_TSTR, pos_label=1))


Train shape: (600, 6)  Test shape: (240, 6)
Train default rate: 0.22
Test  default rate: 0.22083333333333333


  self.quantum_core = ML.QuantumLayer.simple(



=== Training Conditional GQE (class-wise MMD) ===
Epoch 001 | classwise MMD^2 loss: 0.552785
Epoch 005 | classwise MMD^2 loss: 0.507018
Epoch 010 | classwise MMD^2 loss: 0.456546
Epoch 015 | classwise MMD^2 loss: 0.397049
Epoch 020 | classwise MMD^2 loss: 0.285551
Epoch 025 | classwise MMD^2 loss: 0.270854
Epoch 030 | classwise MMD^2 loss: 0.248815
Epoch 035 | classwise MMD^2 loss: 0.268982
Epoch 040 | classwise MMD^2 loss: 0.258356
Epoch 045 | classwise MMD^2 loss: 0.272531
Epoch 050 | classwise MMD^2 loss: 0.277224

Synthetic true label distribution: [300 300]

Sample synthetic customers (first 5 rows):
     income  credit_utilization  payment_history  num_open_accounts  \
0  0.571781            0.307024         0.675343           8.938728   
1  0.564790            0.324205         0.661471           8.856956   
2  0.581118            0.294490         0.682127           8.998247   
3  0.581514            0.290290         0.687295           9.013526   
4  0.559172            0.334471

Our QSVM which was trained only on synthetic quantum-generated data can still reach almost the same performance on the real test set as a QSVM trained directly on real customer records.


In detail, in a TSTR (Train on synthatic/Test on real) setting, our QSVM trained solely on synthetic data generated by our conditional GQE achieves a ROC-AUC of 0.962 on the real test set, compared to 0.970 when trained on real data, indicating that the generative model captures the essential credit-risk structure while enabling privacy-preserving model training.

Therefore, Generative Quantum Models (GQE) can synthesize class-conditional, privacy-preserving credit profiles that retain enough structure to train competitive QSVM classifiers, as evidenced by near-baseline ROC-AUC and balanced accuracy in a Train-on-Synthetic, Test-on-Real evaluation.

In [11]:
from sklearn.neighbors import NearestNeighbors

# Use scaled space (0–1 per feature) for distance
real = X_train_scaled                     # shape [N_real, 6], numpy
synth = x_synth_scaled.cpu().numpy()      # shape [N_synth, 6], numpy

# 1) Nearest real neighbor for each synthetic sample
nbrs_real = NearestNeighbors(n_neighbors=1).fit(real)
dist_synth_to_real, idx_synth_to_real = nbrs_real.kneighbors(synth)  # shapes [N_synth, 1], [N_synth, 1]
dist_synth_to_real = dist_synth_to_real.ravel()

print("\n=== Nearest-neighbor distances: synthetic → real (scaled space) ===")
print("min distance:", dist_synth_to_real.min())
print("5/25/50/75/95 percentiles:",
      np.percentile(dist_synth_to_real, [5, 25, 50, 75, 95]))

# Optional: how many synthetic points are *very* close to some real point?
eps = 1e-3
print(f"Number of synthetic points with distance < {eps}:",
      np.sum(dist_synth_to_real < eps))

# 2) As a baseline, nearest OTHER real neighbor for each real sample
#    This shows what "normal" distances look like among real data.
nbrs_self = NearestNeighbors(n_neighbors=2).fit(real)
dist_real_to_real, idx_real_to_real = nbrs_self.kneighbors(real)
# first neighbor is the point itself (distance=0), second is nearest other point
dist_real_to_real = dist_real_to_real[:, 1]

print("\n=== Nearest-neighbor distances: real → real (excluding self) ===")
print("min distance:", dist_real_to_real.min())
print("5/25/50/75/95 percentiles:",
      np.percentile(dist_real_to_real, [5, 25, 50, 75, 95]))



=== Nearest-neighbor distances: synthetic → real (scaled space) ===
min distance: 0.11544365935330599
5/25/50/75/95 percentiles: [0.14318918 0.15138911 0.16117396 0.17791756 0.22531789]
Number of synthetic points with distance < 0.001: 0

=== Nearest-neighbor distances: real → real (excluding self) ===
min distance: 0.07869872156449058
5/25/50/75/95 percentiles: [0.11377949 0.15726479 0.19196972 0.22481251 0.28327595]


We assessed potential memorization by computing nearest-neighbor distances between synthetic and real customers in the normalized feature space. The minimum distance from any synthetic customer to the closest real customer was 0.115, while typical distances ranged from 0.14 to 0.23. In contrast, distances between distinct real customers ranged from 0.08 to 0.28. No synthetic point was an almost-exact copy of a real record (0 samples within 0.001 distance), indicating that the GQE does not simply reproduce training records but generates novel, population-consistent customer profiles.