# Application of correlation attacks: column-wise, row-wise, and integrated)

This notebook demonstrates how to embed a universal fingerprint into the Adult dataset, then conduct and evaluate 3 types of attacks: column-wise, row-wise, and their integration, by measuring how many fingerprint bits each attack corrupts (robustness) and how much data utility is lost.

In [1]:
from pathlib import Path
import sys

repo_root = Path().resolve().parents[1]
sys.path.insert(0, str(repo_root))

results_dir = Path().resolve().parents[1] / 'correlation_attacks' / 'results'
results_dir.mkdir(exist_ok=True)

import pandas as pd
from numpy.linalg import norm
import numpy as np
from tqdm.auto import tqdm
from scheme._universal import Universal
import importlib
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import gensim.downloader as api
from scipy.cluster.hierarchy import linkage, leaves_list
from itertools import combinations
import warnings
warnings.filterwarnings("ignore")
import correlation_attacks.src.attacks2 as attacks2

importlib.reload(attacks2)

from correlation_attacks.src.attacks2 import (
    compute_joint_distribution,
    column_wise_attack,
    compute_row_similarity,
    row_wise_attack,
)

### Preparation (incl. Preprocessing)

Loading and preprocessesing the Adult Census data, semantically encoding “education” and “marital‐status”,  factorizing other categoricals, and binning numeric columns, then embeds a universal fingerprint.
Also here we compute pairwise joint distributions for column‐wise attacks and K‐means communities with their exponential Hamming‐based row similarities for row‐wise attacks, and also set the column and row attack thresholds.

In [None]:
df = pd.read_csv('../../datasets/adult_train_id.csv').iloc[:,1:]
#df = df.iloc[:1000].copy() # to test a smaller subset
wv = api.load('glove-wiki-gigaword-100')  

def embed_label(label: str):
    toks = label.replace('-', ' ').split()
    vecs = [wv[t] for t in toks if t in wv]
    if not vecs:
        return np.zeros(wv.vector_size)
    return np.mean(vecs, axis=0)

def make_semantic_codes(series: pd.Series) -> pd.Series:
    cats = series.unique().tolist()
    embeds = np.vstack([embed_label(c) for c in cats])
    Z = linkage(embeds, method='average', metric='euclidean')
    order = leaves_list(Z)
    ordered = [cats[i] for i in order]
    code_map = {cat: idx for idx, cat in enumerate(ordered)}
    return series.map(code_map)

df['education']      = make_semantic_codes(df['education'])
df['marital-status'] = make_semantic_codes(df['marital-status'])

nominals = ['workclass','occupation','relationship','race','sex','native-country','income']
for col in nominals:
    df[col], _ = pd.factorize(df[col], sort=True)

age_bins = [17, 26, 36, 46, 56, 66, df['age'].max() + 1]
df['age'] = pd.cut(
    df['age'],
    bins=age_bins,
    right=False,     # intervals are [17,26), [26,36)
    labels=False
).astype(int)

# hours-per-week: [0–20, 21–40, 41–60, 61–80, 81+]
hpw_bins = [0, 21, 41, 61, 81, df['hours-per-week'].max() + 1]
df['hours-per-week'] = pd.cut(
    df['hours-per-week'],
    bins=hpw_bins,
    right=False,
    labels=False
).astype(int)

# capital-gain and capital-loss
for col in ['capital-gain', 'capital-loss']:
    df[col] = (
        pd.qcut(df[col].rank(method='first'),
                q=4,
                labels=False,
                duplicates='drop')
          .astype(int)
    )

# fnlwgt into 5 quantile bins
df['fnlwgt'] = (
    pd.qcut(df['fnlwgt'].rank(method='first'),
            q=5,
            labels=False,
            duplicates='drop')
      .astype(int)
)

# final attribute list
attrs = df.columns.tolist()
#df_preprocessed.to_csv('adult_preprocessed.csv', index=False)

# embedding universal fingerprint
u = Universal(gamma=35, fingerprint_bit_length=128, xi=1)
print("Embedding fingerprint")
df_fp = u.insertion(
    dataset=df,
    recipient_id=7,
    secret_key=42
)

true_joint = {}
for p, q in tqdm(combinations(attrs, 2), desc="Joint dists"):
    true_joint[(p, q)] = compute_joint_distribution(df, p, q)


print("Building communities via KMeans")
X = df[attrs].values
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
labels = kmeans.labels_

clusters = {}
for c in range(10):
    members = np.where(labels == c)[0].tolist()
    for i in members:
        clusters[i] = [j for j in members if j != i]

print("Average cluster size:", 
      np.mean([len(members) for members in clusters.values()]))

print("Computing true row similarities")
true_sims = {}
for i, members in tqdm(clusters.items(), desc="Row sims"):
    for j in members:
        # raw Hamming distance
        dist_ij = (df.iloc[i] != df.iloc[j]).sum()
        sim = np.exp(-dist_ij)
        true_sims[(i, j)] = sim
        true_sims[(j, i)] = sim


tau_col = 1e-4
tau_row = 0.1
print(f"Using thresholds tau_col={tau_col}, tau_row={tau_row}")

Embedding fingerprint
Universal fingerprinting scheme - initialised.
Embedding started...
	gamma: 35
	fingerprint length: 128

Generated fingerprint for recipient 7: 01010011001110010101101110011111100110001000011111010111000100001101000001011010110000001111010110101011010101101110111101100110
Fingerprint inserted.
	marked tuples: ~2.95%
	single fingerprint bit embedded 7 times ("amount of redundancy")
Time: <1 sec.


Joint dists: 0it [00:00, ?it/s]

Building communities via KMeans
Average cluster size: 3975.48908203065
Computing true row similarities


Row sims:   0%|          | 0/32561 [00:00<?, ?it/s]

### Running the attacks

In [None]:
# running the attacks

def extract_df(obj):
    return obj.dataframe if hasattr(obj, 'dataframe') else obj

print("Running column-wise attack")
df_col = column_wise_attack(
    extract_df(df_fp).copy(),   
    true_joint,
    threshold=tau_col,
    pbar=tqdm
)


print("Running row-wise attack")
df_row = row_wise_attack(
    extract_df(df_fp).copy(),
    clusters,
    true_sims,
    threshold=tau_row,
    pbar=tqdm
)


In [None]:
# integrated wrapper
def integrated_attack(df_input, true_joint, clusters, true_sims, tau_row, tau_col, iters=1, pbar=None):
    df_curr = df_input
    for _ in range(iters):
        df_curr = row_wise_attack(df_curr, clusters, true_sims, threshold=tau_row, pbar=pbar)
        df_curr = column_wise_attack(df_curr, true_joint, threshold=tau_col, pbar=pbar)
    return df_curr

print("Running integrated attack")
df_int = integrated_attack(extract_df(df_fp).copy(), true_joint, clusters, true_sims, tau_row, tau_col, iters=1, pbar=tqdm)

In [None]:
# saving datasets
def extract_df(obj): return obj.dataframe if hasattr(obj,'dataframe') else obj

extract_df(df_col).to_csv(results_dir / 'adult_col_attacked2.csv', index=False)
extract_df(df_row).to_csv(results_dir / 'adult_row_attacked2.csv', index=False)
extract_df(df_int).to_csv(results_dir / 'adult_integrated_attacked2.csv', index=False)

print(f"Saved attacked datasets to {results_dir}")

### Evaluation

In [None]:
df_col = pd.read_csv('../results/adult_col_attacked2.csv')
df_row = pd.read_csv('../results/adult_row_attacked2.csv')
df_int = pd.read_csv('../results/adult_integrated_attacked2.csv')


def extract(a):
    return a.dataframe if hasattr(a, 'dataframe') else a

# robustness
def eval_match(a):
    orig_fp = u.create_fingerprint(recipient_id=7, secret_key=42)
    _ = u.detection(dataset=a, secret_key=42)
    cnts = u.detection_counts
    rec_bits = []
    for c0, c1 in cnts:
        if c1 > c0:
            rec_bits.append('1')
        elif c0 > c1:
            rec_bits.append('0')
        else:
            rec_bits.append('2') 
    rec_fp = ''.join(rec_bits)
    match = sum(o==r for o,r in zip(orig_fp,rec_fp)) / len(orig_fp)
    numcmp= sum(o!=r for o,r in zip(orig_fp,rec_fp))
    return match, numcmp

print("Robustness (match_rate, #compromised):")
for name, df_a in [('Column-wise',df_col),('Row-wise',df_row),('Integrated',df_int)]:
    print(f"{name}: {eval_match(df_a)}")

# utility
orig_df = df
print("\nUtility metrics:")
for name, atk in [('Column-wise',df_col),('Row-wise',df_row),('Integrated',df_int)]:
    d = extract(atk)
    # Acc
    Acc = 1 - (orig_df.values!=d.values).sum()/orig_df.size

    # Pcol: fraction of joint-cells where |J_emp-J_true| <= τ_col
    total_cells = 0; ok_cells = 0
    for (p,q), J_true in true_joint.items():
        J_emp = compute_joint_distribution(d,p,q).reindex(
                    index=J_true.index, columns=J_true.columns, fill_value=0)
        diffs = np.abs(J_emp.values - J_true.values)
        total_cells += diffs.size
        ok_cells    += (diffs <= tau_col).sum()
    Pcol = ok_cells/total_cells

    # Prow: fraction of row-pairs in communities where |s_emp-s_true| <= τ_row
    total_pairs = 0; ok_pairs = 0
    for i, members in clusters.items():
        for j in members:
            if i < j:
                # recompute s_emp
                dist_emp = (d.iloc[i] != d.iloc[j]).sum()
                s_emp     = np.exp(-dist_emp)
                total_pairs += 1
                if abs(s_emp - true_sims[(i,j)]) <= tau_row:
                    ok_pairs += 1
    Prow = ok_pairs/total_pairs

    # Pcov
    cov0 = np.cov(orig_df.values, rowvar=False)
    cov1 = np.cov(d.values,       rowvar=False)
    Pcov = 1 - norm(cov0-cov1,'fro')/norm(cov0,'fro')

    print(f"{name}: Acc={Acc:.3%}, Pcol={Pcol:.3%}, Prow={Prow:.3%}, Pcov={Pcov:.3%}")


In [None]:
import filecmp
if filecmp.cmp('../results/adult_col_attacked2.csv', '../results/adult_integrated_attacked2.csv', shallow=False):
    print("Files are identical (byte-for-byte).")
else:
    print("Files differ.")