In [None]:
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# modelling
from sklearn.model_selection import train_test_split
from sklearn.exceptions import UndefinedMetricWarning

# generators
from imblearn.over_sampling import SMOTE

from anonymeter.evaluators import LinkabilityEvaluator

# custom
from datasets import IMB_DATASETS, load_data, prepare_data


In [2]:
# Suppress these warning categories for the rest of the session so the
# experiment output stays readable. Filters are registered in the listed order.
for _ignored_category in (FutureWarning, UndefinedMetricWarning, UserWarning):
    warnings.simplefilter("ignore", category=_ignored_category)

In [3]:
# Seed NumPy's global RNG so repeated runs draw the same random numbers.
# The column permutations in the evaluation loop use np.random directly.
# NOTE(review): train_test_split and SMOTE are called without random_state and
# presumably fall back to this global RNG as well -- confirm.
SEED = 42
np.random.seed(SEED)

In [4]:
# --- SMOTE settings (passed as k_neighbors / sampling_strategy) ---
# number of nearest neighbors used in SMOTE
K = 5
SAMPLING_STRATEGY = 1

# --- repetition counts ---
# SMOTE is refit N_GEN_FITS times per dataset; every fit is then attacked
# N_GEN_FITS * N_CLF_FITS times by the linkability evaluator.
N_CLF_FITS = 5
N_GEN_FITS = 5

In [None]:
# For every imbalanced dataset: oversample the minority class with SMOTE and
# measure how linkable the synthetic minority samples are to the real minority
# training records, using held-out minority records as the control set.
columns = ["dataset", "link_risk"]
# One [dataset, "mean +- std"] row per dataset; the DataFrame is built once at
# the end instead of being re-concatenated on every iteration.
score_rows = []

for data_name in tqdm(IMB_DATASETS, leave=False):
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y)

    # The evaluator addresses features by column name, so label them "0".."d-1".
    d = X.shape[1]
    cols = [str(i) for i in range(d)]

    # Real minority-class (label 1) records. They do not depend on the SMOTE
    # fit, so they are selected and framed once per dataset.
    X_real_train = X_train[(y_train == 1)]
    y_real_train = y_train[(y_train == 1)]
    assert np.all(y_real_train == 1)
    X_real_test = X_test[(y_test == 1)]
    y_real_test = y_test[(y_test == 1)]
    assert np.all(y_real_test == 1)
    ori_df = pd.DataFrame(X_real_train, columns=cols)
    control_df = pd.DataFrame(X_real_test, columns=cols)

    risks = []
    for _ in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X_train, y_train)

        # Rows past the original training length are taken as the generated
        # samples; the assert checks they all carry the minority label.
        X_synthetic = X_augmented[len(y_train):]
        y_synthetic = y_augmented[len(y_train):]
        assert np.all(y_synthetic == 1)
        syn_df = pd.DataFrame(X_synthetic, columns=cols)

        for _attack_round in range(N_GEN_FITS * N_CLF_FITS):
            # Randomly split the features into the two auxiliary column sets
            # that the linkability attack tries to link.
            mixed_cols = list(np.random.permutation(cols))
            aux_cols = [mixed_cols[:d//2], mixed_cols[d//2:]]

            evaluator = LinkabilityEvaluator(ori=ori_df,
                                             syn=syn_df,
                                             control=control_df,
                                             n_attacks=10,
                                             aux_cols=aux_cols,
                                             n_neighbors=1)

            evaluator.evaluate()
            # NOTE(review): [1][0] takes the first entry of the second field of
            # the result -- presumably the lower confidence bound rather than
            # the point estimate; confirm this is the intended statistic.
            risk = evaluator.risk()[1][0]

            risks.append(risk)

    # Format mean/std once and reuse the text for both the table and the log line.
    risks_arr = np.array(risks)
    risk_summary = f"{risks_arr.mean():.4f} +- {risks_arr.std():.4f}"
    score_rows.append([f"{data_name}", risk_summary])
    print(f"{data_name}: link risk -- {risk_summary}")

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# scores_df.to_csv("results/synth/linkability.csv", index=False)
    

  0%|          | 0/8 [00:00<?, ?it/s]

ecoli: link risk -- 0.2623 +- 0.1659
yeast_me2: link risk -- 0.2180 +- 0.1577
solar_flare_m0: link risk -- 0.0358 +- 0.0813
abalone: link risk -- 0.2526 +- 0.1677
car_eval_34: link risk -- 0.0077 +- 0.0382
car_eval_4: link risk -- 0.0114 +- 0.0573
mammography: link risk -- 0.2292 +- 0.1420
abalone_19: link risk -- 0.3268 +- 0.1596
          dataset         link_risk
0           ecoli  0.2623 +- 0.1659
1       yeast_me2  0.2180 +- 0.1577
2  solar_flare_m0  0.0358 +- 0.0813
3         abalone  0.2526 +- 0.1677
4     car_eval_34  0.0077 +- 0.0382
5      car_eval_4  0.0114 +- 0.0573
6     mammography  0.2292 +- 0.1420
7      abalone_19  0.3268 +- 0.1596
