In [1]:
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# modelling
from sklearn.metrics import roc_auc_score
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning

# custom
from datasets import IMB_DATASETS, load_data, prepare_data
from attacks import membership_attack

In [2]:
# Suppress three warning categories for the rest of the session: the builtin
# FutureWarning plus scikit-learn's ConvergenceWarning and UndefinedMetricWarning.
for _ignored_category in (FutureWarning, ConvergenceWarning, UndefinedMetricWarning):
    warnings.simplefilter("ignore", category=_ignored_category)

In [3]:
# Seed NumPy's legacy global RNG. This only affects code that draws from
# `np.random.*` directly; the target sampling further down builds its own
# `np.random.default_rng(seed=42)` generator and does not depend on this call.
np.random.seed(seed=42)

In [4]:
# Forwarded positionally (in this order) to `membership_attack` after the
# target index; their exact meaning is defined in attacks.py.
N_TRAIN, N_TEST = 0, 100

# Upper bound on how many minority-class targets are attacked per dataset;
# larger candidate sets are subsampled down to this size.
N_TARGETS = 100

# Passed as `max_iter` in the classifier kwargs (n epochs).
MAX_ITER = 25

# Passed as `k_neighbors` and `sampling_strategy` in the generator kwargs.
K, SAMPLING_STRATEGY = 5, 1

In [None]:
# Per-dataset lists of attack AUCs (one entry per attacked target), keyed by
# dataset name. Both dicts are read again by the summary cell below.
all_clf_aucs = {}
all_gen_clf_aucs = {}


for data_name in tqdm(IMB_DATASETS, leave=False):
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # Number of columns of X; reused as the width of both hidden layers.
    d = X.shape[1]

    # Candidate target indices are 0..n_minority-1, where n_minority counts the
    # samples labelled 1. How `membership_attack` maps such an index to a sample
    # is defined in attacks.py (not visible here).
    n_minority = (y == 1).sum()
    if n_minority > N_TARGETS:
        # Too many candidates: draw N_TARGETS of them without replacement.
        # The generator is re-created with the same fixed seed for every dataset.
        rng = np.random.default_rng(seed=42)
        minority_targets_idx = rng.choice(range(n_minority), size=N_TARGETS, replace=False)
    else:
        minority_targets_idx = range(n_minority)

    clf_aucs, gen_clf_aucs = [], []
    for minority_target_idx in tqdm(minority_targets_idx, leave=False):
        # First attack: attack_mode="clf", classifier kwargs only.
        target_labels, target_preds = membership_attack(
            X, y, minority_target_idx, N_TRAIN, N_TEST,
            attack_mode="clf",
            clf_kwargs={"max_iter": MAX_ITER, "hidden_layer_sizes": (d, d)},
        )
        clf_aucs.append(roc_auc_score(target_labels, target_preds))

        # Second attack: attack_mode="aug+clf", same classifier kwargs (built
        # fresh for each call) plus the generator kwargs.
        target_labels, target_preds = membership_attack(
            X, y, minority_target_idx, N_TRAIN, N_TEST,
            attack_mode="aug+clf",
            clf_kwargs={"max_iter": MAX_ITER, "hidden_layer_sizes": (d, d)},
            gen_kwargs={"k_neighbors": K, "sampling_strategy": SAMPLING_STRATEGY},
        )
        gen_clf_aucs.append(roc_auc_score(target_labels, target_preds))

    # Keep the full per-target AUC lists for the summary tables.
    all_clf_aucs[data_name] = clf_aucs
    all_gen_clf_aucs[data_name] = gen_clf_aucs

    # Quick progress report: np.sort is ascending, so the last 10 entries are
    # the 10 highest AUCs (or all of them when fewer than 10 targets exist).
    aucs_clf_10 = np.sort(clf_aucs)[-10:]
    print(f"{data_name}: top 10 clf aucs -- {aucs_clf_10.mean():.4f} +- {aucs_clf_10.std():.4f}")

    aucs_gen_clf_10 = np.sort(gen_clf_aucs)[-10:]
    print(f"{data_name}: top 10 aug+clf aucs -- {aucs_gen_clf_10.mean():.4f} +- {aucs_gen_clf_10.std():.4f}")


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

ecoli: top 10 clf aucs -- 0.5497 +- 0.0134
ecoli: top 10 aug+clf aucs -- 0.5597 +- 0.0220


  0%|          | 0/51 [00:00<?, ?it/s]

yeast_me2: top 10 clf aucs -- 0.5619 +- 0.0231
yeast_me2: top 10 aug+clf aucs -- 0.7059 +- 0.0663


  0%|          | 0/68 [00:00<?, ?it/s]

solar_flare_m0: top 10 clf aucs -- 0.8326 +- 0.0169
solar_flare_m0: top 10 aug+clf aucs -- 0.9993 +- 0.0009


  0%|          | 0/100 [00:00<?, ?it/s]

abalone: top 10 clf aucs -- 0.6053 +- 0.0196
abalone: top 10 aug+clf aucs -- 0.6532 +- 0.0279


  0%|          | 0/100 [00:00<?, ?it/s]

car_eval_34: top 10 clf aucs -- 0.6412 +- 0.0123
car_eval_34: top 10 aug+clf aucs -- 0.8428 +- 0.0281


  0%|          | 0/65 [00:00<?, ?it/s]

car_eval_4: top 10 clf aucs -- 0.6320 +- 0.0224
car_eval_4: top 10 aug+clf aucs -- 0.8948 +- 0.0335


  0%|          | 0/100 [00:00<?, ?it/s]

mammography: top 10 clf aucs -- 0.5835 +- 0.0209
mammography: top 10 aug+clf aucs -- 0.6328 +- 0.0269


  0%|          | 0/32 [00:00<?, ?it/s]

abalone_19: top 10 clf aucs -- 0.6301 +- 0.0175
abalone_19: top 10 aug+clf aucs -- 0.9380 +- 0.0431


In [None]:
# Summary tables: for each cut-off `top`, report mean +- std of the `top`
# highest per-target AUCs of every dataset, for both attack modes.
columns = ["dataset", "clf_auc", "aug+clf_auc"]

for top in [10, 50, 100]:

    # Collect plain rows and build the DataFrame once per table; growing a frame
    # with pd.concat inside the loop re-copies it on every iteration.
    score_rows = []

    for data_name in IMB_DATASETS:

        # np.sort is ascending, so the last `top` entries are the highest AUCs.
        # A dataset with fewer than `top` targets contributes all of its targets.
        aucs_clf_top = np.sort(all_clf_aucs[data_name])[-top:]
        aucs_gen_clf_top = np.sort(all_gen_clf_aucs[data_name])[-top:]

        data_scores = [f"{data_name}",
                    f"{aucs_clf_top.mean():.4f} +- {aucs_clf_top.std():.4f}",
                    f"{aucs_gen_clf_top.mean():.4f} +- {aucs_gen_clf_top.std():.4f}"
                    ]
        score_rows.append(data_scores)

    scores_df = pd.DataFrame(score_rows, columns=columns)

    print(scores_df)
    # Uncomment to persist the table for this cut-off:
    # scores_df.to_csv(f"results/augment/mia_{top}.csv", index=False)

          dataset           clf_auc       aug+clf_auc
0           ecoli  0.5497 +- 0.0134  0.5597 +- 0.0220
1       yeast_me2  0.5619 +- 0.0231  0.7059 +- 0.0663
2  solar_flare_m0  0.8326 +- 0.0169  0.9993 +- 0.0009
3         abalone  0.6053 +- 0.0196  0.6532 +- 0.0279
4     car_eval_34  0.6412 +- 0.0123  0.8428 +- 0.0281
5      car_eval_4  0.6320 +- 0.0224  0.8948 +- 0.0335
6     mammography  0.5835 +- 0.0209  0.6328 +- 0.0269
7      abalone_19  0.6301 +- 0.0175  0.9380 +- 0.0431
          dataset           clf_auc       aug+clf_auc
0           ecoli  0.5020 +- 0.0397  0.4999 +- 0.0492
1       yeast_me2  0.5094 +- 0.0367  0.5672 +- 0.0850
2  solar_flare_m0  0.7892 +- 0.0294  0.9690 +- 0.0284
3         abalone  0.5651 +- 0.0255  0.5820 +- 0.0419
4     car_eval_34  0.5959 +- 0.0280  0.7331 +- 0.0751
5      car_eval_4  0.5870 +- 0.0298  0.7497 +- 0.0985
6     mammography  0.5386 +- 0.0282  0.5632 +- 0.0411
7      abalone_19  0.5825 +- 0.0450  0.7956 +- 0.1166
          dataset           