In [1]:
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTE

# custom
from datasets import BIG_DATASETS, load_data, prepare_data
from attacks import smote_detection_attack, smote_reconstruction_attack, calculate_scores

In [2]:
warnings.simplefilter("ignore", category=FutureWarning)

In [3]:
# Sanity check: for every dataset print its name, the shape of the prepared
# feature matrix and the fraction of samples labelled 1.
for data_name in BIG_DATASETS:
    X, y = prepare_data(*load_data(data_name))
    class_1_share = pd.Series(y).value_counts(normalize=True)[1]
    print(data_name, X.shape, class_1_share)

higgs (47974, 28) 0.03702005252845291
MiniBooNE (96690, 50) 0.03703588788913021


In [4]:
# Seed NumPy's global RNG before the detection experiment. SMOTE is created
# below without an explicit random_state, so it presumably draws from this
# global state -- verify against the imbalanced-learn version in use.
np.random.seed(seed=42)

In [5]:
# --- SMOTE settings -------------------------------------------------------
# number of nearest neighbours SMOTE uses (passed as k_neighbors) and that
# the attack is told about (passed as k)
K = 5
# passed to SMOTE as sampling_strategy
SAMPLING_STRATEGY = 1

# --- experiment settings --------------------------------------------------
# how many times SMOTE is re-fit per dataset; scores are reported as
# mean +- std over these fits
N_GEN_FITS = 3

In [6]:
# Extra keyword arguments forwarded to smote_detection_attack, keyed by
# dataset name; datasets without an entry get no extra arguments.
# NOTE(review): "nbs_multiplte" looks like a misspelling, but the key must
# match the parameter name declared in attacks.smote_detection_attack --
# confirm there before renaming.
DATASET_CUSTOM_KWARGS = {
    dataset: {"nbs_multiplte": 3} for dataset in ("higgs", "MiniBooNE")
}

In [7]:
# Detection experiment.
# For every dataset, SMOTE is fitted N_GEN_FITS times; each augmented dataset
# is handed to smote_detection_attack and the samples it returns are scored
# against the original (X, y) with calculate_scores. Per-dataset scores are
# reported as "mean +- std" over the fits.
columns = ["dataset", "precision", "recall", "f1"]

# One record per dataset; the DataFrame is built once at the end instead of
# being grown with pd.concat inside the loop (quadratic, and concatenating
# onto an empty frame is deprecated in recent pandas).
score_rows = []

for data_name in BIG_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    precs, recs, f1s = [], [], []
    for i in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # class-1 count after augmentation relative to the class-1 count before
        r = (y_augmented == 1).sum() / (y == 1).sum()

        detected_real_minority = smote_detection_attack(
            X_augmented,
            y_augmented,
            k=K,
            r=r,
            **DATASET_CUSTOM_KWARGS.get(data_name, {}),
        )
        precision, recall, f1 = calculate_scores(X, y, detected_real_minority)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # "mean +- std" over the fits, formatted once per metric
    summary = {
        metric: f"{np.array(values).mean():.4f} +- {np.array(values).std():.4f}"
        for metric, values in (("precision", precs), ("recall", recs), ("f1", f1s))
    }
    score_rows.append({"dataset": f"{data_name}", **summary})
    print(f"{data_name}: f1 -- {summary['f1']}")

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# persisting the table is intentionally disabled:
# scores_df.to_csv("results/augment/detect_smote.csv", index=False)
    

higgs: f1 -- 1.0000 +- 0.0000
MiniBooNE: f1 -- 1.0000 +- 0.0000
     dataset         precision            recall                f1
0      higgs  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
1  MiniBooNE  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000


In [8]:
# Re-seed NumPy's global RNG so the reconstruction experiment starts from the
# same RNG state regardless of what ran before it. SMOTE is created below
# without an explicit random_state, so it presumably draws from this global
# state -- verify against the imbalanced-learn version in use.
np.random.seed(seed=42)

In [9]:
# Settings for the reconstruction experiment.

# --- SMOTE settings -------------------------------------------------------
# number of nearest neighbours SMOTE uses (passed as k_neighbors) and that
# the attack is told about (passed as k)
K = 5
# passed to SMOTE as sampling_strategy
SAMPLING_STRATEGY = 1

# --- experiment settings --------------------------------------------------
# how many times SMOTE is re-fit per dataset; scores are reported as
# mean +- std over these fits
N_GEN_FITS = 3

In [10]:
# Reconstruction experiment.
# For every dataset, SMOTE is fitted N_GEN_FITS times; only the rows that
# fit_resample added are handed to smote_reconstruction_attack, and the
# samples it returns are scored against the original (X, y) with
# calculate_scores (approximate matching, line_eps=1e-12). Per-dataset scores
# are reported as "mean +- std" over the fits.
columns = ["dataset", "imbalance", "precision", "recall", "f1"]

# One record per dataset; the DataFrame is built once at the end instead of
# being grown with pd.concat inside the loop (quadratic, and concatenating
# onto an empty frame is deprecated in recent pandas).
score_rows = []

for data_name in BIG_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # Reset per dataset: r is reported after the inner loop, so without this
    # an empty run (N_GEN_FITS == 0) would raise NameError or silently reuse
    # the previous dataset's ratio.
    r = float("nan")
    precs, recs, f1s = [], [], []
    for i in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Rows past the original length are taken to be the generated ones;
        # this relies on fit_resample appending new samples after the input.
        X_synthetic = X_augmented[len(y):]
        y_synthetic = y_augmented[len(y):]
        # number of synthetic samples per original class-1 sample
        r = len(y_synthetic) / (y == 1).sum()

        reconstructed_real_minority = smote_reconstruction_attack(
            X_synthetic, y_synthetic, k=K, r=r
        )
        precision, recall, f1 = calculate_scores(
            X, y, reconstructed_real_minority, exact_match=False, line_eps=1e-12
        )
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # "mean +- std" over the fits, formatted once per metric
    summary = {
        metric: f"{np.array(values).mean():.4f} +- {np.array(values).std():.4f}"
        for metric, values in (("precision", precs), ("recall", recs), ("f1", f1s))
    }
    # the "imbalance" column holds r from the last fit of this dataset
    score_rows.append({"dataset": f"{data_name}", "imbalance": f"{r:.2f}", **summary})
    print(f"{data_name}, s/n_1 {r:.2f}: f1 -- {summary['f1']}")

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# persisting the table is intentionally disabled:
# scores_df.to_csv("results/synth/recon_smote.csv", index=False)

higgs, s/n_1 25.01: f1 -- 0.9983 +- 0.0002
MiniBooNE, s/n_1 25.00: f1 -- 0.9983 +- 0.0005
     dataset imbalance         precision            recall                f1
0      higgs     25.01  1.0000 +- 0.0000  0.9966 +- 0.0005  0.9983 +- 0.0002
1  MiniBooNE     25.00  1.0000 +- 0.0000  0.9966 +- 0.0010  0.9983 +- 0.0005
