In [1]:
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTE

# custom
from datasets import IMB_DATASETS, load_data, prepare_data
from attacks import smote_detection_attack, smote_reconstruction_attack, calculate_scores

In [2]:
# Suppress FutureWarning messages globally so they do not clutter the cell outputs below.
warnings.simplefilter("ignore", category=FutureWarning)

In [3]:
# Seed NumPy's global (legacy) RNG so a fresh Restart-and-Run-All starts from the same state.
# NOTE(review): SMOTE is constructed without random_state further down; presumably it
# draws from this global RNG, which is what makes the reported scores repeatable -- confirm.
np.random.seed(42)

In [4]:
# --- SMOTE settings (forwarded to the SMOTE constructor in the experiment loop) ---
# number of nearest neighbors used in SMOTE; the same value is handed to the attack as k
K = 2
# passed unchanged as SMOTE(sampling_strategy=...)
SAMPLING_STRATEGY = 1

# --- Experiment settings ---
# how many times SMOTE is re-fitted per dataset; scores are reported as mean +- std over the fits
N_GEN_FITS = 5

In [5]:
# Per-dataset keyword overrides for smote_detection_attack. Each inner dict is
# unpacked with ** into the attack call, so its keys must match the attack's
# keyword names exactly -- the spelling "nbs_multiplte" is therefore kept as is.
# Datasets without an entry fall back to the attack's own defaults.
DATASET_CUSTOM_KWARGS = dict(
    ecoli={"nbs_multiplte": 3},
    yeast_me2={"nbs_multiplte": 3},
    solar_flare_m0={"nbs_multiplte": 5},  # there are duplicates
    abalone={"nbs_multiplte": 25},
    car_eval_34={"nbs_multiplte": 3},
    car_eval_4={"nbs_multiplte": 3},
    mammography={"nbs_multiplte": 5},
    abalone_19={"nbs_multiplte": 3},
    # diabetes={"use_hull": False, "nbs_multiplte": 5},  # there is categorical data
    # phoneme={"use_hull": False, "nbs_multiplte": 10},  # there are duplicates
)

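# NOTE(review): the nbs_multiplte values in the notes below differ from the ones set in
# DATASET_CUSTOM_KWARGS for most datasets (e.g. ecoli: 2 vs 3, abalone: 10 vs 25,
# mammography: 2 vs 5) -- confirm which settings these counts were obtained with.
# ecoli (nbs_multiplte=2) -- 35/35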
# yeast_me2 (nbs_multiplte=2)  -- 51/51
# solar_flare_m0 (nbs_multiplte=5) -- 58/58 (there are duplicates)
# abalone (nbs_multiplte=10) -- 391/391
# car_eval_34 (nbs_multiplte=2) -- 134/134
# car_eval_4 (nbs_multiplte=2) -- 65/65
# mammography (nbs_multiplte=2) -- 254/254 (there are duplicates)
# abalone_19 (nbs_multiplte=2) -- 32/32

# diabetes (nbs_multiplte=5) -- 268/268 (if we remove the hull; there's categorical data)
# phoneme (nbs_multiplte=10) -- 1560/1560 (if we remove the hull; there are duplicates)

In [6]:
# Experiment: detection attack on SMOTE-augmented data.
# For every dataset, SMOTE is fitted N_GEN_FITS times. After each fit,
# smote_detection_attack is run on the augmented data and its result is scored
# against the original (X, y) with calculate_scores. Each metric is reported
# as "mean +- std" over the fits.
columns = ["dataset", "precision", "recall", "f1"]
# One formatted row per dataset; the DataFrame is built once after the loop
# instead of being grown with pd.concat on every iteration.
score_rows = []

for data_name in IMB_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # Per-dataset keyword overrides for the attack (empty dict when none are configured).
    attack_kwargs = DATASET_CUSTOM_KWARGS.get(data_name, {})

    precs, recs, f1s = [], [], []
    for _ in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Number of label-1 samples after augmentation divided by the number before it.
        r = (y_augmented == 1).sum() / (y == 1).sum()

        detected_real_minority = smote_detection_attack(
            X_augmented, y_augmented, k=K, r=r, **attack_kwargs
        )
        precision, recall, f1 = calculate_scores(X, y, detected_real_minority)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # Format each metric once and reuse the strings for both the table row and the progress line.
    prec_summary, rec_summary, f1_summary = (
        f"{np.array(values).mean():.4f} +- {np.array(values).std():.4f}"
        for values in (precs, recs, f1s)
    )
    score_rows.append([f"{data_name}", prec_summary, rec_summary, f1_summary])
    print(f"{data_name}: f1 -- {f1_summary}")

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# scores_df.to_csv("results/augment/detect_smote.csv", index=False)
    

ecoli: f1 -- 1.0000 +- 0.0000
yeast_me2: f1 -- 1.0000 +- 0.0000
solar_flare_m0: f1 -- 1.0000 +- 0.0000
abalone: f1 -- 1.0000 +- 0.0000
car_eval_34: f1 -- 1.0000 +- 0.0000
car_eval_4: f1 -- 1.0000 +- 0.0000
mammography: f1 -- 1.0000 +- 0.0000
abalone_19: f1 -- 1.0000 +- 0.0000
          dataset         precision            recall                f1
0           ecoli  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
1       yeast_me2  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
2  solar_flare_m0  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
3         abalone  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
4     car_eval_34  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
5      car_eval_4  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
6     mammography  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
7      abalone_19  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000


In [7]:
# Reset NumPy's global RNG to the same seed used before the first experiment, so the
# cells below start from a known RNG state regardless of how many draws were consumed earlier.
np.random.seed(42)

In [8]:
# Settings for the reconstruction experiment. N_GEN_FITS, K and SAMPLING_STRATEGY
# are re-assigned here with the same values as in the earlier configuration cell.

# number of nearest neighbors used in SMOTE; the same value is handed to the attack as k
K = 2
# passed unchanged as SMOTE(sampling_strategy=...)
SAMPLING_STRATEGY = 1

# how many times SMOTE is re-fitted per dataset; scores are reported as mean +- std over the fits
N_GEN_FITS = 5

# forwarded as the inter_multiple keyword of smote_reconstruction_attack
# NOTE(review): its exact meaning is defined in the attacks module -- not visible here
inter_multiple = 2

In [9]:
# Experiment: reconstruction attack that only sees the SMOTE-generated samples.
# For every dataset, SMOTE is fitted N_GEN_FITS times. After each fit, the rows
# appended beyond the original data are passed to smote_reconstruction_attack,
# and its result is scored against the original (X, y) with calculate_scores
# (exact_match=False, line_eps=1e-12). Each metric is reported as "mean +- std"
# over the fits.
columns = ["dataset", "imbalance", "precision", "recall", "f1"]
# One formatted row per dataset; the DataFrame is built once after the loop
# instead of being grown with pd.concat on every iteration.
score_rows = []

for data_name in IMB_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    precs, recs, f1s = [], [], []
    for _ in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Everything past the first len(y) rows is treated as the synthetic part.
        n_original = len(y)
        X_synthetic = X_augmented[n_original:]
        y_synthetic = y_augmented[n_original:]
        # Synthetic samples per original label-1 sample (printed below as "s/n_1").
        r = len(y_synthetic) / (y == 1).sum()

        reconstructed_real_minority = smote_reconstruction_attack(
            X_synthetic, y_synthetic, k=K, r=r, inter_multiple=inter_multiple
        )
        precision, recall, f1 = calculate_scores(
            X, y, reconstructed_real_minority, exact_match=False, line_eps=1e-12
        )
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # Format each metric once and reuse the strings for both the table row and the progress line.
    prec_summary, rec_summary, f1_summary = (
        f"{np.array(values).mean():.4f} +- {np.array(values).std():.4f}"
        for values in (precs, recs, f1s)
    )
    # The "imbalance" column holds r as computed in the last fit of this dataset.
    score_rows.append([f"{data_name}", f"{r:.2f}", prec_summary, rec_summary, f1_summary])
    print(f"{data_name}, s/n_1 {r:.2f}: f1 -- {f1_summary}")

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# scores_df.to_csv("results/synth/recon_smote.csv", index=False)

ecoli, s/n_1 7.60: f1 -- 0.6035 +- 0.0547
yeast_me2, s/n_1 27.10: f1 -- 0.7343 +- 0.0161
solar_flare_m0, s/n_1 18.43: f1 -- 0.5710 +- 0.0235
abalone, s/n_1 8.68: f1 -- 0.5898 +- 0.0163
car_eval_34, s/n_1 10.90: f1 -- 0.7569 +- 0.0112
car_eval_4, s/n_1 24.58: f1 -- 0.7500 +- 0.0000
mammography, s/n_1 41.01: f1 -- 0.7567 +- 0.0053
abalone_19, s/n_1 128.53: f1 -- 0.6610 +- 0.0113
          dataset imbalance         precision            recall  \
0           ecoli      7.60  1.0000 +- 0.0000  0.4343 +- 0.0554   
1       yeast_me2     27.10  1.0000 +- 0.0000  0.5804 +- 0.0200   
2  solar_flare_m0     18.43  1.0000 +- 0.0000  0.4000 +- 0.0229   
3         abalone      8.68  1.0000 +- 0.0000  0.4184 +- 0.0164   
4     car_eval_34     10.90  1.0000 +- 0.0000  0.6090 +- 0.0146   
5      car_eval_4     24.58  1.0000 +- 0.0000  0.6000 +- 0.0000   
6     mammography     41.01  1.0000 +- 0.0000  0.6087 +- 0.0069   
7      abalone_19    128.53  1.0000 +- 0.0000  0.4938 +- 0.0125   

                