In [1]:
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTE

# custom
from datasets import IMB_DATASETS, load_data, prepare_data
from attacks import smote_detection_attack, calculate_scores

In [2]:
# Suppress every FutureWarning for the rest of the session so the per-dataset
# progress lines printed below are not buried under deprecation notices.
# NOTE(review): this is a blanket filter -- it also hides FutureWarnings raised
# by this notebook's own code; consider narrowing it with `module=`.
warnings.simplefilter("ignore", category=FutureWarning)

In [3]:
# Seed NumPy's global RNG once, up front. The SMOTE generators below are built
# without an explicit `random_state`, so (per scikit-learn's `check_random_state`
# convention) they presumably draw from this global state -- re-running the
# notebook from a fresh kernel should then reproduce the same synthetic samples.
np.random.seed(42)

In [4]:
# Number of independent SMOTE fits per dataset; the reported scores are the
# mean and standard deviation over these repetitions.
N_GEN_FITS = 25

# number of nearest neighbors used in SMOTE
# (the same value is handed to the detection attack as `k`)
K = 5
# Forwarded to SMOTE as `sampling_strategy`.
# NOTE(review): per the imbalanced-learn docs a numeric value is the target
# minority/majority ratio after resampling, so 1 should fully balance the
# classes -- confirm against the installed version.
SAMPLING_STRATEGY = 1

In [5]:
# Per-dataset keyword overrides that are splatted into `smote_detection_attack`.
# Datasets missing from this mapping get no extra keyword arguments.
# The key spelling "nbs_multiplte" must be kept as-is: it is used verbatim as
# the keyword-argument name of the attack function.
DATASET_CUSTOM_KWARGS = {
    dataset_name: {"nbs_multiplte": multiple}
    for dataset_name, multiple in (
        ("solar_flare_m0", 5),  # there are duplicates
        ("abalone", 10),
        ("abalone_19", 3),
    )
}
# Disabled entries (they additionally need `use_hull=False`):
#   "diabetes": {"use_hull": False, "nbs_multiplte": 5}   -- there is categorical data
#   "phoneme":  {"use_hull": False, "nbs_multiplte": 10}  -- there are duplicates

# ecoli (nbs_multiplte=2) -- 35/35
# yeast_me2 (nbs_multiplte=2)  -- 51/51
# solar_flare_m0 (nbs_multiplte=5) -- 58/58 (there are duplicates)
# abalone (nbs_multiplte=10) -- 391/391
# car_eval_34 (nbs_multiplte=2) -- 134/134
# car_eval_4 (nbs_multiplte=2) -- 65/65
# mammography (nbs_multiplte=2) -- 254/254 (there are duplicates)
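# abalone_19 (nbs_multiplte=2) -- 32/32  (NOTE: DATASET_CUSTOM_KWARGS above currently sets nbs_multiplte=3 for this dataset -- confirm which value is intended)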

# diabetes (nbs_multiplte=5) -- 268/268 (if we remove the hull; there's categorical data)
# phoneme (nbs_multiplte=10) -- 1560/1560 (if we remove the hull; there are duplicates)

In [None]:
# Evaluate the SMOTE detection attack on every imbalanced dataset.
#
# For each dataset, SMOTE is refit N_GEN_FITS times; after each fit the attack
# tries to pick out the real minority samples from the augmented data, and the
# result is scored against the original (X, y). Per-dataset precision / recall /
# F1 are reported as "mean +- std" over the repetitions.
columns = ["dataset", "precision", "recall", "f1"]


def _mean_std(values):
    """Format a sequence of scores as 'mean +- std' with four decimals."""
    values = np.asarray(values)
    return f"{values.mean():.4f} +- {values.std():.4f}"


# Collect one row per dataset and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop is quadratic and concatenating onto
# an empty frame is deprecated in recent pandas.
score_rows = []

for data_name in IMB_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # Dataset-specific overrides for the attack (empty when none are configured).
    attack_kwargs = DATASET_CUSTOM_KWARGS.get(data_name, {})

    precs, recs, f1s = [], [], []
    for _ in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Growth factor of the class labelled 1: count after augmentation
        # divided by the count in the original data.
        r = (y_augmented == 1).sum() / (y == 1).sum()

        detected_real_minority = smote_detection_attack(
            X_augmented, y_augmented, k=K, r=r, **attack_kwargs
        )
        precision, recall, f1 = calculate_scores(X, y, detected_real_minority)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    f1_summary = _mean_std(f1s)
    score_rows.append([f"{data_name}", _mean_std(precs), _mean_std(recs), f1_summary])
    print(f"{data_name}: f1 -- {f1_summary}")

scores_df = pd.DataFrame(score_rows, columns=columns)

print(scores_df)
# scores_df.to_csv("results/augment/detect_smote.csv", index=False)
    

ecoli: f1 -- 1.0000 +- 0.0000
yeast_me2: f1 -- 1.0000 +- 0.0000
solar_flare_m0: f1 -- 1.0000 +- 0.0000
abalone: f1 -- 0.9993 +- 0.0024
car_eval_34: f1 -- 1.0000 +- 0.0000
car_eval_4: f1 -- 1.0000 +- 0.0000
mammography: f1 -- 0.9976 +- 0.0030
abalone_19: f1 -- 0.9954 +- 0.0224
          dataset         precision            recall                f1
0           ecoli  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
1       yeast_me2  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
2  solar_flare_m0  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
3         abalone  0.9988 +- 0.0042  0.9998 +- 0.0007  0.9993 +- 0.0024
4     car_eval_34  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
5      car_eval_4  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
6     mammography  0.9952 +- 0.0059  1.0000 +- 0.0000  0.9976 +- 0.0030
7      abalone_19  0.9926 +- 0.0361  0.9988 +- 0.0061  0.9954 +- 0.0224
