In [1]:
import pickle
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTE

# custom
from datasets import IMB_DATASETS, load_data, prepare_data
from attacks import smote_reconstruction_attack, calculate_scores

In [2]:
# Keep FutureWarning messages out of the experiment logs printed below.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Render wide DataFrames on one line per row instead of wrapping the columns.
pd.options.display.expand_frame_repr = False

In [3]:
# Seed NumPy's global RNG once so a fresh "Restart & Run All" repeats the same
# random draws. The SMOTE generators below are created without a random_state,
# so they presumably fall back to this global state -- TODO confirm.
np.random.seed(seed=42)

In [4]:
# Synthetic-to-minority ratios swept by the fixed-ratio experiment: for each
# value r, exactly r * (number of y == 1 samples) synthetic points are kept.
# NOTE(review): the name looks like a typo of SYNTH_RATIOS; it is referenced
# under this spelling further down, so any rename must be done in both places.
SYNTH_RATIONS = [1, 2, 5, 10, 20, 25, 50, 75, 100]
# Base number of repeated SMOTE fits per dataset. The fixed-ratio experiment
# uses it directly; the natural-imbalance experiment uses N_GEN_FITS * N_GEN_FITS.
N_GEN_FITS = 5

# number of nearest neighbors used in SMOTE
# (the same value is also handed to the reconstruction attack as `k`)
K = 5
# Passed to SMOTE as `sampling_strategy`; presumably the minority/majority
# ratio targeted after resampling, i.e. fully balanced -- TODO confirm.
SAMPLING_STRATEGY = 1

In [None]:
# Experiment 1: run the SMOTE reconstruction attack on every dataset at its
# natural imbalance, i.e. using exactly the synthetic samples that a single
# fit_resample call produces.
columns = ["dataset", "imbalance", "precision", "recall", "f1"]
# One pre-formatted row per dataset. The DataFrame is built once after the loop
# instead of being grown with pd.concat on every iteration (quadratic, and
# concatenating onto an empty frame is what newer pandas warns about).
score_rows = []

for data_name in IMB_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)
    # Number of samples labelled 1; denominator of the synthetic/minority ratio.
    n_y1 = (y == 1).sum()

    precs, recs, f1s = [], [], []
    # NOTE(review): this cell repeats the fit N_GEN_FITS * N_GEN_FITS times,
    # whereas the fixed-ratio experiment uses N_GEN_FITS -- confirm intended.
    for _ in range(N_GEN_FITS * N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # The slices assume fit_resample returns the original rows first and
        # appends the synthetic ones after them.
        X_synthetic = X_augmented[len(y):]
        y_synthetic = y_augmented[len(y):]
        # Synthetic samples generated per sample labelled 1.
        r = len(y_synthetic) / n_y1

        reconstructed_real_minority = smote_reconstruction_attack(X_synthetic, y_synthetic, k=K, r=r)
        precision, recall, f1 = calculate_scores(X, y, reconstructed_real_minority, exact_match=False, line_eps=1e-12)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # "mean +- std" over the repeated fits, formatted once per metric.
    summary = {
        metric: f"{np.mean(values):.4f} +- {np.std(values):.4f}"
        for metric, values in (("precision", precs), ("recall", recs), ("f1", f1s))
    }
    # `r` is the ratio observed in the last fit of this dataset.
    score_rows.append([f"{data_name}",
                       f"{r:.2f}",
                       summary["precision"],
                       summary["recall"],
                       summary["f1"]])
    print(f"{data_name}, s/n_1 {r:.2f}: f1 -- {summary['f1']}")

scores_df = pd.DataFrame(score_rows, columns=columns, dtype=str)
print(scores_df)
# scores_df.to_csv("results/synth/recon_smote.csv", index=False)

ecoli, s/n_1 7.60: f1 -- 0.6130 +- 0.0585
yeast_me2, s/n_1 27.10: f1 -- 0.9980 +- 0.0040
solar_flare_m0, s/n_1 18.43: f1 -- 0.9816 +- 0.0139
abalone, s/n_1 8.68: f1 -- 0.7650 +- 0.0137
car_eval_34, s/n_1 10.90: f1 -- 0.9138 +- 0.0122
car_eval_4, s/n_1 24.58: f1 -- 1.0000 +- 0.0000
mammography, s/n_1 41.01: f1 -- 1.0000 +- 0.0000
abalone_19, s/n_1 128.53: f1 -- 1.0000 +- 0.0000
          dataset imbalance         precision            recall                f1
0           ecoli      7.60  1.0000 +- 0.0000  0.4446 +- 0.0621  0.6130 +- 0.0585
1       yeast_me2     27.10  1.0000 +- 0.0000  0.9961 +- 0.0078  0.9980 +- 0.0040
2  solar_flare_m0     18.43  1.0000 +- 0.0000  0.9641 +- 0.0267  0.9816 +- 0.0139
3         abalone      8.68  1.0000 +- 0.0000  0.6196 +- 0.0180  0.7650 +- 0.0137
4     car_eval_34     10.90  1.0000 +- 0.0000  0.8415 +- 0.0206  0.9138 +- 0.0122
5      car_eval_4     24.58  1.0000 +- 0.0000  1.0000 +- 0.0000  1.0000 +- 0.0000
6     mammography     41.01  1.0000 +- 0.0000 

In [None]:
# Experiment 2: run the SMOTE reconstruction attack at fixed synthetic-to-
# minority ratios. For every ratio r, SMOTE output is accumulated until at
# least r * n_y1 synthetic samples exist and is then cut to exactly that many.
columns = ["dataset", "imbalance", "precision", "recall", "f1"]
dfs = {}  # ratio r -> scores DataFrame for that ratio


for r in SYNTH_RATIONS:
    # Rows for this ratio; the DataFrame is built once per ratio instead of
    # being grown with pd.concat inside the dataset loop.
    score_rows = []

    # NOTE(review): every dataset is reloaded and re-prepared for each ratio.
    # Hoisting this out of the ratio loop is only safe if load_data and
    # prepare_data are deterministic -- not verifiable from this notebook.
    for data_name in IMB_DATASETS:
        X, y = load_data(data_name)
        X, y = prepare_data(X, y)
        # Number of samples labelled 1.
        n_y1 = (y == 1).sum()
        # Exact number of synthetic samples handed to the attack for this ratio.
        n_target = r * n_y1

        precs, recs, f1s = [], [], []
        for _ in range(N_GEN_FITS):
            generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
            X_augmented, y_augmented = generator.fit_resample(X, y)

            # The slices assume fit_resample returns the original rows first
            # and appends the synthetic ones after them.
            X_synthetic = X_augmented[len(y):]
            y_synthetic = y_augmented[len(y):]

            # Top up with further resampling rounds until enough synthetic
            # samples exist. Strict "<" on integer counts: when the target is
            # already met exactly, no extra (discarded) SMOTE fit is run.
            # This loop would not terminate if a round yields no new samples.
            while len(y_synthetic) < n_target:
                X_augmented, y_augmented = generator.fit_resample(X, y)

                X_synthetic = np.concatenate((X_synthetic, X_augmented[len(y):]))
                y_synthetic = np.concatenate((y_synthetic, y_augmented[len(y):]))

            # Keep exactly r synthetic samples per sample labelled 1.
            X_synthetic = X_synthetic[:n_target]
            y_synthetic = y_synthetic[:n_target]

            reconstructed_real_minority = smote_reconstruction_attack(X_synthetic, y_synthetic, k=K, r=r)
            precision, recall, f1 = calculate_scores(X, y, reconstructed_real_minority, exact_match=False, line_eps=1e-12)
            precs.append(precision)
            recs.append(recall)
            f1s.append(f1)

        # "mean +- std" over the repeated fits, formatted once per metric.
        summary = {
            metric: f"{np.mean(values):.4f} +- {np.std(values):.4f}"
            for metric, values in (("precision", precs), ("recall", recs), ("f1", f1s))
        }
        score_rows.append([f"{data_name}",
                           f"{r:.2f}",
                           summary["precision"],
                           summary["recall"],
                           summary["f1"]])
        print(f"{data_name}, s/n_1 {r}: f1 -- {summary['f1']}")

    scores_df = pd.DataFrame(score_rows, columns=columns, dtype=str)
    dfs[r] = scores_df
    print(scores_df)


# with open("results/synth/recon_smote_imbalances.pickle", "wb") as handle:
#     pickle.dump(dfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

ecoli, s/n_1 1: f1 -- 0.0000 +- 0.0000
yeast_me2, s/n_1 1: f1 -- 0.0000 +- 0.0000
solar_flare_m0, s/n_1 1: f1 -- 0.0000 +- 0.0000
abalone, s/n_1 1: f1 -- 0.0000 +- 0.0000
car_eval_34, s/n_1 1: f1 -- 0.0000 +- 0.0000
car_eval_4, s/n_1 1: f1 -- 0.0000 +- 0.0000
mammography, s/n_1 1: f1 -- 0.0000 +- 0.0000
abalone_19, s/n_1 1: f1 -- 0.0000 +- 0.0000
          dataset imbalance         precision            recall                f1
0           ecoli      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
1       yeast_me2      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
2  solar_flare_m0      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
3         abalone      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
4     car_eval_34      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
5      car_eval_4      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.0000
6     mammography      1.00  0.0000 +- 0.0000  0.0000 +- 0.0000  0.0000 +- 0.