In [1]:
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTE

# custom
from datasets import load_data, prepare_data
from attacks import smote_detection_attack, smote_reconstruction_attack, calculate_scores

In [2]:
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter('ignore', category=RuntimeWarning)

In [3]:
# Names of the datasets every experiment below iterates over.
DATASETS = [
    "yeast_me2",
]

# Noise magnitudes swept by the experiments. The same grid is used twice:
# as `smote_eps` (perturbation applied to the synthetic rows) and as
# `attack_eps` (the `line_eps` tolerance handed to the attack).
NOISES = [
    1e-10,
    1e-7,
    1e-5,
    1e-3,
]

In [4]:
# Sanity check: for each dataset print its name, the shape of the prepared
# feature matrix, and the fraction of samples carrying label 1.
for data_name in DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    class_shares = pd.Series(y).value_counts(normalize=True)
    print(data_name, X.shape, class_shares[1])

yeast_me2 (1484, 8) 0.03436657681940701


In [5]:
# Fix NumPy's global RNG so the noise drawn in the experiment below is repeatable.
np.random.seed(seed=42)

In [6]:
# Settings for the SMOTE experiment that follows.
N_GEN_FITS = 3         # number of independent SMOTE fits per dataset
K = 5                  # number of nearest neighbors used in SMOTE (also passed to the attack as `k`)
SAMPLING_STRATEGY = 1  # forwarded to SMOTE as `sampling_strategy`

In [7]:
# Optional per-dataset keyword overrides, keyed by dataset name and forwarded
# to `smote_detection_attack`. Empty: every dataset uses the defaults.
DATASET_CUSTOM_KWARGS = {}

In [8]:
# Detection experiment.
#
# For every dataset and every SMOTE fit, perturb the synthetic rows with
# uniform noise of magnitude `smote_eps`, run `smote_detection_attack` with
# tolerance `attack_eps`, and score the rows it flags as real minority
# samples against the original data.
columns = ["dataset", "i", "smote_eps", "attack_eps", "precision", "recall", "f1"]

# Result rows are collected in a plain list and converted to a DataFrame once
# at the end. Growing a DataFrame with `pd.concat` inside the loop is
# quadratic, and seeding it from an empty frame relies on deprecated concat
# behaviour and yields ill-defined column dtypes.
score_rows = []

for data_name in DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # The code treats every row of the resampled data from index len(y)
    # onward as synthetic.
    n_real = len(y)
    attack_kwargs = DATASET_CUSTOM_KWARGS.get(data_name, {})

    for i in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Ratio of label-1 rows after resampling to label-1 rows before it.
        r = (y_augmented == 1).sum() / (y == 1).sum()

        for smote_eps in NOISES:
            synthetic_rows = X_augmented[n_real:]

            # Draw order (magnitude first, then sign) is kept fixed so the
            # seeded random stream yields the same noise on every run.
            magnitude = np.random.random(synthetic_rows.shape)
            sign = np.random.choice([1, -1], synthetic_rows.shape)

            X_augmented_noise = X_augmented.copy()
            X_augmented_noise[n_real:] = synthetic_rows + magnitude * sign * smote_eps
            # Assumes prepare_data scales features into [-1, 1] — TODO confirm.
            X_augmented_noise[n_real:] = np.clip(X_augmented_noise[n_real:], -1, 1)

            for attack_eps in NOISES:
                detected_real_minority = smote_detection_attack(
                    X_augmented_noise,
                    y_augmented,
                    k=K,
                    r=r,
                    line_eps=attack_eps,
                    **attack_kwargs,
                )
                precision, recall, f1 = calculate_scores(X, y, detected_real_minority)

                score_rows.append(
                    [data_name, i, smote_eps, attack_eps, precision, recall, f1]
                )

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# scores_df.to_csv("results/augment/detect_smote.csv", index=False)
    

      dataset  i     smote_eps    attack_eps  precision    recall        f1
0   yeast_me2  0  1.000000e-10  1.000000e-10   0.962264  1.000000  0.980769
1   yeast_me2  0  1.000000e-10  1.000000e-07   0.962264  1.000000  0.980769
2   yeast_me2  0  1.000000e-10  1.000000e-05   0.366972  0.784314  0.500000
3   yeast_me2  0  1.000000e-10  1.000000e-03   0.083770  0.313725  0.132231
4   yeast_me2  0  1.000000e-07  1.000000e-10   0.879310  1.000000  0.935780
5   yeast_me2  0  1.000000e-07  1.000000e-07   0.962264  1.000000  0.980769
6   yeast_me2  0  1.000000e-07  1.000000e-05   0.366972  0.784314  0.500000
7   yeast_me2  0  1.000000e-07  1.000000e-03   0.076923  0.294118  0.121951
8   yeast_me2  0  1.000000e-05  1.000000e-10   0.035689  1.000000  0.068919
9   yeast_me2  0  1.000000e-05  1.000000e-07   0.069672  1.000000  0.130268
10  yeast_me2  0  1.000000e-05  1.000000e-05   0.065753  0.941176  0.122919
11  yeast_me2  0  1.000000e-05  1.000000e-03   0.058424  0.843137  0.109276
12  yeast_me

In [9]:
# Mean precision/recall for each (smote_eps, attack_eps) pair, averaged over
# all rows of scores_df that share that pair.
(
    scores_df
    .groupby(by=["smote_eps", "attack_eps"])[["precision", "recall"]]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall
smote_eps,attack_eps,Unnamed: 2_level_1,Unnamed: 3_level_1
1e-10,1e-10,0.962264,1.0
1e-10,1e-07,0.914733,0.993464
1e-10,1e-05,0.408435,0.797386
1e-10,0.001,0.089465,0.313725
1e-07,1e-10,0.880008,1.0
1e-07,1e-07,0.914733,0.993464
1e-07,1e-05,0.37472,0.784314
1e-07,0.001,0.087822,0.313725
1e-05,1e-10,0.035664,1.0
1e-05,1e-07,0.070351,1.0


In [10]:
# Re-seed NumPy's global RNG so the experiment below starts from the same
# random state regardless of what ran before it.
np.random.seed(seed=42)

In [11]:
# Settings for the SMOTE experiment that follows.
N_GEN_FITS = 3         # number of independent SMOTE fits per dataset
K = 5                  # number of nearest neighbors used in SMOTE (also passed to the attack as `k`)
SAMPLING_STRATEGY = 1  # forwarded to SMOTE as `sampling_strategy`

In [12]:
# Reconstruction experiment.
#
# For every dataset and every SMOTE fit, keep only the synthetic rows, perturb
# them with uniform noise of magnitude `smote_eps`, run
# `smote_reconstruction_attack` with tolerance `attack_eps`, and score the
# reconstructed points against the original data.
columns = ["dataset", "imbalance", "i", "smote_eps", "attack_eps", "precision", "recall", "f1"]

# Result rows are collected in a plain list and converted to a DataFrame once
# at the end. Growing a DataFrame with `pd.concat` inside the loop is
# quadratic, and seeding it from an empty frame relies on deprecated concat
# behaviour and yields ill-defined column dtypes.
score_rows = []

for data_name in DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y)

    # The code treats every row of the resampled data from index len(y)
    # onward as synthetic.
    n_real = len(y)

    for i in range(N_GEN_FITS):
        generator = SMOTE(k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        X_synthetic = X_augmented[n_real:]
        y_synthetic = y_augmented[n_real:]
        # Number of synthetic rows per original label-1 row; reported in the
        # "imbalance" column.
        r = len(y_synthetic) / (y == 1).sum()

        for smote_eps in NOISES:
            # Draw order (magnitude first, then sign) is kept fixed so the
            # seeded random stream yields the same noise on every run.
            magnitude = np.random.random(X_synthetic.shape)
            sign = np.random.choice([1, -1], X_synthetic.shape)

            X_synthetic_noise = X_synthetic + magnitude * sign * smote_eps
            # Assumes prepare_data scales features into [-1, 1] — TODO confirm.
            X_synthetic_noise = np.clip(X_synthetic_noise, -1, 1)

            for attack_eps in NOISES:
                reconstructed_real_minority = smote_reconstruction_attack(
                    X_synthetic_noise,
                    y_synthetic,
                    k=K,
                    r=r,
                    line_eps=attack_eps,
                )
                # Matching is approximate here, with the tolerance set to the
                # injected noise level (smote_eps), not to attack_eps.
                # NOTE(review): recorded runs of this cell show recall values
                # above 1 for some settings; presumably approximate matching
                # can count a real point more than once — verify in
                # calculate_scores.
                precision, recall, f1 = calculate_scores(
                    X, y, reconstructed_real_minority, exact_match=False, line_eps=smote_eps
                )

                score_rows.append(
                    [data_name, f"{r:.2f}", i, smote_eps, attack_eps, precision, recall, f1]
                )

scores_df = pd.DataFrame(score_rows, columns=columns)
print(scores_df)
# scores_df.to_csv("results/synth/recon_smote.csv", index=False)

      dataset imbalance  i     smote_eps    attack_eps  precision    recall  \
0   yeast_me2     27.10  0  1.000000e-10  1.000000e-10   1.000000  1.000000   
1   yeast_me2     27.10  0  1.000000e-10  1.000000e-07   1.000000  1.000000   
2   yeast_me2     27.10  0  1.000000e-10  1.000000e-05   0.980000  0.960784   
3   yeast_me2     27.10  0  1.000000e-10  1.000000e-03   0.640000  0.627451   
4   yeast_me2     27.10  0  1.000000e-07  1.000000e-10   0.894737  1.000000   
5   yeast_me2     27.10  0  1.000000e-07  1.000000e-07   0.981818  1.058824   
6   yeast_me2     27.10  0  1.000000e-07  1.000000e-05   0.880000  0.862745   
7   yeast_me2     27.10  0  1.000000e-07  1.000000e-03   0.580000  0.568627   
8   yeast_me2     27.10  0  1.000000e-05  1.000000e-10   0.000000  0.000000   
9   yeast_me2     27.10  0  1.000000e-05  1.000000e-07   0.000000  0.000000   
10  yeast_me2     27.10  0  1.000000e-05  1.000000e-05   0.666667  0.078431   
11  yeast_me2     27.10  0  1.000000e-05  1.000000e-

In [13]:
# Mean precision/recall for each (smote_eps, attack_eps) pair, averaged over
# all rows of scores_df that share that pair.
(
    scores_df
    .groupby(by=["smote_eps", "attack_eps"])[["precision", "recall"]]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall
smote_eps,attack_eps,Unnamed: 2_level_1,Unnamed: 3_level_1
1e-10,1e-10,1.0,1.0
1e-10,1e-07,1.0,1.0
1e-10,1e-05,0.986797,0.973856
1e-10,0.001,0.633118,0.607843
1e-07,1e-10,0.859519,0.960784
1e-07,1e-07,0.963412,1.03268
1e-07,1e-05,0.853333,0.836601
1e-07,0.001,0.551043,0.529412
1e-05,1e-10,0.0,0.0
1e-05,1e-07,0.0,0.0
