In [1]:
import warnings
import numpy as np
import pandas as pd

# generators
from imblearn.over_sampling import SMOTENC

# custom
from datasets import MIX_DATASETS, dtypes, load_data, prepare_data
from attacks import smote_detection_attack, smote_reconstruction_attack, calculate_scores

In [2]:
warnings.simplefilter("ignore", category=FutureWarning)

In [3]:
# Overview of every dataset: name, shape of the prepared feature matrix and
# the fraction of rows carrying label 1.
for data_name in MIX_DATASETS:
    X, y = prepare_data(*load_data(data_name), dtypes[data_name])

    label_one_share = pd.Series(y).value_counts(normalize=True)[1]
    print(data_name, X.shape, label_one_share)

cardio (7256, 11) 0.03693495038588754
churn (8269, 10) 0.037005683879550125


In [4]:
# Fix NumPy's global RNG before the detection experiment. SMOTENC below is
# created without an explicit random_state, so it presumably draws from this
# global state -- verify against the imbalanced-learn docs.
np.random.seed(seed=42)

In [5]:
# How many times the generator is re-fitted per dataset; the reported scores
# are mean +- std over these fits.
N_GEN_FITS = 3

# Number of nearest neighbors used in SMOTE (also handed to the attack as `k`).
K = 5

# Forwarded unchanged to SMOTENC's `sampling_strategy` argument.
SAMPLING_STRATEGY = 1

In [6]:
# Per-dataset extra keyword arguments for smote_detection_attack.
# NOTE(review): "nbs_multiplte" is spelled this way on purpose here -- it is
# passed as a keyword name, so it presumably has to match the attack's
# signature; do not "fix" it without changing the attacks module as well.
_DETECTION_DEFAULTS = {"nbs_multiplte": 15}
DATASET_CUSTOM_KWARGS = {
    name: dict(_DETECTION_DEFAULTS) for name in ("cardio", "churn")
}

In [7]:
# Detection experiment: augment each dataset with SMOTENC, run the detection
# attack on the augmented data and score its guess of the real minority rows
# against the original data. Scores are summarised as mean +- std over
# N_GEN_FITS generator fits.
columns = ["dataset", "precision", "recall", "f1"]

# Summary rows are collected in a plain list and turned into a DataFrame once
# at the end; growing a frame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame is deprecated behaviour in pandas.
score_rows = []

for data_name in MIX_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y, dtypes[data_name])

    # Positional column indices. Assumes the numeric columns come first in X
    # and that the last entry of dtypes[data_name] is the target (hence the
    # -1) -- TODO confirm against prepare_data.
    n_numeric = sum(dtype in [int, float] for _col, dtype in dtypes[data_name].items())
    num_cols = list(range(n_numeric))
    cat_cols = list(range(n_numeric, len(dtypes[data_name]) - 1))

    # Number of real rows with label 1; identical for every fit, so computed once.
    n_real_minority = (y == 1).sum()

    precs, recs, f1s = [], [], []
    for fit_idx in range(N_GEN_FITS):
        generator = SMOTENC(categorical_features=cat_cols, k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Ratio of label-1 rows after augmentation to real label-1 rows.
        r = (y_augmented == 1).sum() / n_real_minority

        # The attack only sees the numeric columns of the augmented data.
        detected_real_minority = smote_detection_attack(
            X_augmented[:, num_cols], y_augmented, k=K, r=r, **DATASET_CUSTOM_KWARGS.get(data_name, {})
        )
        precision, recall, f1 = calculate_scores(X[:, num_cols], y, detected_real_minority)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # "mean +- std" text for each metric, computed once and reused below.
    prec_txt, rec_txt, f1_txt = (
        f"{np.asarray(scores).mean():.4f} +- {np.asarray(scores).std():.4f}"
        for scores in (precs, recs, f1s)
    )
    score_rows.append([f"{data_name}", prec_txt, rec_txt, f1_txt])
    print(f"{data_name}: f1 -- {f1_txt}")

scores_df = pd.DataFrame(score_rows, columns=columns, dtype=str)
print(scores_df)
# scores_df.to_csv("results/augment/detect_smote.csv", index=False)
    

cardio: f1 -- 0.9988 +- 0.0018
churn: f1 -- 0.9989 +- 0.0015
  dataset         precision            recall                f1
0  cardio  0.9975 +- 0.0035  1.0000 +- 0.0000  0.9988 +- 0.0018
1   churn  0.9978 +- 0.0031  1.0000 +- 0.0000  0.9989 +- 0.0015


In [8]:
# Re-seed NumPy's global RNG so the reconstruction experiment below starts
# from the same random state no matter how much randomness earlier cells used.
np.random.seed(seed=42)

In [9]:
# Settings for the reconstruction experiment (same values as used for the
# detection experiment above).

# How many times the generator is re-fitted per dataset.
N_GEN_FITS = 3

# Number of nearest neighbors used in SMOTE (also handed to the attack as `k`).
K = 5

# Forwarded unchanged to SMOTENC's `sampling_strategy` argument.
SAMPLING_STRATEGY = 1

In [10]:
# Reconstruction experiment: generate synthetic rows with SMOTENC, hand only
# those synthetic rows to the reconstruction attack and score the recovered
# points against the real data. Scores are summarised as mean +- std over
# N_GEN_FITS generator fits.
columns = ["dataset", "imbalance", "precision", "recall", "f1"]

# Summary rows are collected in a plain list and turned into a DataFrame once
# at the end; growing a frame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame is deprecated behaviour in pandas.
score_rows = []

for data_name in MIX_DATASETS:
    X, y = load_data(data_name)
    X, y = prepare_data(X, y, dtypes[data_name])

    # Positional column indices. Assumes the numeric columns come first in X
    # and that the last entry of dtypes[data_name] is the target (hence the
    # -1) -- TODO confirm against prepare_data.
    n_numeric = sum(dtype in [int, float] for _col, dtype in dtypes[data_name].items())
    num_cols = list(range(n_numeric))
    cat_cols = list(range(n_numeric, len(dtypes[data_name]) - 1))

    # Number of real rows with label 1; identical for every fit, so computed once.
    n_real_minority = (y == 1).sum()

    # `r` is reported after the inner loop. Give it a defined value up front
    # so that N_GEN_FITS == 0 neither raises NameError nor silently reuses the
    # previous dataset's ratio.
    r = float("nan")

    precs, recs, f1s = [], [], []
    for fit_idx in range(N_GEN_FITS):
        generator = SMOTENC(categorical_features=cat_cols, k_neighbors=K, sampling_strategy=SAMPLING_STRATEGY)
        X_augmented, y_augmented = generator.fit_resample(X, y)

        # Assumes fit_resample returns the original rows first and appends the
        # generated ones after them -- confirm against the imbalanced-learn docs.
        X_synthetic = X_augmented[len(y):]
        y_synthetic = y_augmented[len(y):]

        # Synthetic rows per real label-1 row.
        r = len(y_synthetic) / n_real_minority

        # The attack only sees the numeric columns of the synthetic rows.
        reconstructed_real_minority = smote_reconstruction_attack(X_synthetic[:, num_cols], y_synthetic, k=K, r=r)
        precision, recall, f1 = calculate_scores(
            X[:, num_cols], y, reconstructed_real_minority, exact_match=False, line_eps=1e-12
        )
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1)

    # "mean +- std" text for each metric, computed once and reused below.
    prec_txt, rec_txt, f1_txt = (
        f"{np.asarray(scores).mean():.4f} +- {np.asarray(scores).std():.4f}"
        for scores in (precs, recs, f1s)
    )
    score_rows.append([f"{data_name}", f"{r:.2f}", prec_txt, rec_txt, f1_txt])
    print(f"{data_name}, s/n_1 {r:.2f}: f1 -- {f1_txt}")

scores_df = pd.DataFrame(score_rows, columns=columns, dtype=str)
print(scores_df)
# scores_df.to_csv("results/synth/recon_smote.csv", index=False)

cardio, s/n_1 25.07: f1 -- 0.9918 +- 0.0024
churn, s/n_1 25.02: f1 -- 0.9912 +- 0.0034
  dataset imbalance         precision            recall                f1
0  cardio     25.07  1.0000 +- 0.0000  0.9838 +- 0.0047  0.9918 +- 0.0024
1   churn     25.02  1.0000 +- 0.0000  0.9826 +- 0.0067  0.9912 +- 0.0034
