In [1]:
from toy_reg import *
import ot
import os
import smogn
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## SMOGN

In [None]:
ce_smogn=[]

num_sim=10
num_split= len(ratio_list )
# Generate imbalanced data
n_minority= 800
n_majority = 3200
n_val=400
n_test=800

list_origin=[]
W_list=[[] for j in range(10)]
X_total, y_total, regions_total = generate_imbalanced_data(
        n_minority + n_val + n_test, 
        n_majority + n_val + n_test, 
        seed=0
    )


for j in range(num_sim):
    set_seed(j)
    
    # Separate the minority and majority groups based on region labels.
    # Here, we assume that region label 0 indicates minority and 1 indicates majority.
    # Separate minority and majority indices
    minority_idx = np.where(regions_total == 0)[0]
    majority_idx = np.where(regions_total == 1)[0]

    # Shuffle indices separately
    np.random.shuffle(minority_idx)
    np.random.shuffle(majority_idx)

    # For minority:
    min_train_idx = minority_idx[:n_minority]
    min_val_idx   = minority_idx[n_minority : n_minority + n_val]
    min_test_idx  = minority_idx[n_minority + n_val : n_minority + n_val + n_test]

    # For majority:
    maj_train_idx = majority_idx[:n_majority]
    maj_val_idx   = majority_idx[n_majority : n_majority + n_val]
    maj_test_idx  = majority_idx[n_majority + n_val : n_majority + n_val + n_test]

    # Combine directly for each set
    X_train_orig = np.vstack((X_total[min_train_idx], X_total[maj_train_idx]))
    y_train_orig = np.vstack((y_total[min_train_idx], y_total[maj_train_idx]))
    regions_train = np.concatenate((regions_total[min_train_idx], regions_total[maj_train_idx]))

    X_val = np.vstack((X_total[min_val_idx], X_total[maj_val_idx]))
    y_val = np.vstack((y_total[min_val_idx], y_total[maj_val_idx]))
    regions_val = np.concatenate((regions_total[min_val_idx], regions_total[maj_val_idx]))

    X_test = np.vstack((X_total[min_test_idx], X_total[maj_test_idx]))
    y_test = np.vstack((y_total[min_test_idx], y_total[maj_test_idx]))
    regions_test = np.concatenate((regions_total[min_test_idx], regions_total[maj_test_idx]))


    XY_train_orig = combine_XY(X_train_orig, y_train_orig)
    
    n_train= XY_train_orig.shape[0]

    origin_model, mse_val_origin, mse_test_origin, y_val_opred, y_test_opred = train_and_evaluate(
    X_train_orig, y_train_orig, X_val, y_val, X_test, y_test)
    list_origin.append([mse_val_origin, mse_test_origin])
    print([mse_val_origin, mse_test_origin])

    feature_cols = [f"x{i}" for i in range(X_train_orig.shape[1])]
    df_train = pd.DataFrame(
        np.hstack([X_train_orig, y_train_orig]),
        columns=feature_cols + ["target"]
    )

    # 3. Define SMOGN hyperparameter grid
    k_list        = [2]             # nearest neighbors
    pert_list     = [0.02, 0.04, 0.06]     # perturbation
    rel_thres_list= [0.2]     # relevance thresholds

    best_mse_val = float("inf")
    best_params  = None

    # 4. Grid search on validation set
    for k in k_list:
        for pert in pert_list:
            for rel_thres in rel_thres_list:                
                try:
                    df_res = smogn.smoter(
                        data=df_train,
                        y="x0",
                        k=k,
                        pert=pert,
                        samp_method="balance",
                        drop_na_col=True,
                        drop_na_row=True,
                        replace=False,
                        rel_method="auto",
                        rel_thres=rel_thres,
                        rel_ctrl_pts_rg = [
                            [0.5,  1, 0],  ## over-sample ("minority")
                        ]
                    )
                except ValueError as e:
                    # this often means rel_thres too low or phi all 1 → skip
                    print(f"Skipping k={k}, pert={pert}, rel_thres={rel_thres}: {e}")
                    continue
                X_res = df_res[feature_cols].values
                y_res = df_res["target"].values.reshape(-1, 1)

                _, mse_val, _, _, _ = train_and_evaluate(
                    X_res, y_res,
                    X_val, y_val,
                    X_test, y_test
                )
                print(f"SMOGN k={k}, pert={pert}, rel_thres={rel_thres} → val MSE: {mse_val:.4f}")
                if mse_val < best_mse_val:
                    best_mse_val = mse_val
                    best_params   = (k, pert, rel_thres)

    print(f"Best SMOGN params: k={best_params[0]}, pert={best_params[1]}, rel_thres={best_params[2]} → val MSE: {best_mse_val:.4f}")

    if  best_params  == None:
        continue
    # 5. Resample with best hyperparameters and evaluate on test set
    k_best, pert_best, rel_thres_best = best_params
    df_res_best = smogn.smoter(
        data=df_train,
        y="x0",
        k=k_best,
        pert=pert_best,
        samp_method="balance",
        drop_na_col=True,
        drop_na_row=True,
        replace=False,
        rel_method="auto",
        rel_thres=rel_thres_best,
        rel_ctrl_pts_rg = [
                            [0.5,  1, 0],  ## over-sample ("minority")
                        ]
    )
    X_smogn = df_res_best[feature_cols].values
    y_smogn = df_res_best["target"].values.reshape(-1, 1)

    _, _, mse_test_smogn, _, _ = train_and_evaluate(
        X_smogn, y_smogn,
        X_val, y_val,
        X_test, y_test
    )
    print(f"{j}:SMOGN final → val MSE: {mse_test_smogn:.4f}")

    # 6. Record the result
    ce_smogn.append({
        "avg": mse_val_smogn,
        "major": train_and_evaluate(X_smogn, y_smogn, X_val, y_val, X_test[regions_test==1], y_test[regions_test==1])[2],
        "minor": train_and_evaluate(X_smogn, y_smogn, X_val, y_val, X_test[regions_test==0], y_test[regions_test==0])[2],
    })


### Summary statistics for Table 2

In [21]:
[np.mean([x['major'] for x in ce_smogn]),np.mean([x['minor'] for x in ce_smogn]),np.mean([x['avg'] for x in ce_smogn])] 

[0.23120323608828502, 1.5534241096310915, 0.8413967777816665]

In [22]:
[np.std([x['major'] for x in ce_smogn]),np.std([x['minor'] for x in ce_smogn]),np.std([x['avg'] for x in ce_smogn])] 

[0.10755476039767976, 1.0576341101608233, 0.4715261182790246]