In [1]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))  # go up one level

import pandas as pd
import numpy as np
import os, glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

from models import make_child_model
from models import get_base_models
from utils.resampling_registry import get_resamplers
from utils.battle_logger import BattleLogger
from utils.breed_and_battle import breed_and_battle_with_population

import gc

In [3]:
import os
import glob
import pandas as pd

# 📂 Load all datasets with baseline PR AUC
def load_all_datasets(folder="./synthetic_datasets", target_col="rare_event"):
    """Load every CSV in ``folder`` and split each into features and target.

    Parameters
    ----------
    folder : str
        Directory searched (non-recursively) for ``*.csv`` files.
    target_col : str, default "rare_event"
        Name of the target column to separate from the features.

    Returns
    -------
    list[dict]
        One entry per file, in sorted path order, with keys:
        ``"name"`` (file basename), ``"X"`` (all columns except the target),
        ``"y"`` (the target column) and ``"baseline_pr_auc"`` (mean of the
        target; for a 0/1 target this is the positive-class rate, used in
        this notebook as the PR AUC baseline to beat).

    Raises
    ------
    KeyError
        If a CSV does not contain ``target_col``; the message names the file.
    """
    datasets = []

    for path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        name = os.path.basename(path)
        df = pd.read_csv(path)

        # Fail with the offending file name rather than a bare pandas KeyError.
        if target_col not in df.columns:
            raise KeyError(f"{name}: missing target column {target_col!r}")

        X = df.drop(columns=target_col)
        y = df[target_col]
        baseline_pr_auc = y.mean()

        datasets.append({
            "name": name,
            "X": X,
            "y": y,
            "baseline_pr_auc": baseline_pr_auc
        })

        print(f"📊 {name} — Baseline PR AUC: {baseline_pr_auc:.3f}")

    # An empty result usually means a wrong path; say so instead of failing later.
    if not datasets:
        print(f"⚠️ No CSV files found in {folder}")

    return datasets

# Load every dataset, then order the list by ascending baseline PR AUC
datasets = load_all_datasets()
datasets.sort(key=lambda entry: entry["baseline_pr_auc"])

📊 baseline_easy_w5_n5_d0.csv — Baseline PR AUC: 0.055
📊 high_drift_w3_n10_d60.csv — Baseline PR AUC: 0.036
📊 imbalanced_sparse_w0_n10_d10.csv — Baseline PR AUC: 0.011
📊 mixed_realistic_w4_n15_d30.csv — Baseline PR AUC: 0.045
📊 noisy_overlap_w2_n20_d0.csv — Baseline PR AUC: 0.026


In [5]:
from sklearn.model_selection import train_test_split
# from models.core_models import make_child_model  # ✅ Toggle this if using logistic regression
from models.core_models2 import make_child_model  # ✅ Toggle this if using SGDClassifier

# 🏟️ Stage-by-stage tournament: the base models battle on each dataset in turn,
# and the top performers of one stage seed the population for the next.
import gc
import joblib

MAX_ITER = 300            # iteration cap passed to every make_child_model call
SURVIVORS_PER_STAGE = 3   # how many top models are carried into the next stage

# 📊 Order datasets by descending baseline PR AUC (highest baseline first),
# which this notebook treats as easiest → hardest.
datasets = sorted(datasets, key=lambda d: d["baseline_pr_auc"], reverse=True)

# 📁 Loop-invariant setup: create output folders once, not once per stage.
output_path = "output"
os.makedirs(output_path, exist_ok=True)
# Stage logs are written under logs/; creating it here is harmless if
# BattleLogger already does so itself.
os.makedirs("logs", exist_ok=True)

# ✅ Start with base models — include name, params, and model object
survivors = [
    {
        "name": cfg["name"],
        "params": cfg["params"],
        "model": make_child_model(cfg["params"], max_iter=MAX_ITER)
    }
    for cfg in get_base_models()
]
# Every model returned by every stage is accumulated here for the leaderboard.
# NOTE(review): entries are kept as returned by breed_and_battle_with_population
# (model objects included), so this list grows with each stage.
finalists = []

# 🌊 Stage-by-stage tournament
for stage_idx, dataset in enumerate(datasets):
    dataset_name = dataset["name"]
    print(f"\n🌊 Stage {stage_idx + 1}: {dataset_name}")

    # Same stratified 70/30 split (fixed seed) for every stage.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset["X"], dataset["y"], stratify=dataset["y"], test_size=0.3, random_state=42
    )

    # Materialise one resampled training set per registered resampler.
    resamplers = get_resamplers(X_train, y_train, target_col="rare_event")
    resampled_datasets = {name: fn() for name, fn in resamplers.items()}

    # ✅ Add metadata for logging/debug
    for s in survivors:
        s["dataset_name"] = dataset_name
        s["baseline_pr_auc"] = dataset["baseline_pr_auc"]

    with BattleLogger(
        to_file=f"logs/battle_stage_{stage_idx + 1}.txt",
        # js_file=f"logs/battle_stage_{stage_idx + 1}.js",
        # inject_html=True,
        # html_template="battle_template.html",
        # html_output=f"logs/battle_stage_{stage_idx + 1}.html"
    ):
        survivors = breed_and_battle_with_population(
            model_population=survivors,
            resampled_datasets=resampled_datasets,
            X_test=X_test,
            y_test=y_test,
            generations=50,
            top_k=3,
            dataset_name=dataset_name,
            baseline_pr_auc=dataset["baseline_pr_auc"],
            debug=True
        )

    # ✅ Accumulate stage winners for leaderboard (in the order they were returned)
    finalists.extend(survivors)

    # Rank once by PR AUC; reused for both the saved winner and the next-stage cut.
    ranked = sorted(survivors, key=lambda s: s["score"]["pr_auc"], reverse=True)

    # 🏆 Save the top survivor for this stage
    top_survivor = ranked[0]
    model_to_save = top_survivor["model"]
    meta_to_save = {
        "label": top_survivor["label"],
        "generation": top_survivor["generation"],
        "score": top_survivor["score"],
        "dataset": top_survivor["dataset_name"],
        "params": top_survivor["params"]
    }

    # File names keep the full dataset file name (including ".csv") so that
    # existing artefact paths stay unchanged.
    joblib.dump(model_to_save, f"{output_path}/winner_{dataset_name}.pkl")
    joblib.dump(meta_to_save, f"{output_path}/winner_{dataset_name}_meta.pkl")

    print(f"[💾] Saved winner from {dataset_name} with PR AUC = {top_survivor['score']['pr_auc']:.3f}")

    # 🧼 Explicit cleanup of this stage's data before the next stage allocates its own
    del X_train, X_test, y_train, y_test, resamplers, resampled_datasets
    gc.collect()

    # 🔁 Keep only the top models and rebuild them as fresh, unfitted entries;
    # the base name is the part of the label before " + ".
    survivors = [
        {
            "name": s["label"].split(" + ")[0],
            "params": s["params"],
            "model": make_child_model(s["params"], max_iter=MAX_ITER)
        }
        for s in ranked[:SURVIVORS_PER_STAGE]
    ]


🌊 Stage 1: baseline_easy_w5_n5_d0.csv
[💾] Saved winner from baseline_easy_w5_n5_d0.csv with PR AUC = 0.673

🌊 Stage 2: mixed_realistic_w4_n15_d30.csv
[💾] Saved winner from mixed_realistic_w4_n15_d30.csv with PR AUC = 0.209

🌊 Stage 3: high_drift_w3_n10_d60.csv
[💾] Saved winner from high_drift_w3_n10_d60.csv with PR AUC = 0.178

🌊 Stage 4: noisy_overlap_w2_n20_d0.csv
[💾] Saved winner from noisy_overlap_w2_n20_d0.csv with PR AUC = 0.110

🌊 Stage 5: imbalanced_sparse_w0_n10_d10.csv




[💾] Saved winner from imbalanced_sparse_w0_n10_d10.csv with PR AUC = 0.505


In [6]:
# Archived copy of the stage-by-stage tournament cell, kept commented out for reference (last working 8:53pm Tue Apr 1).
# from sklearn.model_selection import train_test_split
# # from models.core_models import make_child_model  # ✅ Required for model regeneration
# # toggle to test sgd
# from models.core_models2 import make_child_model

# # 📊 Sort datasets by baseline PR AUC (ascending difficulty)
# # datasets = sorted(datasets, key=lambda d: d["baseline_pr_auc"])

# # ✅ To sort from easiest to hardest:
# datasets = sorted(datasets, key=lambda d: d["baseline_pr_auc"], reverse=True)

# # Start with base models — include name, params, and model
# survivors = [
#     {
#         "name": cfg["name"],
#         "params": cfg["params"],
#         "model": make_child_model(cfg["params"], max_iter=300)  # initial model objects
#     }
#     for cfg in get_base_models()
# ]
# finalists = []

# # 🌊 Stage-by-stage tournament
# for stage_idx, dataset in enumerate(datasets):
#     dataset_name = dataset["name"]
#     print(f"\n🌊 Stage {stage_idx + 1}: {dataset_name}")

#     X_train, X_test, y_train, y_test = train_test_split(
#         dataset["X"], dataset["y"], stratify=dataset["y"], test_size=0.3, random_state=42
#     )

#     resamplers = get_resamplers(X_train, y_train, target_col="rare_event")
#     resampled_datasets = {name: fn() for name, fn in resamplers.items()}

#     # ✅ Add metadata for this round
#     for s in survivors:
#         s["dataset_name"] = dataset_name
#         s["baseline_pr_auc"] = dataset["baseline_pr_auc"]

#     with BattleLogger(
#         to_file=f"logs/battle_stage_{stage_idx + 1}.txt",
#         # js_file=f"logs/battle_stage_{stage_idx + 1}.js",
#         # inject_html=True,
#         # html_template="battle_template.html",
#         # html_output=f"logs/battle_stage_{stage_idx + 1}.html"
#     ):
#         survivors = breed_and_battle_with_population(
#             model_population=survivors,
#             resampled_datasets=resampled_datasets,
#             X_test=X_test,
#             y_test=y_test,
#             generations=50,
#             top_k=3,
#             dataset_name=dataset_name,
#             baseline_pr_auc=dataset["baseline_pr_auc"],
#             debug=True # False to skip intensive logging
#         )

#     finalists = survivors  # Save finalists after this round
#     # speedup technique (clearing memory)
#     # 🧹 Keep only top performers to limit memory bloat
#     survivors = sorted(survivors, key=lambda s: s["score"]["pr_auc"], reverse=True)[:3]
#     finalists = survivors.copy()

#     # 🧼 Clean up explicitly and rebuild new survivors for next stage
#     del X_train, X_test, y_train, y_test, resamplers, resampled_datasets
#     # Force garbage collection     
#     gc.collect()

#     # 🔁 Rebuild survivor configs with regenerated model objects
#     survivors = [
#         {
#             "name": s["label"].split(" + ")[0],
#             "params": s["params"],
#             "model": make_child_model(s["params"], max_iter=300)
#         }
#         for s in survivors
#     ]

In [6]:
# 🏆 Final leaderboard
# print(f"\n🏆 Finalists after all rounds: {[m['label'] for m in finalists]}")

# print("\n🏆 Finalists after all rounds:")
# for i, model in enumerate(finalists, 1):
#     print(f"{i}. {model['label']}_G{model['generation']} | PR AUC: {model['score']['pr_auc']:.3f} | Lineage: {model['lineage']}")

In [7]:
import pandas as pd

# 🧾 Build summary table from finalists (one row per accumulated stage survivor)
final_summary = []
for model in finalists:
    pr_auc = model['score']['pr_auc']
    # Entries without a recorded baseline fall back to 0.0, so any positive
    # PR AUC is reported as beating the baseline for them.
    baseline = model.get('baseline_pr_auc', 0.0)
    final_summary.append({
        "Label": f"{model['label']}_G{model['generation']}",
        "Generation": model['generation'],
        "PR AUC": round(pr_auc, 3),
        "Lineage": model.get("lineage", "—"),
        "Beats Baseline?": "✅ Yes" if pr_auc > baseline else "❌ No",
        "Baseline PR AUC": round(baseline, 3),
        "Dataset": model.get("dataset_name", "unknown")
    })

# Rows keep the order in which finalists were accumulated; sort by "PR AUC"
# here if a ranked leaderboard is wanted instead.
df_final = pd.DataFrame(final_summary)

# 💾 Save the summary; create logs/ first so to_csv cannot fail on a missing directory
os.makedirs("logs", exist_ok=True)
df_final.to_csv("logs/finalists_summary.csv", index=False)

In [9]:
# Preview the leaderboard: at most the first 30 finalist rows
df_final.head(n=30)

Unnamed: 0,Label,Generation,PR AUC,Lineage,Beats Baseline?,Baseline PR AUC,Dataset
0,Elastic_L30_C1 + Manual Upsampling_G0,0,0.664,origin of Elastic_L30_C1 + Manual Upsampling,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
1,Elastic_L30_C1 + SMOTE_G0,0,0.671,origin of Elastic_L30_C1 + SMOTE,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
2,Elastic_L30_C1 + ADASYN_G0,0,0.484,origin of Elastic_L30_C1 + ADASYN,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
3,Elastic_L30_C1 + Borderline SMOTE_G0,0,0.515,origin of Elastic_L30_C1 + Borderline SMOTE,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
4,Elastic_L30_C1 + SMOTETomek_G0,0,0.671,origin of Elastic_L30_C1 + SMOTETomek,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
5,Elastic_L30_C1 + SMOTEENN_G0,0,0.658,origin of Elastic_L30_C1 + SMOTEENN,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
6,Elastic_L30_C1 + Random Undersample_G0,0,0.663,origin of Elastic_L30_C1 + Random Undersample,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
7,Elastic_L30_C1 + Cluster Centroids_G0,0,0.669,origin of Elastic_L30_C1 + Cluster Centroids,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
8,Elastic_L30_C1 + No Resampling_G0,0,0.671,origin of Elastic_L30_C1 + No Resampling,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
9,Elastic_L50_C5 + Manual Upsampling_G0,0,0.663,origin of Elastic_L50_C5 + Manual Upsampling,✅ Yes,0.055,baseline_easy_w5_n5_d0.csv
