In [None]:
# Breast Cancer EDA & Modeling Notebook

## Imports & Global Config

In [None]:
# Basic libraries
import importlib.util
import inspect
import logging
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Display + plotting style: show up to 200 columns when rendering DataFrames and
# use seaborn's "whitegrid" theme for every figure in the notebook.
pd.set_option("display.max_columns", 200)
sns.set(style="whitegrid")

# For reproducibility: single seed shared by every step that takes a random_state.
RANDOM_STATE = 42


: 

## Repository Root & Project Paths
This cell detects the project’s root folder and sets up all directory paths (data, engineered data, artifacts) so the notebook can reliably read and save files no matter where it’s run.


In [None]:
# Find the project root so all file paths work no matter where the notebook is run
def get_repo_root():
    """Return the project root directory as a `Path`.

    Resolution order:
    1. The top-level directory reported by `git rev-parse --show-toplevel`.
    2. The nearest directory containing a `.git` entry among the current working
       directory and up to five of its ancestors.
    3. The current working directory.
    """
    try:
        git_output = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True)
        return Path(git_output.strip())
    except Exception:
        # Git is unavailable or this is not a work tree: fall through to the manual search.
        pass

    start = Path.cwd()
    # The working directory plus (at most) its first five parents, nearest first.
    candidates = [start, *start.parents][:6]
    found = next((folder for folder in candidates if (folder / ".git").exists()), None)
    return found if found is not None else Path.cwd()

# Resolve the project root. get_repo_root() already degrades to the working directory,
# so the except branch is only a last line of defence.
try:
    ROOT = get_repo_root()
except Exception:
    ROOT = Path.cwd()

# Last-resort checkout location, used only when the detected root has no data/raw folder.
# Set the BREAST_CANCER_ROOT environment variable to point at your own checkout; the
# default keeps the original machine-specific path so existing setups behave the same.
FALLBACK_ROOT = Path(os.environ.get("BREAST_CANCER_ROOT", r"C:\Users\rajni\Documents\breast-cancer-agentic"))
if ROOT != FALLBACK_ROOT and not (ROOT / "data" / "raw").exists():
    ROOT = FALLBACK_ROOT

print("Repo root:", ROOT)

# Define key directories
DATA_RAW        = ROOT / "data" / "raw"
DATA_ENGINEERED = ROOT / "data" / "engineered"
ARTIFACTS_ENG   = ROOT / "artifacts" / "engineering"
ARTIFACTS_EDA   = ROOT / "artifacts" / "eda"

# Output folders are created on demand; the raw-data folder must already exist.
for p in [DATA_ENGINEERED, ARTIFACTS_ENG, ARTIFACTS_EDA]:
    p.mkdir(parents=True, exist_ok=True)

# Source CSV + target column
SRC_FILE   = DATA_RAW / "breast_cancer_with_columns.csv"
TARGET_COL = "diagnosis"

print("Using source file:", SRC_FILE)
# Stop here rather than failing later inside pd.read_csv with a less obvious message.
assert SRC_FILE.exists(), f"Missing source file: {SRC_FILE}"


## Load Dataset & Basic Checks
This cell loads the raw dataset, verifies that the target column exists, splits the data into features and labels, and provides basic previews (head, summary stats, missing values) to confirm the data was loaded correctly.

In [None]:
# Load the raw CSV into memory
df = pd.read_csv(SRC_FILE)
print("Loaded shape:", df.shape)

# Fail fast (after listing what is available) when the label column is absent
target_present = TARGET_COL in df.columns
if not target_present:
    print("Columns available (first 40):", list(df.columns)[:40])
    raise AssertionError(f"Target column '{TARGET_COL}' not found in CSV")

# Feature matrix (X) is everything except the label column (y)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

print("Target distribution (normalized):")
class_shares = y.value_counts(normalize=True)
print(class_shares)

# First rows plus per-column summary statistics, to sanity-check values and types
display(df.head())
summary_stats = df.describe(include="all")
display(summary_stats.T)

print("Missing values (top 15):")
missing_counts = df.isna().sum()
print(missing_counts.sort_values(ascending=False).head(15))


## Correlation Heatmap & Correlation with Target
This cell computes a correlation matrix for all numeric features and visualizes it as a heatmap, then converts the diagnosis column to a binary target (y_bin) and ranks features by how strongly they’re linearly correlated with that target.

In [None]:
# Correlation heatmap among numeric features.
# Restrict to numeric columns explicitly: DataFrame.corr() raises on non-numeric
# columns in recent pandas releases instead of silently skipping them.
corr = X.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Feature correlation heatmap")
plt.show()

# Correlation with target: map a label-valued (object / category / string) target to 0/1.
if y.dtype == 'O' or y.dtype.name == 'category' or pd.api.types.is_string_dtype(y):
    uniq = list(y.unique())
    if len(uniq) == 2:
        # Codes follow order of first appearance in the data (first label seen -> 0).
        # The absolute correlations below do not depend on which label gets which code.
        y_bin = y.map({uniq[0]: 0, uniq[1]: 1})
    else:
        # Fallback for anything other than exactly two labels: values starting with
        # "m" (e.g. "M", "malignant") become 1, everything else 0. Previously values
        # starting with "b" were also mapped to 1, which merged both classes.
        y_bin = y.map(lambda v: 1 if str(v).lower().startswith('m') else 0)
else:
    y_bin = y

# Attach the encoded target to a copy of the features and rank features by |correlation|.
tmp = X.copy()
tmp[TARGET_COL] = y_bin.values
target_corr = tmp.select_dtypes(include=[np.number]).corr()[TARGET_COL].abs().sort_values(ascending=False)
print("Top features by absolute correlation with target:")
display(target_corr.head(20))


## Numeric Overview & KDE Plots
This cell reviews all numeric columns, checks missing/unique counts, and generates KDE plots to visualize how feature distributions differ between malignant and benign cases.

In [None]:
# Numeric feature inventory: which columns are numeric, how many values are missing,
# and how many distinct values each one has.
num_cols = [name for name in X.columns if pd.api.types.is_numeric_dtype(X[name])]
print("Numeric columns count:", len(num_cols))

numeric_overview = pd.DataFrame({
    "col": num_cols,
    "n_missing": [X[name].isna().sum() for name in num_cols],
    "n_unique": [X[name].nunique() for name in num_cols],
})
# Most-missing first; ties broken by fewest distinct values.
display(numeric_overview.sort_values(["n_missing", "n_unique"], ascending=[False, True]).head(20))

# Class-conditional density for the first six numeric features (one figure each)
sel = num_cols[:6]
for col in sel:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.kdeplot(data=df, x=col, hue=TARGET_COL, fill=True, common_norm=False, ax=ax)
    ax.set_title(f"{col} by {TARGET_COL}")
    fig.tight_layout()
    plt.show()


## Target Distribution Plot
This cell creates a simple countplot of the diagnosis column to show the class distribution (malignant vs. benign) and verify the dataset’s balance.

In [None]:
# Bar chart of how many rows fall into each diagnosis class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    x=TARGET_COL,
    data=df,
    palette=["#FF6B6B", "#4D96FF"],   # red (malignant), blue (benign)
    ax=ax,
)
ax.set_title("Distribution of Diagnosis")
ax.set_xlabel("Diagnosis")
ax.set_ylabel("Count")
plt.show()


## Agent Loader & Safe Adapter
This cell makes sure the notebook can load and run the AI agents stored in the `agents/` folder, regardless of who runs the notebook or how their environment is set up.
It works through the following steps:
1. Try a normal package import.
2. If that fails, try loading the agent modules directly from their file paths.
3. If both imports fail, fall back to safe stub versions of the agent functions.
4. Wrap the agent in a safe-call adapter, so the notebook can call it without errors no matter how the agent's function signature is written.
5. Finally, execute the agent.

In [None]:
# If not set, adapter will only use in-memory df. DATA_PATH can be set to a CSV path
# so agents that expect file-based input can be invoked without writing temp files.
DATA_PATH = globals().get("DATA_PATH", None)

# Resolve project root so notebook works when executed from subfolders
# If the notebook runs inside a `notebooks/` folder we assume the repo root is its parent.
# NOTE(review): this rebinds the ROOT chosen by the repo-root cell. The DATA_* / ARTIFACTS_*
# paths built there are unaffected, but AGENTS_DIR below follows this working-directory
# heuristic — confirm that is intended.
ROOT = Path.cwd().parents[0] if (Path.cwd().name == "notebooks") else Path.cwd()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Directory where agent modules are expected to live (project-local adapters/agents)
AGENTS_DIR = ROOT / "agents"

# Ensure artifact directory exists and configure structured logging (file + console)
os.makedirs(str(ARTIFACTS_EDA), exist_ok=True)
# `datetime` is the class here (from datetime import datetime), which has no `UTC`
# attribute; timezone.utc is the portable way to get an aware UTC timestamp.
log_file = ARTIFACTS_EDA / f"agent_adapter_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.log"
# basicConfig is a no-op when the root logger already has handlers (e.g. on a re-run).
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s',
                    filename=str(log_file))
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
# Attach the console handler only once so re-running this cell does not duplicate every
# log line. FileHandler subclasses StreamHandler, hence the exact type comparison.
if not any(type(h) is logging.StreamHandler for h in logging.getLogger().handlers):
    logging.getLogger().addHandler(console)

logging.info("ROOT: %s", ROOT)
logging.info("AGENTS_DIR: %s | exists: %s", AGENTS_DIR, AGENTS_DIR.exists())
logging.info("__init__.py present: %s", (AGENTS_DIR / "__init__.py").exists())

# Audit logger (writes newline-delimited JSON records to artifacts)
# Optional dependency: when it cannot be imported or initialised, `audit` is None and
# every later audit call is skipped.
try:
    from agents.audit_logger import get_default_audit
    audit = get_default_audit(artifacts_dir=ARTIFACTS_EDA)
    logging.info("Audit logger initialized at: %s", audit.filepath)
except Exception as e_audit:
    logging.info("Audit logger not available: %s", e_audit)
    audit = None

# Privacy utilities (de-identification)
# Optional dependency: when missing, both names are None and callers fall back to
# writing the DataFrame as-is.
try:
    from agents.privacy import deidentify_to_temp_csv, deidentify_dataframe
    logging.info("Privacy utilities available (deidentify_to_temp_csv)")
except Exception as e_priv:
    logging.info("Privacy utilities not available: %s", e_priv)
    deidentify_to_temp_csv = None
    deidentify_dataframe = None

# Track whether we found real agent implementations
AGENTS_AVAILABLE = False

# Helper: import a module directly from a file path (keeps our flexible loading strategy)
# This allows the notebook to load `agents/eda_agent.py` without requiring package install.
def load_module_by_path(mod_name, path: Path):
    """Execute the Python source file at `path` and return it as a module object.

    Args:
        mod_name: Name given to the created module (its `__name__`).
        path: Location of the source file to load.

    Returns:
        The freshly executed module. It is not registered in `sys.modules`.

    Raises:
        ImportError: If no import spec or loader can be built for `path`
            (for example, the file suffix is not a recognised Python suffix).
        Exception: Anything raised while executing the module body (including a
            missing file) propagates unchanged.
    """
    spec = importlib.util.spec_from_file_location(mod_name, str(path))
    # spec_from_file_location returns None instead of raising when it cannot pick a
    # loader; turn that into a clear ImportError rather than an AttributeError later.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {mod_name!r} from {path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

# Import strategy (resilient):
# 1) Try normal package import (preferred for packaged deployments)
# 2) If that fails, try loading agent modules by explicit file path (developer convenience)
# 3) If both fail, provide safe no-op stubs so the notebook remains runnable.
try:
    from agents.eda_agent import run_eda_report  # expected callable
    from agents.fe_agent import propose_features, apply_features
    AGENTS_AVAILABLE = True
    logging.info("Imported agents via package imports.")
except Exception as e_pkg:
    logging.info("Package import failed: %s", e_pkg)
    try:
        # Try loading agent files directly from `agents/`
        eda_path = AGENTS_DIR / "eda_agent.py"
        fe_path = AGENTS_DIR / "fe_agent.py"

        eda_mod = load_module_by_path("agents.eda_agent", eda_path)
        run_eda_report = getattr(eda_mod, "run_eda_report", None)

        # The feature-engineering agent is optional: missing file -> names bound to None.
        fe_mod = load_module_by_path("agents.fe_agent", fe_path) if fe_path.exists() else None
        propose_features = getattr(fe_mod, "propose_features", None) if fe_mod else None
        apply_features = getattr(fe_mod, "apply_features", None) if fe_mod else None

        if callable(run_eda_report):
            AGENTS_AVAILABLE = True
            logging.info("Loaded agents via file path.")
        else:
            raise ImportError("run_eda_report not found in eda_agent.py")
    except Exception as e_path:
        logging.info("File-path import failed: %s", e_path)

        # Safe no-op stubs: allow notebook to proceed even without real agents
        def run_eda_report(df, target_col=None, out_dir="artifacts/eda"):
            """Stub EDA agent: performs no analysis and reports that no summary was written."""
            logging.info("run_eda_report stub called — no real EDA performed.")
            return {"summary_path": None, "out_dir": out_dir}

        def propose_features(X, y=None, max_interactions=20):
            """Stub feature proposer: always returns an empty proposal list."""
            logging.info("propose_features stub called.")
            return []

        def apply_features(X, proposals, dry_run=True, confirm=False):
            """Apply feature proposals to `X` (stubbed).

            Behavior:
            - `dry_run=True` (default): do not persist changes; return transformed DataFrame + metadata.
            - `dry_run=False`: requires explicit confirmation. Confirmation sources (in order):
                1) `confirm=True` argument
                2) `globals().get('CONFIRM_APPLY', False)` set in the notebook
                3) interactive user input (type 'yes')

            This pattern prevents accidental destructive edits when running notebooks non-interactively.

            Returns:
                `(X_new, metadata)` where `X_new` is a copy of `X` (the stub applies no
                transformation) and `metadata` is `{"applied": [names], "count": n}`.

            Raises:
                RuntimeError: when `dry_run=False` and the user declines at the prompt, or
                    when no confirmation can be obtained (e.g. `input()` is unavailable).
            """
            from contextlib import nullcontext  # local import keeps the stub self-contained

            logging.info("apply_features called (dry_run=%s, confirm=%s)", dry_run, confirm)
            X_new = X.copy()

            # Open an audit event for the apply attempt (if an audit logger is present).
            # Failing to create the event must never block the feature step itself.
            ctx = None
            if globals().get('audit') is not None:
                try:
                    ctx = audit.audit_event(event_type='apply_features', actor='notebook', action='apply',
                                            target=str(ARTIFACTS_ENG / 'transformers.pkl'),
                                            details={'dry_run': dry_run, 'n_proposals': len(proposals) if proposals else 0})
                except Exception:
                    ctx = None

            # `with` guarantees the audit event is closed on every exit path, including
            # the confirmation failures below (previously it was left open on errors).
            with (ctx if ctx else nullcontext()):
                # In a real implementation we'd apply transformations described by `proposals`.
                # Here the stub simply records proposal names to demonstrate metadata structure.
                applied = [p.get("name") if isinstance(p, dict) else str(p) for p in proposals or []]
                metadata = {"applied": applied, "count": len(applied)}

                if dry_run:
                    logging.info("Dry-run mode: no changes persisted. Metadata: %s", metadata)
                    return X_new, metadata

                # Non-dry run path: require confirmation
                global_confirm = globals().get("CONFIRM_APPLY", False)
                if not confirm and not global_confirm:
                    # Only the prompt itself is guarded: input() raises in headless runs.
                    try:
                        ans = input("Apply feature proposals to dataset? Type 'yes' to proceed: ").strip().lower()
                    except Exception:
                        raise RuntimeError("Confirmation required to apply features; set CONFIRM_APPLY=True or pass confirm=True.")
                    # Checked outside the try so an explicit refusal keeps its own message
                    # instead of being rewritten as "Confirmation required".
                    if ans != "yes":
                        logging.info("User declined to apply features via prompt.")
                        raise RuntimeError("User declined to apply features (no changes made).")

                # At this point we consider changes "approved". Stub does not persist to disk.
                logging.info("Applying features (stub) — metadata: %s", metadata)
                return X_new, metadata

        logging.info("Using stubbed agent functions (no-op).")

# Compatibility adapter: call run_eda_report regardless of its exact signature
# The adapter supports functions that accept either a DataFrame (`df`) or a file path
# argument (`dataset_path` or `path`), and it also tries common positional forms.
def call_run_eda_safely(run_eda_report_fn, df, target_col, out_dir, data_path=None, **extra):
    """Invoke an EDA agent function whose exact signature is not known in advance.

    The calling convention is chosen from the function's parameter names:
    - a `df` parameter: called with keywords `df`, `target_col`, `out_dir`;
    - a `dataset_path` (preferred) or `path` parameter: called with keywords, passing
      `data_path` under that name;
    - otherwise: called positionally as `(df, target_col, out_dir)`, retrying with
      `(data_path, target_col, out_dir)` if that raises `TypeError` and a path is known.

    `out_dir` is always passed as a string and `extra` is forwarded as keyword arguments.

    Raises:
        RuntimeError: if `run_eda_report_fn` is not callable.
        ValueError: if the function wants a file path but `data_path` is None.
    """
    if not callable(run_eda_report_fn):
        raise RuntimeError("run_eda_report is not callable; agent import failed.")

    accepted = inspect.signature(run_eda_report_fn).parameters
    out_dir_text = str(out_dir)
    keyword_args = {"target_col": target_col, "out_dir": out_dir_text}
    keyword_args.update(extra)

    # DataFrame-style agent
    if "df" in accepted:
        return run_eda_report_fn(df=df, **keyword_args)

    # File-path-style agent; `dataset_path` wins when both names are present
    path_names = [name for name in ("dataset_path", "path") if name in accepted]
    if path_names:
        if data_path is None:
            raise ValueError("Agent expects a file path — set DATA_PATH before calling.")
        keyword_args[path_names[0]] = data_path
        return run_eda_report_fn(**keyword_args)

    # Unknown parameter names: positional call, DataFrame first and path second
    try:
        return run_eda_report_fn(df, target_col, out_dir_text, **extra)
    except TypeError:
        if data_path is None:
            raise
        return run_eda_report_fn(data_path, target_col, out_dir_text, **extra)

# --- Safety improvement: run agents in a subprocess with timeout when possible ---
# We prefer subprocess runs because they are interruptible via the OS, easier to sandbox,
# and their stdout/stderr can be captured independently of the notebook process.
import json, tempfile, textwrap

# Write a DataFrame to a temp CSV and return the path. Caller should remove the file.
def _df_to_temp_csv(df):
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    df.to_csv(tmp.name, index=False)
    tmp.close()
    return tmp.name

# Create a small runner script that imports the agent and calls it using a dataset_path.
# The runner prints a JSON object to stdout so the parent process can parse structured results.
def _write_runner_script(use_file_path=None):
    if use_file_path:
        # Load the agent directly from the source file to avoid altering sys.path in the
        # runner process (safer and more deterministic).
        import_block = f"import importlib.util\nspec = importlib.util.spec_from_file_location('agent_mod', r'{use_file_path}')\nmod = importlib.util.module_from_spec(spec)\nspec.loader.exec_module(mod)\nfrom agent_mod import run_eda_report"
    else:
        # Default to importing the package name `agents.eda_agent` in environments
        # where the project is installed/packaged.
        import_block = "from agents.eda_agent import run_eda_report"

    script = textwrap.dedent(f"""
import json, sys, traceback
try:
    {import_block}
    dataset = sys.argv[1]
    target = sys.argv[2]
    outdir = sys.argv[3]
    res = run_eda_report(dataset_path=dataset, target_col=target, out_dir=outdir)
    print(json.dumps({{'success': True, 'result': res}}, default=str))
except Exception as e:
    print(json.dumps({{'success': False, 'error': str(e), 'traceback': traceback.format_exc()}}))
""")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".py", mode="w", encoding="utf-8")
    tmp.write(script)
    tmp.close()
    return tmp.name

# Run the tiny runner script in a subprocess, capture stdout/stderr, save them to ARTIFACTS_EDA,
# and return the parsed result object (or raise on errors).
def run_eda_via_subprocess(eda_py_path_or_none, dataset_path, target_col, out_dir, timeout=30):
    """Run the EDA agent in a child Python process and return its parsed result.

    A temporary runner script is generated, executed with the current interpreter, and
    deleted afterwards. The child's stdout/stderr are saved as timestamped files in
    ARTIFACTS_EDA (best effort), and an audit record is emitted when `audit` is set.

    Args:
        eda_py_path_or_none: Agent source file to load in the child, or None to import
            `agents.eda_agent` there.
        dataset_path: CSV path passed to the agent.
        target_col: Target column name passed to the agent.
        out_dir: Output directory passed to the agent.
        timeout: Seconds to wait for the child process.

    Returns:
        The `result` value of the JSON object printed by the runner.

    Raises:
        subprocess.TimeoutExpired: If the child does not finish within `timeout`.
        RuntimeError: If the child prints no parseable JSON object or reports failure.
    """
    runner = _write_runner_script(use_file_path=eda_py_path_or_none)
    try:
        cmd = [sys.executable, runner, str(dataset_path), str(target_col), str(out_dir)]
        cp = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        stdout = (cp.stdout or '').strip()
        stderr = cp.stderr or ''

        # Persist stdout/stderr to artifact logs for later debugging (timestamped).
        # `datetime` is the class here and has no `UTC` attribute; use timezone.utc.
        ts = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
        try:
            out_path = ARTIFACTS_EDA / f'agent_stdout_{ts}.log'
            err_path = ARTIFACTS_EDA / f'agent_stderr_{ts}.log'
            out_path.write_text(cp.stdout or '')
            err_path.write_text(stderr)
            logging.info("Saved agent stdout to %s and stderr to %s", out_path, err_path)
            # Record audit info about the subprocess result
            try:
                if 'audit' in globals() and audit is not None:
                    audit.log('agent_subprocess', actor='agent_runner', action='subprocess_complete',
                              target=str(dataset_path),
                              details={'stdout_log': str(out_path), 'stderr_log': str(err_path), 'runner': str(runner)})
            except Exception:
                logging.exception("Failed to emit audit record for agent subprocess")
        except Exception as e_write:
            # Saving logs is a convenience; never let it mask the agent's own outcome.
            logging.exception("Failed to write agent stdout/stderr to artifacts: %s", e_write)

        # Expect the runner to print a JSON object. The agent may print its own output
        # first, so fall back to the last non-empty line before giving up.
        obj = None
        try:
            obj = json.loads(stdout)
        except Exception:
            non_empty = [line for line in stdout.splitlines() if line.strip()]
            if non_empty:
                try:
                    obj = json.loads(non_empty[-1])
                except Exception:
                    obj = None
        if not isinstance(obj, dict):
            logging.error("Agent runner did not return valid JSON. stdout=%r stderr=%r", stdout, stderr)
            raise RuntimeError(f"Agent runner failed to return JSON. stdout={stdout!r}, stderr={cp.stderr!r}")

        if not obj.get('success'):
            logging.error("Agent reported failure: %s", obj.get('error',''))
            # f-string formatting tolerates a null error/traceback field.
            raise RuntimeError(f"Agent error: {obj.get('error', '')}\n{obj.get('traceback', '')}")

        logging.info("Agent subprocess completed successfully")
        return obj.get('result')
    finally:
        # Try to clean up the temporary runner script file.
        try:
            Path(runner).unlink()
        except Exception:
            pass

# High-level wrapper that prefers subprocess execution paths and falls back to in-process calls.
def _prepare_eda_dataset(df, data_path):
    """Return the CSV path to hand to the agent subprocess.

    Uses `data_path` as-is when it is truthy. Otherwise writes `df` to a temporary CSV,
    de-identified via `deidentify_to_temp_csv` when that helper is available (settings
    come from the DEID_STRATEGY / DEID_COLS / DEID_SALT globals), else unchanged.
    """
    if data_path:
        return data_path
    deid_strategy = globals().get('DEID_STRATEGY', 'drop')
    deid_cols = globals().get('DEID_COLS', None)
    deid_salt = globals().get('DEID_SALT', 'agentic-default-salt')
    if deidentify_to_temp_csv:
        ds = deidentify_to_temp_csv(df, strategy=deid_strategy, cols=deid_cols, salt=deid_salt)
        try:
            if audit:
                audit.log('deidentify', actor='notebook', action='deidentify_to_temp_csv', target=str(ds), details={'strategy': deid_strategy, 'cols': deid_cols})
        except Exception:
            logging.exception("Failed to emit audit record for de-identification")
        return ds
    return _df_to_temp_csv(df)


def _discard_temp_dataset(path):
    """Best-effort removal of a temporary dataset file; failures are only logged."""
    try:
        Path(path).unlink()
    except Exception as exc:
        logging.debug("Could not remove temporary dataset %s: %s", path, exc)


def run_eda_with_timeout(run_eda_report_fn, df, target_col, out_dir, data_path=None, timeout=30):
    """High-level wrapper: prefer subprocess runs (file or package), otherwise in-process fallback.

    Steps:
    1. If `agents/eda_agent.py` exists locally, run it in a subprocess using that file.
       Errors from this run propagate to the caller.
    2. Else try importing `agents.eda_agent` and run it in a subprocess (works when the
       project is installed). Any failure here is logged and falls through to step 3.
    3. Call the function in-process on a worker thread and wait at most `timeout` seconds.
       Note: a thread cannot be killed; after a timeout the call keeps running in the
       background. Subprocess execution is preferred for that reason.

    Args:
        run_eda_report_fn: Agent callable used by the in-process fallback.
        df: DataFrame to analyse; written to a temporary CSV when `data_path` is not given.
        target_col: Name of the target column.
        out_dir: Directory for agent outputs.
        data_path: Optional existing CSV to use instead of writing a temporary one.
        timeout: Seconds allowed for the agent run.

    Raises:
        TimeoutError: If the in-process fallback does not finish within `timeout`.
    """
    # A temp CSV is created exactly when no usable data_path was supplied.
    uses_temp_dataset = not data_path

    # 1) If there's a local eda_agent.py, prefer running it in a subprocess (safer)
    eda_file = AGENTS_DIR / "eda_agent.py"
    if eda_file.exists():
        ds = _prepare_eda_dataset(df, data_path)
        try:
            return run_eda_via_subprocess(str(eda_file), ds, target_col, out_dir, timeout=timeout)
        finally:
            if uses_temp_dataset:
                _discard_temp_dataset(ds)

    # 2) Try package import path via subprocess
    try:
        __import__("agents.eda_agent")
        ds = _prepare_eda_dataset(df, data_path)
        try:
            return run_eda_via_subprocess(None, ds, target_col, out_dir, timeout=timeout)
        finally:
            if uses_temp_dataset:
                _discard_temp_dataset(ds)
    except Exception as e_pkg2:
        logging.info("Package-based subprocess run not available: %s", e_pkg2)

    # 3) Last resort: call in-process with a thread timeout (cannot kill CPU-bound tasks)
    from concurrent.futures import ThreadPoolExecutor, TimeoutError
    # Not used as a context manager: leaving a `with` block waits for the worker to
    # finish, which would make the timeout ineffective.
    ex = ThreadPoolExecutor(max_workers=1)
    try:
        fut = ex.submit(call_run_eda_safely, run_eda_report_fn, df, target_col, out_dir, data_path=data_path)
        try:
            return fut.result(timeout=timeout)
        except TimeoutError:
            raise TimeoutError(f"Agent call timed out after {timeout} seconds (in-process fallback)")
    finally:
        ex.shutdown(wait=False)

# Execute via safe adapter (works for real agent or stub), with a sensible default timeout
EDA_TIMEOUT_S = 30  # seconds; adjust as needed
result = None       # stays None when the run fails, so later cells can test for it
try:
    logging.info("Calling run_eda_report via safe adapter (timeout=%ss)", EDA_TIMEOUT_S)
    if "df" not in globals():
        raise NameError("df is not defined in this notebook cell scope.")

    # One argument set shared by the audited and non-audited call paths.
    eda_call_kwargs = dict(
        run_eda_report_fn=run_eda_report,
        df=df,
        target_col=TARGET_COL,
        out_dir=ARTIFACTS_EDA,
        data_path=DATA_PATH,
        timeout=EDA_TIMEOUT_S,
    )
    if audit:
        # Wrap the run in an audit event when the audit logger is available.
        with audit.audit_event(event_type='eda_run', actor='notebook', action='run_eda', target=str(SRC_FILE), details={'timeout': EDA_TIMEOUT_S}):
            result = run_eda_with_timeout(**eda_call_kwargs)
    else:
        result = run_eda_with_timeout(**eda_call_kwargs)

    logging.info("run_eda_report executed; artifacts (if any) saved to: %s", ARTIFACTS_EDA)
    if result is not None:
        try:
            logging.info("Agent result keys: %s", list(result.keys()))
        except Exception:
            # Result is not dict-like; report its type instead.
            logging.info("Agent result type: %s", type(result))
except Exception:
    # The EDA agent is optional for the rest of the notebook: log the failure and continue.
    logging.exception("run_eda_report execution failed")


## Baseline Feature Engineering
This cell applies our baseline feature engineering: it creates ratio features (worst / mean) for key measurements and removes all _se standard-error columns to reduce noise and dimensionality.

In [None]:
# Create ratio features and drop *_se columns
def add_ratio_features(df, pairs=None):
    """Return a copy of `df` with `<worst>_over_<mean>` ratio columns added.

    For every `(mean_col, worst_col)` pair whose two columns are both present, a new
    column named `f"{worst_col}_over_{mean_col}"` holding `worst_col / mean_col` is added.
    Pairs with a missing column are skipped. Ratios that are undefined or infinite
    (zero denominator) are stored as NaN so that later scaling does not fail on
    infinite values.

    Args:
        df: Input DataFrame; it is not modified.
        pairs: Optional iterable of `(denominator_col, numerator_col)` tuples. Defaults
            to the ten mean/worst measurement pairs used by this notebook.

    Returns:
        A new DataFrame with the ratio columns appended.
    """
    df = df.copy()
    if pairs is None:
        pairs = [
            ('radius_mean','radius_worst'),
            ('texture_mean','texture_worst'),
            ('perimeter_mean','perimeter_worst'),
            ('area_mean','area_worst'),
            ('smoothness_mean','smoothness_worst'),
            ('compactness_mean','compactness_worst'),
            ('concavity_mean','concavity_worst'),
            ('concave points_mean','concave points_worst'),
            ('symmetry_mean','symmetry_worst'),
            ('fractal_dimension_mean','fractal_dimension_worst'),
        ]
    for a, b in pairs:
        if a in df.columns and b in df.columns:
            with np.errstate(divide='ignore', invalid='ignore'):
                ratio = df[b] / df[a]
            # x/0 gives +/-inf (0/0 already gives NaN); treat both as missing.
            df[f'{b}_over_{a}'] = ratio.replace([np.inf, -np.inf], np.nan)
    return df

def drop_se_columns(df):
    """Return `df` without the columns whose name ends in `_se`; `df` itself is unchanged."""
    se_columns = [name for name in df.columns if str(name).endswith('_se')]
    return df.drop(columns=se_columns, errors='ignore')

# Baseline feature engineering: add the worst/mean ratios, then drop the *_se columns.
X_fe = drop_se_columns(add_ratio_features(X))
print("Feature-engineered shape:", X_fe.shape)


## Cap Percentiles, Scale, Save Engineered Data & Transformers
This cell caps outliers using percentile clipping, scales all features with StandardScaler, and then saves the engineered dataset and transformer metadata for consistent reuse across the project.

In [None]:
def cap_percentiles(df, lower=0.01, upper=0.99):
    """Return a copy of `df` with each numeric column clipped to its own quantile range.

    Values below the column's `lower` quantile are raised to it and values above the
    `upper` quantile are lowered to it. Non-numeric columns are left untouched.
    """
    capped = df.copy()
    numeric_names = capped.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        values = capped[name]
        capped[name] = values.clip(values.quantile(lower), values.quantile(upper))
    return capped

lower_pct, upper_pct = 0.01, 0.99
X_cap = cap_percentiles(X_fe, lower=lower_pct, upper=upper_pct)

# Record clip bounds for metadata (same quantiles cap_percentiles applied above)
numeric_cols = X_fe.select_dtypes(include=[np.number]).columns
clip_lower = {col: float(X_fe[col].quantile(lower_pct)) for col in numeric_cols}
clip_upper = {col: float(X_fe[col].quantile(upper_pct)) for col in numeric_cols}

# Scale features.
# NOTE(review): the clip bounds and the scaler are fitted on every row, before the
# train/test split performed later in the notebook, so test rows influence these
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
# Keep X_cap's index so rows stay aligned with the target and with X_imputed downstream.
X_scaled = pd.DataFrame(scaler.fit_transform(X_cap), columns=X_cap.columns, index=X_cap.index)

fe_metadata = {
    # Same "naive ISO timestamp + Z" format as before, without the deprecated utcnow().
    "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "clip_percentiles": {"lower": lower_pct, "upper": upper_pct},
    "clip_bounds": {"lower": clip_lower, "upper": clip_upper},
    "feature_columns": list(X_scaled.columns),
    "transform": "cap_percentiles + StandardScaler",
}

# Engineered dataset = scaled features plus the original (unencoded) target column
out_df = X_scaled.copy()
out_df[TARGET_COL] = y.values

ENGINEERED_PATH = DATA_ENGINEERED / "breast_cancer_engineered.csv"
out_df.to_csv(ENGINEERED_PATH, index=False)

# Persist the fitted scaler with its column order and metadata for reuse elsewhere
joblib.dump({'scaler': scaler, 'columns': list(X_scaled.columns), 'fe_metadata': fe_metadata},
            ARTIFACTS_ENG / 'transformers.pkl')

print("Saved engineered data to:", ENGINEERED_PATH)
print("Saved transformers to:", ARTIFACTS_ENG / "transformers.pkl")


## Mutual Information Ranking
This cell computes Mutual Information scores to measure how strongly each feature relates to the target, and saves a ranked list of the most informative features for later modeling.

In [None]:
# Fill missing values with each column's mean (mutual_info_classif cannot take NaN)
imp = SimpleImputer(strategy="mean")
imputed_values = imp.fit_transform(X_scaled)
X_imputed = pd.DataFrame(imputed_values, index=X_scaled.index, columns=X_scaled.columns)

# Prefer the binary-encoded target when an earlier cell created it
if 'y_bin' in globals():
    target = y_bin
else:
    target = y

# Mutual information between every feature and the target, highest first
mi = mutual_info_classif(X_imputed, target, random_state=RANDOM_STATE)
mi_series = pd.Series(mi, index=X_imputed.columns)
mi_series = mi_series.sort_values(ascending=False)

# Persist the ranking for later modeling steps
mi_csv_path = ARTIFACTS_EDA / "mutual_info_ranking.csv"
mi_series.to_csv(mi_csv_path)
print("Saved mutual info ranking to:", mi_csv_path)

mi_series.head(10)


## Train/Test Split
This cell splits the processed data into training and testing sets (using stratification) so we can train models fairly and evaluate them on unseen data.

In [None]:
# Same target choice as the mutual-information step: binary encoding when available
if 'y_bin' in globals():
    target = y_bin
else:
    target = y

# 80/20 split, stratified so both parts keep the class proportions
split_parts = train_test_split(
    X_scaled,
    target,
    test_size=0.2,
    stratify=target,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


## XGBoost + GridSearchCV Modeling
This cell trains an XGBoost model using GridSearchCV to find the best hyperparameters, then evaluates the final model on the test set using ROC-AUC, classification metrics, and a confusion matrix.

In [None]:
# Base estimator; the grid search below overrides the tuned hyperparameters
xgb = XGBClassifier(random_state=RANDOM_STATE, tree_method="hist", eval_metric="logloss")

# Search space: 3 * 3 * 1 * 3 * 3 * 3 * 3 = 729 candidate settings
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[400],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    reg_lambda=[0.1, 1, 10],
    min_child_weight=[1, 3, 5],
)

# 3-fold stratified cross-validation with a fixed shuffle
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Exhaustive search scored by ROC-AUC, using all available cores
grid = GridSearchCV(xgb, param_grid, scoring='roc_auc', n_jobs=-1, cv=cv, verbose=1)
grid.fit(X_train, y_train)

print("Best Parameters Found:", grid.best_params_)
print("Best ROC-AUC Score from CV:", grid.best_score_)

# Held-out evaluation: probability of the class in column 1 drives ROC-AUC
y_proba = grid.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_proba)
print("Test ROC-AUC:", test_auc)

# Hard predictions for the per-class report
y_pred = grid.predict(X_test)
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred))

# Confusion matrix of the best estimator on the test set
ConfusionMatrixDisplay.from_estimator(grid, X_test, y_test)
plt.title("Confusion Matrix — Best XGBoost Model")
plt.show()
