# In [2]:
from __future__ import annotations
import argparse, json, yaml, joblib, warnings, sys
from pathlib import Path
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Silence every warning category for the whole process (this also hides
# deprecation and convergence warnings raised by pandas / scikit-learn).
warnings.filterwarnings("ignore")

# --------- Paths ---------
# Defaults for the --data and --outdir command-line options defined in main().
# NOTE(review): these are absolute, machine-specific Windows paths; pass
# --data / --outdir explicitly when running anywhere else.
IN_CSV_DEFAULT  = r"C:\Users\NXTWAVE\Downloads\Return Radar\archive\ecommerce_returns_synthetic_data.csv"
OUT_DIR_DEFAULT = r"C:\Users\NXTWAVE\Downloads\Return Radar"

# --------- Helpers ---------
def ensure_outdir(p: str | Path) -> Path:
    """Create directory ``p`` (and any missing parents) and return it as a Path.

    An already existing directory is accepted as-is.
    """
    target = Path(p)
    target.mkdir(parents=True, exist_ok=True)
    return target

def map_return_status(s: pd.Series) -> pd.Series:
    """Translate free-text return statuses into 1 (returned) / 0 (not returned).

    Values are stringified, stripped and lower-cased before matching. Exact
    matches against the known spellings win; otherwise any value containing
    "not" counts as 0 and any remaining value containing "return" counts as 1.
    Everything else becomes NaN. Note that the strings "nan" and "none" are
    listed as negatives, so values that stringify that way map to 0.
    """
    positive = frozenset({
        "returned", "return", "yes", "y", "true", "1",
        "approved", "accepted", "processed",
    })
    negative = frozenset({
        "not returned", "not_returned", "no return", "no", "n",
        "false", "0", "rejected", "none", "nan",
    })

    def classify(token: str):
        if token in positive:
            return 1
        # The "not" check must come before the "return" check so that
        # phrases such as "was not returned" are labelled 0.
        if token in negative or "not" in token:
            return 0
        return 1 if "return" in token else np.nan

    normalized = s.astype(str).str.strip().str.lower()
    return normalized.map(classify)

def derive_target(df: pd.DataFrame) -> pd.Series:
    """Build the float 0/1 return label for every row of ``df``.

    Return_Status (via ``map_return_status``) is the primary source. When a
    Return_Date column exists, rows whose status label is NaN are filled with
    1 if the date parses and 0 otherwise; with no Return_Status column the
    date indicator is the label. Because ``map_return_status`` maps the
    strings "nan"/"none" to 0, the date fallback only fills statuses that
    could not be recognised at all.

    Raises:
        SystemExit: if neither Return_Status nor Return_Date is present.
    """
    has_status = "Return_Status" in df.columns
    has_date = "Return_Date" in df.columns
    if not (has_status or has_date):
        raise SystemExit("Could not derive target: need Return_Status and/or Return_Date.")

    labels = map_return_status(df["Return_Status"]) if has_status else None

    if has_date:
        parsed = pd.to_datetime(df["Return_Date"], errors="coerce")
        from_date = parsed.notna().astype(int)
        primary = from_date if labels is None else labels
        # Keep the status-based label where it exists, else use the date flag.
        merged = np.where(pd.isna(primary), from_date, primary)
        labels = pd.Series(merged, index=df.index)

    return labels.astype(float)

def parse_order_date(df: pd.DataFrame) -> pd.DataFrame:
    """Replace Order_Date with numeric calendar features.

    Works on a copy; the input frame is not modified. When an Order_Date
    column exists it is parsed (unparseable values become NaT) and replaced by
    Order_Year, Order_Month, Order_DOW (Monday=0) and Order_Hour. Rows whose
    date could not be parsed get NaN in all four columns. Frames without an
    Order_Date column are returned as an unchanged copy.
    """
    out = df.copy()
    if "Order_Date" not in out.columns:
        return out

    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0 (format inference is the default there) and only
    # affected parsing speed, not results, on older versions.
    od = pd.to_datetime(out["Order_Date"], errors="coerce")
    out["Order_Year"] = od.dt.year
    out["Order_Month"] = od.dt.month
    out["Order_DOW"] = od.dt.dayofweek
    out["Order_Hour"] = od.dt.hour
    return out.drop(columns=["Order_Date"])

def detect_features(df: pd.DataFrame, target_col: str, drop_cols: List[str]) -> Tuple[List[str], List[str], Optional[str]]:
    """Split the usable columns of ``df`` into numeric and categorical lists.

    The target column, every name in ``drop_cols`` and the Order_ID column
    (when present) are excluded. Remaining columns are classified by dtype,
    keeping the frame's column order.

    Returns:
        ``(numeric_columns, categorical_columns, id_column)`` where
        ``id_column`` is "Order_ID" or None.
    """
    id_col = "Order_ID" if "Order_ID" in df.columns else None

    excluded = {target_col, *drop_cols}
    if id_col is not None:
        excluded.add(id_col)

    num_cols: List[str] = []
    cat_cols: List[str] = []
    for name in df.columns:
        if name in excluded:
            continue
        bucket = num_cols if pd.api.types.is_numeric_dtype(df[name]) else cat_cols
        bucket.append(name)
    return num_cols, cat_cols, id_col

def make_ohe_dense() -> OneHotEncoder:
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    The keyword that requests dense output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2, so the new spelling is tried first
    and the old one is used when the constructor rejects it.
    """
    shared = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared)  # sklearn >= 1.2
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared)  # sklearn < 1.2
    return encoder

def build_pipeline(num_cols: List[str], cat_cols: List[str]) -> Pipeline:
    """Assemble the preprocessing + logistic-regression pipeline.

    Numeric columns: median imputation, then scaling to unit variance
    (no centering, ``with_mean=False``).
    Categorical columns: most-frequent imputation, then dense one-hot encoding.
    The classifier is a class-balanced LogisticRegression using lbfgs.

    Returns:
        An unfitted Pipeline with the steps "pre" (ColumnTransformer) and
        "clf" (LogisticRegression).
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        # Centering would also work on dense input; leaving it off is harmless.
        ("scaler", StandardScaler(with_mean=False)),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", make_ohe_dense()),
    ]
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_cols),
            ("cat", Pipeline(steps=categorical_steps), cat_cols),
        ]
    )
    # Features are dense after preprocessing, which suits the lbfgs solver.
    classifier = LogisticRegression(
        solver="lbfgs",
        C=4.0,
        max_iter=500,
        class_weight="balanced",
    )
    return Pipeline(steps=[("pre", preprocessor), ("clf", classifier)])

def top_positive_features(pipe: Pipeline, k: int = 20) -> List[Tuple[str, float]]:
    """Return the ``k`` features with the largest classifier coefficients.

    Feature names come from the pipeline's "pre" step and coefficients from
    the first row of the "clf" step's ``coef_``. Results are ordered from the
    highest coefficient downwards; they are not filtered by sign, so negative
    coefficients can appear when fewer than ``k`` are positive.

    Args:
        pipe: Fitted pipeline exposing ``named_steps["pre"]`` and
            ``named_steps["clf"]``.
        k: Maximum number of features to return; values <= 0 yield ``[]``.

    Returns:
        List of ``(feature_name, coefficient)`` tuples. This is best-effort:
        if names or coefficients cannot be read, a warning is printed and an
        empty list is returned.
    """
    # A negative k would otherwise slice "all but the last |k|" entries.
    if k <= 0:
        return []
    try:
        steps = pipe.named_steps
        names = steps["pre"].get_feature_names_out()
        coefs = steps["clf"].coef_[0]
        best_first = np.argsort(coefs)[::-1][:k]
        return [(str(names[i]), float(coefs[i])) for i in best_first]
    except Exception as exc:
        # Stay best-effort (insights are optional) but do not hide the cause.
        print("[WARN] Could not extract top features:", exc)
        return []

# --------- Train & Export ---------
def train_and_export(in_csv: Path, outdir: Path, seed: int = 42):
    """Train the return-prediction model on ``in_csv`` and write artifacts.

    Pipeline of work:
      1. Load the CSV and derive a 0/1 target with ``derive_target``; rows
         without a definite label are discarded.
      2. Remove columns that reveal the outcome (Return_Status, Return_Date,
         Return_Reason, Days_to_Return) and the Order_ID column, and expand
         Order_Date into calendar features.
      3. Fit the pipeline from ``build_pipeline`` on a stratified 80/20 split
         and evaluate on the held-out 20%.
      4. Write into ``outdir``: returns_model.pkl (joblib), processed_returns.h5
         (holdout rows with predictions, key "holdout"; skipped with a warning
         if it cannot be written), insights.json, build_metadata.yaml,
         eval_report.csv and confusion_matrix.csv.

    Args:
        in_csv: Path of the input CSV file.
        outdir: Existing directory that receives the artifacts.
        seed: Random seed for the train/test split.

    Raises:
        SystemExit: if the CSV is missing or empty, or fewer than 50 rows
            receive a 0/1 label.
    """
    if not in_csv.exists():
        raise SystemExit(f"Input CSV not found: {in_csv}")
    df_raw = pd.read_csv(in_csv)
    if df_raw.empty:
        raise SystemExit("Input CSV is empty.")

    # Derive target; only rows with a definite 0/1 label are usable.
    y = derive_target(df_raw)
    mask = y.isin([0, 1])
    if mask.sum() < 50:
        raise SystemExit(f"Too few labeled rows after mapping Return_Status/Return_Date: {mask.sum()} (need >= 50).")

    df = df_raw.loc[mask].copy()
    y = y.loc[mask].astype(int)

    # Drop columns that leak the outcome. Return_Status is the column the
    # target is derived from, so it must never be offered as a feature; the
    # others are only known after a return has happened.
    leak_candidates = ("Return_Status", "Return_Date", "Return_Reason", "Days_to_Return")
    leak_cols = [c for c in leak_candidates if c in df.columns]
    target_col = "__returned__"
    df[target_col] = y

    # Parse order date into features
    df = parse_order_date(df)

    # Detect feature types
    num_cols, cat_cols, id_col = detect_features(df, target_col, drop_cols=leak_cols)

    # Build X,y
    X = df.drop(columns=[target_col] + leak_cols + ([id_col] if id_col else []))
    y = df[target_col].astype(int)

    # Split
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

    # Pipeline (dense OHE)
    used_num_cols = [c for c in num_cols if c in X.columns]
    used_cat_cols = [c for c in cat_cols if c in X.columns]
    pipe = build_pipeline(num_cols=used_num_cols, cat_cols=used_cat_cols)
    pipe.fit(Xtr, ytr)

    # Evaluate
    yhat = pipe.predict(Xte)
    try:
        yproba = pipe.predict_proba(Xte)[:, 1]
        auc = float(roc_auc_score(yte, yproba))
    except Exception:
        # AUC is optional: fall back to NaN when it cannot be computed.
        yproba = None
        auc = float("nan")

    rep = classification_report(yte, yhat, output_dict=True)
    cm = confusion_matrix(yte, yhat, labels=[0, 1])

    # Save model
    model_path = outdir / "returns_model.pkl"
    joblib.dump(pipe, model_path)

    # Save HDF5 with holdout predictions
    te_out = Xte.copy()
    te_out["returned_true"] = yte.values
    te_out["returned_pred"] = yhat
    if yproba is not None:
        te_out["proba_return"] = yproba
    hdf_written = False
    try:
        te_out.to_hdf(outdir / "processed_returns.h5", key="holdout", mode="w")  # needs `tables`
        hdf_written = True
    except Exception as e:
        print("[WARN] Could not write HDF5 (install `tables`):", e)

    # Insights JSON
    insights: Dict[str, object] = {
        "rows_total": int(len(df)),
        "return_rate_total": float(y.mean()),
        "auc": auc,
        "top_positive_features": top_positive_features(pipe, 20),
    }
    # Per-segment return rates (top 20 groups by rate) for the columns present.
    for name, col in [("by_category", "Product_Category"),
                      ("by_payment", "Payment_Method"),
                      ("by_shipping", "Shipping_Method"),
                      ("by_location", "User_Location")]:
        if col in df_raw.columns:
            tmp = pd.DataFrame({col: df_raw.loc[mask, col], "returned": y})
            grp = tmp.groupby(col)["returned"].mean().sort_values(ascending=False).head(20)
            insights[name] = grp.round(4).to_dict()

    with open(outdir / "insights.json", "w", encoding="utf-8") as f:
        json.dump(insights, f, ensure_ascii=False, indent=2)

    # YAML metadata
    meta = {
        "input_csv": str(in_csv),
        "shape": [int(df.shape[0]), int(df.shape[1])],
        "target_from": ["Return_Status", "Return_Date (fallback)"],
        "dropped_leak_columns": leak_cols,
        "id_column": id_col,
        "numeric_cols": used_num_cols,
        "categorical_cols": used_cat_cols,
        "model": "OneHot(dense) + LogisticRegression(class_weight=balanced, C=4.0, max_iter=500)",
        "metrics": {
            "accuracy": float(rep.get("accuracy", 0.0)),
            "precision_pos": float(rep.get("1", {}).get("precision", 0.0)),
            "recall_pos": float(rep.get("1", {}).get("recall", 0.0)),
            "f1_pos": float(rep.get("1", {}).get("f1-score", 0.0)),
            "precision_neg": float(rep.get("0", {}).get("precision", 0.0)),
            "recall_neg": float(rep.get("0", {}).get("recall", 0.0)),
            "f1_neg": float(rep.get("0", {}).get("f1-score", 0.0)),
            "auc": auc,
        },
    }
    with open(outdir / "build_metadata.yaml", "w", encoding="utf-8") as f:
        yaml.safe_dump(meta, f, sort_keys=False)

    # CSV helpers
    pd.DataFrame(rep).to_csv(outdir / "eval_report.csv")
    pd.DataFrame(cm, index=["true_0", "true_1"], columns=["pred_0", "pred_1"]).to_csv(outdir / "confusion_matrix.csv")

    # Only list files that were actually produced.
    artifacts = ["returns_model.pkl", "build_metadata.yaml", "insights.json",
                 "eval_report.csv", "confusion_matrix.csv"]
    if hdf_written:
        artifacts.insert(0, "processed_returns.h5")

    print("\n[OK] Artifacts written to:", outdir)
    for p in artifacts:
        print(" ", outdir / p)

# --------- Entry Point ---------
def main(argv: Optional[List[str]] = None):
    """Parse command-line options and run the training/export job.

    Options: ``--data`` (input CSV), ``--outdir`` (artifact directory, created
    if missing) and ``--seed`` (split seed). Unrecognised arguments are
    ignored so the function can be called from a Jupyter kernel.
    """
    parser = argparse.ArgumentParser()
    option_table = (
        ("--data", str, IN_CSV_DEFAULT),
        ("--outdir", str, OUT_DIR_DEFAULT),
        ("--seed", int, 42),
    )
    for flag, kind, default in option_table:
        parser.add_argument(flag, type=kind, default=default)

    # Jupyter-safe: tolerate arguments the parser does not know about.
    known, _unknown = parser.parse_known_args(argv)

    csv_path = Path(known.data)
    target_dir = ensure_outdir(known.outdir)
    train_and_export(csv_path, target_dir, seed=known.seed)

# Script entry point: forward the command-line arguments (without the program
# name) to main(). Arguments main() does not define are ignored there because
# it parses with parse_known_args.
if __name__ == "__main__":
    main(sys.argv[1:])



# [OK] Artifacts written to: C:\Users\NXTWAVE\Downloads\Return Radar
#   C:\Users\NXTWAVE\Downloads\Return Radar\processed_returns.h5
#   C:\Users\NXTWAVE\Downloads\Return Radar\returns_model.pkl
#   C:\Users\NXTWAVE\Downloads\Return Radar\build_metadata.yaml
#   C:\Users\NXTWAVE\Downloads\Return Radar\insights.json
#   C:\Users\NXTWAVE\Downloads\Return Radar\eval_report.csv
#   C:\Users\NXTWAVE\Downloads\Return Radar\confusion_matrix.csv
