In [1]:
from __future__ import annotations
import argparse, json, yaml, joblib, warnings, sys
from pathlib import Path
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

warnings.filterwarnings("ignore")

# ---------- DEFAULT PATHS (edit if needed) ----------
IN_CSV_DEFAULT = r"C:\Users\NXTWAVE\Downloads\Return Radar\archive\ecommerce_returns_synthetic_data.csv"
OUT_DIR_DEFAULT = r"C:\Users\NXTWAVE\Downloads\Return Radar"

# Candidate columns for the target and common fields
TARGET_CANDS = ["returned", "is_returned", "return", "label", "target"]
ID_CANDS     = ["order_id", "OrderID", "orderId", "id", "ID"]
CAT_CANDS    = ["category", "Category", "product_category", "dept", "Department"]
BRAND_CANDS  = ["brand", "Brand"]
DATE_CANDS   = ["order_time", "order_date", "OrderDate", "purchase_date"]
TEXT_CANDS   = ["review_text", "return_reason", "comments", "notes", "description"]
REGION_CANDS = ["region", "Region", "state", "State", "city", "City"]
DEVICE_CANDS = ["device", "platform", "channel"]
NUM_HINTS    = ["price", "discount", "quantity", "qty", "delivered_days",
                "promised_delivery_days", "shipping_cost", "margin", "weight"]

# ---------- helpers ----------
def ensure_outdir(p: str | Path) -> Path:
    p = Path(p)
    p.mkdir(parents=True, exist_ok=True)
    return p

def pick_col(df: pd.DataFrame, cands: List[str]) -> Optional[str]:
    for c in cands:
        if c in df.columns:
            return c
    return None

def detect_target(df: pd.DataFrame) -> str:
    t = pick_col(df, TARGET_CANDS)
    if not t:
        raise SystemExit(f"Could not find target column among {TARGET_CANDS}. Available: {df.columns.tolist()}")
    return t

def coerce_binary(y: pd.Series) -> pd.Series:
    """
    Map various encodings to {0,1}: 'Yes'/'No', 'y'/'n', 'true'/'false', strings '0'/'1', etc.
    """
    y2 = y.copy()
    if y2.dtype == bool:
        return y2.astype(int)
    # normalize case/strings
    y2 = y2.astype(str).str.strip().str.lower()
    mapping = {
        "1":1, "0":0, "yes":1, "no":0, "y":1, "n":0, "true":1, "false":0, "returned":1, "not returned":0
    }
    return y2.map(lambda v: mapping.get(v, np.nan)).astype(float)

def parse_possible_datetimes(df: pd.DataFrame, date_cols: List[str]) -> pd.DataFrame:
    out = df.copy()
    for col in date_cols:
        if col in out.columns:
            dt = pd.to_datetime(out[col], errors="coerce", utc=False, infer_datetime_format=True)
            valid_frac = dt.notna().mean()
            if valid_frac > 0.5:
                out[f"{col}_year"]  = dt.dt.year
                out[f"{col}_month"] = dt.dt.month
                out[f"{col}_dow"]   = dt.dt.dayofweek
                out[f"{col}_hour"]  = dt.dt.hour
                # keep original column as categorical string (e.g., '2024-06' style) or drop it
                # here we drop to avoid too many columns
                out = out.drop(columns=[col])
    return out

def detect_feature_types(df: pd.DataFrame, target_col: str) -> Tuple[List[str], List[str], Optional[str]]:
    id_col = pick_col(df, ID_CANDS)
    drop = {target_col}
    if id_col:
        drop.add(id_col)
    # Numeric vs categorical
    num_cols = [c for c in df.columns
                if pd.api.types.is_numeric_dtype(df[c]) and c not in drop]
    cat_cols = [c for c in df.columns
                if not pd.api.types.is_numeric_dtype(df[c]) and c not in drop]
    return num_cols, cat_cols, id_col

def build_pipeline(num_cols: List[str], cat_cols: List[str]) -> Pipeline:
    num_proc = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),  # works with sparse output
    ])
    cat_proc = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=True)),
    ])
    pre = ColumnTransformer([
        ("num", num_proc, num_cols),
        ("cat", cat_proc, cat_cols),
    ])
    clf = LogisticRegression(
        C=4.0, max_iter=500, class_weight="balanced", n_jobs=-1, solver="lbfgs"
    )
    return Pipeline([("pre", pre), ("clf", clf)])

def top_coef_for_positive(pipe: Pipeline, k: int = 20) -> List[Tuple[str, float]]:
    """
    Extract top positive-class coefficients from LogisticRegression over transformed features.
    """
    try:
        pre: ColumnTransformer = pipe.named_steps["pre"]
        clf: LogisticRegression = pipe.named_steps["clf"]
        names = pre.get_feature_names_out()
        coefs = clf.coef_[0]
        order = np.argsort(coefs)[::-1]
        idx = order[:k]
        return [(str(names[i]), float(coefs[i])) for i in idx]
    except Exception:
        return []

# ---------- main training / export ----------
def train_and_export(in_csv: Path, outdir: Path, seed: int = 42):
    # Load
    if not in_csv.exists():
        raise SystemExit(f"Input CSV not found: {in_csv}")
    df = pd.read_csv(in_csv)
    if df.empty:
        raise SystemExit("Input CSV is empty.")

    # Target
    target_col = detect_target(df)
    y = coerce_binary(df[target_col])
    # keep rows with known label (0/1)
    mask = y.isin([0, 1])
    if mask.sum() < 50:
        raise SystemExit(f"Too few labeled rows after target mapping: {mask.sum()} (need >= 50).")
    df = df.loc[mask].copy()
    y = y.loc[mask].astype(int)

    # Optional date parsing on likely date columns
    candidates = [c for c in DATE_CANDS if c in df.columns]
    df = parse_possible_datetimes(df, candidates)

    # Identify features
    num_cols, cat_cols, id_col = detect_feature_types(df, target_col)
    # Drop target, and optional ID from features
    X = df.drop(columns=[target_col] + ([id_col] if id_col else []))

    # Train/valid split (stratified)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

    # Build & fit
    pipe = build_pipeline(num_cols=[c for c in num_cols if c in X.columns],
                          cat_cols=[c for c in cat_cols if c in X.columns])
    pipe.fit(Xtr, ytr)

    # Evaluate
    yhat = pipe.predict(Xte)
    try:
        yproba = pipe.predict_proba(Xte)[:, 1]
        auc = float(roc_auc_score(yte, yproba))
    except Exception:
        yproba = None
        auc = float("nan")

    rep = classification_report(yte, yhat, output_dict=True)
    cm  = confusion_matrix(yte, yhat, labels=[0, 1])

    # Save PKL model
    model_path = outdir / "returns_model.pkl"
    joblib.dump(pipe, model_path)

    # Save HDF5 with holdout predictions
    te_out = Xte.copy()
    te_out["returned_true"] = yte.values
    te_out["returned_pred"] = yhat
    if yproba is not None:
        te_out["proba_return"] = yproba
    # requires `tables` package installed
    te_out.to_hdf(outdir / "processed_returns.h5", key="holdout", mode="w")

    # Insights JSON (quick business views)
    insights: Dict[str, object] = {
        "rows_total": int(len(df)),
        "return_rate_total": float(y.mean()),
        "auc": auc,
        "top_positive_features": top_coef_for_positive(pipe, k=20),
    }
    # Optional group by category / brand / region if columns exist
    for name, cands in [("by_category", CAT_CANDS), ("by_brand", BRAND_CANDS), ("by_region", REGION_CANDS)]:
        col = pick_col(df, cands)
        if col:
            tmp = pd.DataFrame({col: df[col], "returned": y})
            grp = tmp.groupby(col)["returned"].mean().sort_values(ascending=False).head(20)
            insights[name] = grp.round(4).to_dict()

    with open(outdir / "insights.json", "w", encoding="utf-8") as f:
        json.dump(insights, f, ensure_ascii=False, indent=2)

    # YAML metadata
    meta = {
        "input_csv": str(in_csv),
        "shape": [int(df.shape[0]), int(df.shape[1])],
        "target": target_col,
        "id_column": id_col,
        "numeric_cols": [c for c in num_cols if c in X.columns],
        "categorical_cols": [c for c in cat_cols if c in X.columns],
        "model": "OneHot + LogisticRegression(class_weight=balanced, C=4.0, max_iter=500)",
        "metrics": {
            "accuracy": float(rep.get("accuracy", 0.0)),
            "precision_pos": float(rep.get("1", {}).get("precision", 0.0)),
            "recall_pos": float(rep.get("1", {}).get("recall", 0.0)),
            "f1_pos": float(rep.get("1", {}).get("f1-score", 0.0)),
            "precision_neg": float(rep.get("0", {}).get("precision", 0.0)),
            "recall_neg": float(rep.get("0", {}).get("recall", 0.0)),
            "f1_neg": float(rep.get("0", {}).get("f1-score", 0.0)),
            "auc": auc,
        },
    }
    with open(outdir / "build_metadata.yaml", "w", encoding="utf-8") as f:
        yaml.safe_dump(meta, f, sort_keys=False)

    # Also drop CSVs (optional but handy)
    pd.DataFrame(rep).to_csv(outdir / "eval_report.csv")
    pd.DataFrame(cm, index=["true_0", "true_1"], columns=["pred_0", "pred_1"]).to_csv(outdir / "confusion_matrix.csv")

    print("\n[OK] Artifacts written to:", outdir)
    for p in ["processed_returns.h5", "returns_model.pkl", "build_metadata.yaml", "insights.json",
              "eval_report.csv", "confusion_matrix.csv"]:
        print(" ", outdir / p)

# ---------- entry point ----------
def main(argv: Optional[List[str]] = None):
    ap = argparse.ArgumentParser()
    ap.add_argument("--data", type=str, default=IN_CSV_DEFAULT, help="Path to ecommerce returns CSV")
    ap.add_argument("--outdir", type=str, default=OUT_DIR_DEFAULT, help="Where to save artifacts")
    ap.add_argument("--seed", type=int, default=42)
    # Jupyter-safe:
    args, _ = ap.parse_known_args(argv)

    in_csv = Path(args.data)
    outdir = ensure_outdir(args.outdir)
    train_and_export(in_csv, outdir, seed=args.seed)

if __name__ == "__main__":
    main(sys.argv[1:])


SystemExit: Could not find target column among ['returned', 'is_returned', 'return', 'label', 'target']. Available: ['Order_ID', 'Product_ID', 'User_ID', 'Order_Date', 'Return_Date', 'Product_Category', 'Product_Price', 'Order_Quantity', 'Return_Reason', 'Return_Status', 'Days_to_Return', 'User_Age', 'User_Gender', 'User_Location', 'Payment_Method', 'Shipping_Method', 'Discount_Applied']