In [1]:
from __future__ import annotations
import argparse, sys
from pathlib import Path
from typing import Optional, List, Tuple, Dict

import numpy as np
import pandas as pd
import joblib

# ----------- Paths (edit if needed) -----------
MODEL_PATH_DEFAULT = Path(r"C:\Users\NXTWAVE\Downloads\Return Radar\returns_model.pkl")
IN_CSV_DEFAULT     = Path(r"C:\Users\NXTWAVE\Downloads\Return Radar\archive\ecommerce_returns_synthetic_data.csv")
OUT_DIR_DEFAULT    = Path(r"C:\Users\NXTWAVE\Downloads\Return Radar")

# ----------- Helpers -----------
def ensure_outdir(p: Path) -> Path:
    p.mkdir(parents=True, exist_ok=True)
    return p

def map_return_status(series: pd.Series) -> pd.Series:
    """Map Return_Status to {0,1}, robust to common spellings."""
    x = series.astype(str).str.strip().str.lower()
    pos = {"returned","return","yes","y","true","1","approved","accepted","processed"}
    neg = {"not returned","not_returned","no return","no","n","false","0","rejected","none","nan"}
    def _m(v: str):
        if v in pos: return 1
        if v in neg: return 0
        if "not" in v: return 0
        if "return" in v: return 1
        return np.nan
    return x.map(_m)

def derive_ground_truth(df: pd.DataFrame) -> Optional[pd.Series]:
    """
    Try to derive 0/1 ground truth from Return_Status and/or Return_Date.
    Returns a Series of {0,1} or None if can’t be derived.
    """
    y = None
    if "Return_Status" in df.columns:
        y = map_return_status(df["Return_Status"])
    if "Return_Date" in df.columns:
        dt = pd.to_datetime(df["Return_Date"], errors="coerce")
        y_date = dt.notna().astype(int)
        y = y if y is not None else y_date
        y = pd.Series(np.where(pd.isna(y), y_date, y), index=df.index)
    if y is None:
        return None
    return y.astype(float).where(lambda s: s.isin([0,1]), other=np.nan).dropna()

def parse_order_date(df: pd.DataFrame) -> pd.DataFrame:
    """Add features derived from Order_Date (Year/Month/DOW/Hour) if present."""
    out = df.copy()
    if "Order_Date" in out.columns:
        od = pd.to_datetime(out["Order_Date"], errors="coerce", infer_datetime_format=True)
        out["Order_Year"]  = od.dt.year
        out["Order_Month"] = od.dt.month
        out["Order_DOW"]   = od.dt.dayofweek
        out["Order_Hour"]  = od.dt.hour
        # NOTE: during training we dropped Order_Date; do the same here
        out = out.drop(columns=["Order_Date"])
    return out

def drop_leakage_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop columns that were excluded during training because they leak post-shipment info.
    """
    leak_cols = [c for c in ["Return_Date","Return_Reason","Days_to_Return"] if c in df.columns]
    return df.drop(columns=leak_cols, errors="ignore")

def drop_id_and_target_like(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove ID/target-like columns that would not be used by the model.
    (The pipeline saved in returns_model.pkl already knows its feature names,
     so we keep inputs as close as possible.)
    """
    cols_to_drop = [c for c in ["Order_ID", "__returned__", "returned", "is_returned", "return", "label", "target"]
                    if c in df.columns]
    return df.drop(columns=cols_to_drop, errors="ignore")

def align_columns_for_pipeline(df: pd.DataFrame, model) -> pd.DataFrame:
    """
    Make sure df has the columns the pipeline expects.
    If the training pipeline used ColumnTransformer with explicit column names,
    sklearn will select by name. We therefore:
      - parse Order_Date
      - drop leakage/id/target-like cols
      - keep all remaining columns; the ColumnTransformer will ignore unknowns
        if it was built with explicit lists. (If it wasn’t, it still works
        because it selects by the exact names used at fit time.)
    """
    df2 = parse_order_date(df)
    df2 = drop_leakage_cols(df2)
    df2 = drop_id_and_target_like(df2)
    return df2

# ----------- Prediction core -----------
def predict_file(model_path: Path, in_csv: Path, out_dir: Path) -> Tuple[Path, Optional[Path], Optional[Path]]:
    if not model_path.exists():
        raise SystemExit(f"Model not found: {model_path}\nRun your training script to create returns_model.pkl.")
    model = joblib.load(model_path)

    if not in_csv.exists():
        raise SystemExit(f"Input CSV not found: {in_csv}")

    # Read input; keep a copy for output
    df_in = pd.read_csv(in_csv)
    if df_in.empty:
        raise SystemExit("Input CSV is empty.")

    # Prepare features to match training
    X = align_columns_for_pipeline(df_in, model)

    # Predict
    try:
        proba = model.predict_proba(X)[:, 1]
        pred  = (proba >= 0.5).astype(int)
    except Exception:
        # If the classifier doesn’t expose proba
        pred = model.predict(X)
        proba = np.zeros(len(pred), dtype=float)

    out = df_in.copy()
    out["proba_return"]  = proba
    out["returned_pred"] = pred

    out_dir = ensure_outdir(out_dir)
    base = in_csv.stem
    pred_path = out_dir / f"predictions_{base}.csv"
    out.to_csv(pred_path, index=False)

    # If we can derive ground truth, compute quick metrics
    y = derive_ground_truth(df_in)
    report_path = None
    cm_path = None

    if y is not None and y.notna().any():
        # Align with rows that had valid ground truth
        mask = y.index
        y_true = y.loc[mask].astype(int)
        y_pred = pd.Series(pred, index=df_in.index).loc[mask].astype(int)

        from sklearn.metrics import classification_report, confusion_matrix
        rep = classification_report(y_true, y_pred, output_dict=True)
        cm  = confusion_matrix(y_true, y_pred, labels=[0,1])

        # Save
        rep_df = pd.DataFrame(rep)
        report_path = out_dir / f"prediction_report_{base}.csv"
        rep_df.to_csv(report_path)

        cm_df = pd.DataFrame(cm, index=["true_0","true_1"], columns=["pred_0","pred_1"])
        cm_path = out_dir / "confusion_matrix_pred.csv"
        cm_df.to_csv(cm_path)

        print("[OK] Metrics saved:", report_path, cm_path)

    print("[OK] Predictions saved:", pred_path)
    return pred_path, report_path, cm_path

# ----------- CLI / Jupyter entry -----------
def main(argv: Optional[List[str]] = None):
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", type=str, default=str(MODEL_PATH_DEFAULT), help="Path to returns_model.pkl")
    ap.add_argument("--data",  type=str, default=str(IN_CSV_DEFAULT),     help="Path to CSV to score")
    ap.add_argument("--outdir",type=str, default=str(OUT_DIR_DEFAULT),    help="Where to write outputs")
    # Jupyter-safe parsing:
    args, _ = ap.parse_known_args(argv)

    pred_path, rep_path, cm_path = predict_file(Path(args.model), Path(args.data), Path(args.outdir))
    print("\nArtifacts:")
    print("  Predictions:", pred_path)
    if rep_path: print("  Report     :", rep_path)
    if cm_path:  print("  CM         :", cm_path)

    main(sys.argv[1:])


  od = pd.to_datetime(out["Order_Date"], errors="coerce", infer_datetime_format=True)


[OK] Metrics saved: C:\Users\NXTWAVE\Downloads\Return Radar\prediction_report_ecommerce_returns_synthetic_data.csv C:\Users\NXTWAVE\Downloads\Return Radar\confusion_matrix_pred.csv
[OK] Predictions saved: C:\Users\NXTWAVE\Downloads\Return Radar\predictions_ecommerce_returns_synthetic_data.csv

Artifacts:
  Predictions: C:\Users\NXTWAVE\Downloads\Return Radar\predictions_ecommerce_returns_synthetic_data.csv
  Report     : C:\Users\NXTWAVE\Downloads\Return Radar\prediction_report_ecommerce_returns_synthetic_data.csv
  CM         : C:\Users\NXTWAVE\Downloads\Return Radar\confusion_matrix_pred.csv
