In [1]:
# ==========================================================
# FraudLens — Train & Predict (with probabilities + plots)
# ==========================================================
# Input:
#   C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
#
# Outputs to C:\Users\sagni\Downloads\FraudLens:
#   - fraudlens_model.pkl
#   - fraudlens_test_predictions.csv
#   - fraudlens_metrics.json
#   - fraudlens_confusion_matrix.png
#   - fraudlens_roc_curve.png
#   - fraudlens_pr_curve.png
#   - fraudlens_feature_importance.csv
#   - fraudlens_classification_report.txt
# ==========================================================

from pathlib import Path
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve,
    confusion_matrix, classification_report
)
import joblib

# -----------------------------
# Paths
# -----------------------------
CSV_PATH = Path(r"C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv")
OUT_DIR  = Path(r"C:\Users\sagni\Downloads\FraudLens")
OUT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PKL   = OUT_DIR / "fraudlens_model.pkl"
PRED_CSV    = OUT_DIR / "fraudlens_test_predictions.csv"
METRICS_JSON= OUT_DIR / "fraudlens_metrics.json"
CM_PNG      = OUT_DIR / "fraudlens_confusion_matrix.png"
ROC_PNG     = OUT_DIR / "fraudlens_roc_curve.png"
PR_PNG      = OUT_DIR / "fraudlens_pr_curve.png"
FI_CSV      = OUT_DIR / "fraudlens_feature_importance.csv"
CLS_TXT     = OUT_DIR / "fraudlens_classification_report.txt"

# -----------------------------
# Load
# -----------------------------
print(f"[INFO] Loading: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, low_memory=False)
print("[INFO] Shape:", df.shape)
print("[INFO] Columns:", list(df.columns))

# -----------------------------
# Detect label column
# -----------------------------
def find_label_col(cols):
    pats = [
        r"\bclass\b", r"\blabel\b", r"\btarget\b",
        r"\bis[_\-\s]*fraud\b", r"\bfraud\b", r"^y$"
    ]
    norm = {c: re.sub(r"[^a-z0-9]+", " ", str(c).lower()).strip() for c in cols}
    for c, nc in norm.items():
        for p in pats:
            if re.search(p, nc):
                return c
    return None

label_col = find_label_col(df.columns)
if label_col is None:
    raise KeyError("Could not find label column. Rename your label to one of: Class, fraud, is_fraud, label, target, y.")

print(f"[INFO] Detected label column: {label_col}")

# -----------------------------
# y (binary) and X (numeric)
# -----------------------------
y_raw = df[label_col]
# Map to 0/1
if pd.api.types.is_numeric_dtype(y_raw):
    y = (y_raw.astype(float) > 0).astype(int).values
else:
    lower = y_raw.astype(str).str.lower().str.strip()
    pos = {"1","true","t","yes","y","fraud"}
    y = lower.apply(lambda v: 1 if v in pos else 0).astype(int).values

# Numeric features
num_df = df.select_dtypes(include=[np.number]).copy()
if label_col in num_df.columns:
    X = num_df.drop(columns=[label_col])
else:
    X = num_df

# Fill missing with 0 (you can choose median/iterative imputation if needed)
X = X.fillna(0.0).astype(float)
print("[INFO] Feature count:", X.shape[1])

# -----------------------------
# Train/Test split (stratified)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Pipeline: Standardize + Balanced Logistic Regression
# -----------------------------
pipe = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="saga",
        penalty="l2",
        class_weight="balanced",   # handles extreme imbalance robustly
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    ))
])

print("[INFO] Training model...")
pipe.fit(X_train, y_train)

# -----------------------------
# Predict probabilities & labels
# -----------------------------
probs = pipe.predict_proba(X_test)[:, 1]
preds = (probs >= 0.5).astype(int)  # default 0.5 threshold

# -----------------------------
# Metrics
# -----------------------------
roc_auc = roc_auc_score(y_test, probs)
pr_auc  = average_precision_score(y_test, probs)  # area under PR (AP)
cm = confusion_matrix(y_test, preds, labels=[0,1])

print(f"[INFO] ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f}")
print("[INFO] Confusion matrix:\n", cm)

# -----------------------------
# Save predictions CSV
# -----------------------------
pred_df = pd.DataFrame({
    "row_id": np.arange(len(y_test)),
    "y_true": y_test,
    "prob_fraud": probs,
    "y_pred": preds
})
pred_df.to_csv(PRED_CSV, index=False)
print(f"[SAVED] Predictions -> {PRED_CSV}")

# -----------------------------
# Curves: ROC & Precision-Recall
# -----------------------------
# ROC
fpr, tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"ROC-AUC={roc_auc:.3f}")
plt.plot([0,1], [0,1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("FraudLens: ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.savefig(ROC_PNG, dpi=220)
plt.close()
print(f"[SAVED] ROC curve -> {ROC_PNG}")

# PR
prec, rec, _ = precision_recall_curve(y_test, probs)
plt.figure(figsize=(6, 5))
plt.plot(rec, prec, label=f"PR-AUC={pr_auc:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("FraudLens: Precision-Recall Curve")
plt.legend(loc="upper right")
plt.grid(True)
plt.tight_layout()
plt.savefig(PR_PNG, dpi=220)
plt.close()
print(f"[SAVED] PR curve -> {PR_PNG}")

# -----------------------------
# Confusion Matrix heatmap
# -----------------------------
plt.figure(figsize=(5.2, 4.6))
im = plt.imshow(cm, aspect='equal')
plt.title("FraudLens: Confusion Matrix (Thresh=0.5)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar(im)
plt.xticks([0,1], ["Legit (0)", "Fraud (1)"])
plt.yticks([0,1], ["Legit (0)", "Fraud (1)"])
for (i, j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close()
print(f"[SAVED] Confusion matrix -> {CM_PNG}")

# -----------------------------
# Feature importance (coef magnitude)
# -----------------------------
# For LR: coef_ after scaling are interpretable
# We'll compute absolute coefficients and map back to column names
clf = pipe.named_steps["clf"]
scaler = pipe.named_steps["scaler"]

if hasattr(clf, "coef_"):
    coefs = clf.coef_.ravel()  # shape (n_features,)
    # map to original feature names
    feat_names = X.columns.tolist()
    imp = pd.DataFrame({
        "feature": feat_names,
        "coef": coefs,
        "importance_abs": np.abs(coefs)
    }).sort_values("importance_abs", ascending=False)
    imp.to_csv(FI_CSV, index=False)
    print(f"[SAVED] Feature importance -> {FI_CSV}")

# -----------------------------
# Save classification report & metrics JSON
# -----------------------------
report = classification_report(y_test, preds, digits=4)
with open(CLS_TXT, "w", encoding="utf-8") as f:
    f.write("=== FraudLens: Logistic Regression (balanced) ===\n\n")
    f.write(report + "\n")
print(f"[SAVED] Classification report -> {CLS_TXT}")

metrics = {
    "roc_auc": float(roc_auc),
    "pr_auc": float(pr_auc),
    "confusion_matrix": {
        "tn": int(cm[0,0]), "fp": int(cm[0,1]),
        "fn": int(cm[1,0]), "tp": int(cm[1,1])
    }
}
with open(METRICS_JSON, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
print(f"[SAVED] Metrics JSON -> {METRICS_JSON}")

# -----------------------------
# Save trained pipeline
# -----------------------------
joblib.dump(pipe, MODEL_PKL)
print(f"[SAVED] Model -> {MODEL_PKL}")

print("\n[DONE] All artifacts saved in:", OUT_DIR)


[INFO] Loading: C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
[INFO] Shape: (568630, 31)
[INFO] Columns: ['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
[INFO] Detected label column: Class
[INFO] Feature count: 30
[INFO] Training model...




[INFO] ROC-AUC: 0.9998 | PR-AUC: 0.9999
[INFO] Confusion matrix:
 [[56810    53]
 [  173 56690]]
[SAVED] Predictions -> C:\Users\sagni\Downloads\FraudLens\fraudlens_test_predictions.csv
[SAVED] ROC curve -> C:\Users\sagni\Downloads\FraudLens\fraudlens_roc_curve.png
[SAVED] PR curve -> C:\Users\sagni\Downloads\FraudLens\fraudlens_pr_curve.png
[SAVED] Confusion matrix -> C:\Users\sagni\Downloads\FraudLens\fraudlens_confusion_matrix.png
[SAVED] Feature importance -> C:\Users\sagni\Downloads\FraudLens\fraudlens_feature_importance.csv
[SAVED] Classification report -> C:\Users\sagni\Downloads\FraudLens\fraudlens_classification_report.txt
[SAVED] Metrics JSON -> C:\Users\sagni\Downloads\FraudLens\fraudlens_metrics.json
[SAVED] Model -> C:\Users\sagni\Downloads\FraudLens\fraudlens_model.pkl

[DONE] All artifacts saved in: C:\Users\sagni\Downloads\FraudLens
