In [1]:
# ==========================================================
# FraudLens â€” Accuracy Graph + Heatmap (Correlation + CM)
# ==========================================================
# Input CSV:
#   C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
#
# Outputs (saved to C:\Users\sagni\Downloads\FraudLens):
#   - fraudlens_feature_corr_heatmap.png
#   - fraudlens_accuracy_over_epochs.png
#   - fraudlens_accuracy_over_epochs.csv
#   - fraudlens_confusion_matrix.png     (bonus)
#   - fraudlens_model_report.txt         (bonus: quick metrics)
# ==========================================================

from pathlib import Path
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)

# -----------------------------
# Paths
# -----------------------------
CSV_PATH = Path(r"C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv")
OUT_DIR  = Path(r"C:\Users\sagni\Downloads\FraudLens")
OUT_DIR.mkdir(parents=True, exist_ok=True)

HEATMAP_PNG = OUT_DIR / "fraudlens_feature_corr_heatmap.png"
ACC_PNG     = OUT_DIR / "fraudlens_accuracy_over_epochs.png"
ACC_CSV     = OUT_DIR / "fraudlens_accuracy_over_epochs.csv"
CM_PNG      = OUT_DIR / "fraudlens_confusion_matrix.png"
REPORT_TXT  = OUT_DIR / "fraudlens_model_report.txt"

# -----------------------------
# Load
# -----------------------------
print(f"[INFO] Loading: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, low_memory=False)
print("[INFO] Shape:", df.shape)
print("[INFO] Columns:", list(df.columns))

# -----------------------------
# Detect label column
# -----------------------------
def find_label_col(cols):
    candidates = [
        r"\bclass\b", r"\blabel\b", r"\btarget\b", r"\bis[_\- ]?fraud\b",
        r"\bfraud\b", r"\by\b"
    ]
    norm = {c: re.sub(r"[^a-z0-9]+", " ", str(c).lower()).strip() for c in cols}
    for c, nc in norm.items():
        for pat in candidates:
            if re.search(pat, nc):
                return c
    return None

label_col = find_label_col(df.columns)
if label_col is None:
    raise KeyError(
        "Could not auto-detect a label column. "
        "Please rename your fraud flag to one of: Class, is_fraud, fraud, label, target, y."
    )
print(f"[INFO] Detected label column: {label_col}")

# -----------------------------
# Prepare X (numeric) and y (binary)
# -----------------------------
# Convert common string labels to binary if needed
y_raw = df[label_col]

if pd.api.types.is_numeric_dtype(y_raw):
    y = (y_raw.astype(float) > 0).astype(int).values  # treat >0 as fraud
else:
    low = y_raw.astype(str).str.lower().str.strip()
    pos_tokens = {"fraud", "true", "yes", "1", "y", "t"}
    y = low.apply(lambda v: 1 if v in pos_tokens else 0).astype(int).values

# Use only numeric features for modeling
num_df = df.select_dtypes(include=[np.number]).copy()
if label_col in num_df.columns:
    X = num_df.drop(columns=[label_col])
else:
    # label might be non-numeric; drop nothing in that case
    X = num_df

# If your dataset includes Time/Amount etc., they are numeric and will be included in X
print("[INFO] Feature count (numeric):", X.shape[1])

# -----------------------------
# Correlation Heatmap (numeric features)
# -----------------------------
# Compute correlation on numeric columns only (safe)
corr = X.corr(numeric_only=True)

plt.figure(figsize=(12, 9))
im = plt.imshow(corr.values, aspect='auto')
plt.xticks(range(corr.shape[1]), corr.columns, rotation=45, ha='right', fontsize=8)
plt.yticks(range(corr.shape[0]), corr.index, fontsize=8)
plt.title("FraudLens: Feature Correlation Heatmap")
plt.colorbar(im)
plt.tight_layout()
plt.savefig(HEATMAP_PNG, dpi=220)
plt.close()
print(f"[SAVED] Feature correlation heatmap -> {HEATMAP_PNG}")

# -----------------------------
# Train/Test split
# -----------------------------
X_np = X.fillna(0.0).astype(float).values  # simple NA handling
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

# -----------------------------
# Accuracy-over-epochs using SGD (logistic)
# -----------------------------
clf = SGDClassifier(
    loss="log_loss",
    alpha=1e-4,
    max_iter=1,       # manual epochs
    learning_rate="optimal",
    random_state=42,
    warm_start=True
)
classes = np.unique(y_train)
epochs = 15
train_acc, test_acc = [], []

for epoch in range(epochs):
    if epoch == 0:
        clf.partial_fit(X_train_sc, y_train, classes=classes)
    else:
        clf.partial_fit(X_train_sc, y_train)

    yhat_tr = clf.predict(X_train_sc)
    yhat_te = clf.predict(X_test_sc)

    train_acc.append(accuracy_score(y_train, yhat_tr))
    test_acc.append(accuracy_score(y_test, yhat_te))

# Plot accuracy curve
plt.figure(figsize=(8, 4.5))
plt.plot(range(1, epochs+1), train_acc, marker='o', label="Train Accuracy")
plt.plot(range(1, epochs+1), test_acc, marker='s', label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("FraudLens: Accuracy over Epochs (SGD Logistic)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] Accuracy graph -> {ACC_PNG}")

# Save accuracy CSV
pd.DataFrame({
    "epoch": list(range(1, epochs+1)),
    "train_accuracy": train_acc,
    "test_accuracy": test_acc
}).to_csv(ACC_CSV, index=False)
print(f"[SAVED] Accuracy CSV -> {ACC_CSV}")

# -----------------------------
# Bonus: final confusion matrix heatmap (train a balanced LR)
# -----------------------------
final_lr = LogisticRegression(
    solver="saga",
    penalty="l2",
    class_weight="balanced",  # helps with strong imbalance
    max_iter=400,
    n_jobs=-1,
    random_state=42
)
final_lr.fit(X_train_sc, y_train)
y_pred = final_lr.predict(X_test_sc)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
plt.figure(figsize=(5.2, 4.6))
im = plt.imshow(cm, aspect='equal')
plt.title("FraudLens: Confusion Matrix (Balanced LR)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar(im)
plt.xticks([0,1], ["Legit(0)", "Fraud(1)"])
plt.yticks([0,1], ["Legit(0)", "Fraud(1)"])
for (i, j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close()
print(f"[SAVED] Confusion matrix heatmap -> {CM_PNG}")

# Text report
report = classification_report(y_test, y_pred, digits=4)
with open(REPORT_TXT, "w", encoding="utf-8") as f:
    f.write("=== FraudLens: Balanced Logistic Regression Test Report ===\n\n")
    f.write(report + "\n")
print(f"[SAVED] Model report -> {REPORT_TXT}")

print("\n[DONE] Outputs in:", OUT_DIR)
print(" -", HEATMAP_PNG)
print(" -", ACC_PNG)
print(" -", ACC_CSV)
print(" -", CM_PNG)
print(" -", REPORT_TXT)


[INFO] Loading: C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
[INFO] Shape: (568630, 31)
[INFO] Columns: ['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
[INFO] Detected label column: Class
[INFO] Feature count (numeric): 30
[SAVED] Feature correlation heatmap -> C:\Users\sagni\Downloads\FraudLens\fraudlens_feature_corr_heatmap.png
[SAVED] Accuracy graph -> C:\Users\sagni\Downloads\FraudLens\fraudlens_accuracy_over_epochs.png
[SAVED] Accuracy CSV -> C:\Users\sagni\Downloads\FraudLens\fraudlens_accuracy_over_epochs.csv




[SAVED] Confusion matrix heatmap -> C:\Users\sagni\Downloads\FraudLens\fraudlens_confusion_matrix.png
[SAVED] Model report -> C:\Users\sagni\Downloads\FraudLens\fraudlens_model_report.txt

[DONE] Outputs in: C:\Users\sagni\Downloads\FraudLens
 - C:\Users\sagni\Downloads\FraudLens\fraudlens_feature_corr_heatmap.png
 - C:\Users\sagni\Downloads\FraudLens\fraudlens_accuracy_over_epochs.png
 - C:\Users\sagni\Downloads\FraudLens\fraudlens_accuracy_over_epochs.csv
 - C:\Users\sagni\Downloads\FraudLens\fraudlens_confusion_matrix.png
 - C:\Users\sagni\Downloads\FraudLens\fraudlens_model_report.txt
