In [1]:
# run_multimodal_fraud.py
# Multimodal fraud detection: location modality + transaction modality + stacking
# Fully-ready: auto-download IEEE-CIS data via Kaggle API or auto-generate synthetic fallback.
# Notebook-safe argparse: ignores Jupyter's hidden "-f" argument.

import os
import io
import sys
import zipfile
import argparse
import warnings
import subprocess
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

KAGGLE_COMPETITION = "ieee-fraud-detection"  # Kaggle competition slug
REQUIRED_FILES = ["train_transaction.csv", "train_identity.csv"]

def pip_install(pkg: str):
    try:
        __import__(pkg.replace("-", "_"))
        return True
    except Exception:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
            return True
        except Exception:
            return False

def try_download_kaggle_competition(data_dir: str) -> bool:
    data_path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)
    if all((data_path / f).exists() for f in REQUIRED_FILES):
        return True

    if not pip_install("kaggle"):
        print("[WARN] Could not install 'kaggle'; using synthetic fallback.", flush=True)
        return False

    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
        api = KaggleApi()
        api.authenticate()
        api.competition_download_files(KAGGLE_COMPETITION, path=str(data_path), quiet=False)
        zip_file = data_path / f"{KAGGLE_COMPETITION}.zip"
        if zip_file.exists():
            with zipfile.ZipFile(zip_file, "r") as zf:
                zf.extractall(path=data_path)
            zip_file.unlink(missing_ok=True)
        return all((data_path / f).exists() for f in REQUIRED_FILES)
    except Exception as e:
        print(f"[WARN] Kaggle download failed: {e}", flush=True)
        return False

def generate_synthetic_ieee(data_dir: str, n=25000, seed=42):
    rng = np.random.default_rng(seed)
    TransactionID = np.arange(1, n + 1, dtype=np.int64)
    TransactionDT = rng.integers(0, 60 * 60 * 24 * 90, size=n)  # 90 days
    TransactionAmt = np.round(rng.lognormal(mean=3.6, sigma=0.75, size=n), 2)
    hour = (TransactionDT // 3600) % 24
    C1 = rng.poisson(5, size=n)
    D1 = rng.exponential(scale=2.5, size=n)
    D2 = rng.exponential(scale=3.0, size=n)

    regions = np.array([f"r{i}" for i in range(1, 151)])
    dirichlet_p = rng.dirichlet(alpha=np.ones(len(regions)))
    addr1 = rng.choice(regions, size=n, p=dirichlet_p)
    addr2 = rng.choice(np.array(["US", "CA", "GB", "DE", "FR", "IN", "SG", "AU", "BR", "JP"]), size=n)
    P_emaildomain = rng.choice(np.array([
        "gmail.com","yahoo.com","hotmail.com","live.com","outlook.com",
        "edu.co.uk","example.in","example.de","example.fr","example.jp"
    ]), size=n)
    dist1 = np.abs(rng.normal(loc=12, scale=9, size=n))
    dist2 = np.abs(rng.normal(loc=6, scale=6, size=n))

    amount_risk = (np.log1p(TransactionAmt) - np.log1p(np.median(TransactionAmt)))
    offhour = ((hour < 6) | (hour > 22)).astype(int)
    rare_region = pd.Series(addr1).map(pd.Series(addr1).value_counts(normalize=True)) < 0.01
    foreign = (pd.Series(addr2) != "US").astype(int)
    tld = pd.Series(P_emaildomain).str.split(".").str[-1]
    rare_tld = ~tld.isin(["com", "net", "org"]).astype(int)
    risk = 0.9*amount_risk + 0.7*offhour + 0.6*(dist1 > 25) + 0.6*(dist2 > 15) + \
           0.5*foreign + 0.5*rare_region.astype(int) + 0.4*rare_tld
    prob = 1 / (1 + np.exp(-(-2.2 + risk)))
    isFraud = (rng.random(n) < prob).astype(np.int8)

    train_trx = pd.DataFrame({
        "TransactionID": TransactionID,
        "isFraud": isFraud,
        "TransactionDT": TransactionDT,
        "TransactionAmt": TransactionAmt,
        "C1": C1,
        "D1": D1,
        "D2": D2,
        "addr1": addr1,
        "addr2": addr2,
        "dist1": dist1,
        "dist2": dist2,
        "P_emaildomain": P_emaildomain
    })

    DeviceInfo = rng.choice(np.array(["android", "ios", "windows", "linux", None]), size=n)
    DeviceType = rng.choice(np.array(["mobile", "desktop", None]), size=n)
    id_30 = rng.choice(np.array(["Windows 10", "Windows 7", "iOS 13.3", "Android 10", None]), size=n)
    id_31 = rng.choice(np.array(["chrome", "safari", "firefox", "edge", None]), size=n)

    train_id = pd.DataFrame({
        "TransactionID": TransactionID,
        "DeviceInfo": DeviceInfo,
        "DeviceType": DeviceType,
        "id_30": id_30,
        "id_31": id_31
    })

    data_path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)
    train_trx.to_csv(data_path / "train_transaction.csv", index=False)
    train_id.to_csv(data_path / "train_identity.csv", index=False)

def reduce_mem(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.select_dtypes(include=["int64", "int32"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in df.select_dtypes(include=["float64", "float32"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    return df

def engineer_time_features(df: pd.DataFrame):
    if "TransactionDT" in df.columns:
        df["DT_day"] = (df["TransactionDT"] // (24 * 60 * 60)).astype(np.int32)
        df["DT_hour"] = ((df["TransactionDT"] // 3600) % 24).astype(np.int16)
        df["DT_wday"] = (df["DT_day"] % 7).astype(np.int8)
    return df

def normalize_device_email(df: pd.DataFrame):
    for c in ["DeviceInfo", "DeviceType", "id_30", "id_31"]:
        if c in df.columns:
            df[c] = df[c].fillna("unknown").astype(str).str.lower()
    for c in ["P_emaildomain", "R_emaildomain"]:
        if c in df.columns:
            df[c] = df[c].fillna("unknown").astype(str).str.lower()
    for c in ["P_emaildomain", "R_emaildomain"]:
        if c in df.columns:
            df[c + "_tld"] = df[c].apply(lambda x: x.split(".")[-1] if x != "unknown" else "unknown")
    return df

def build_features(df: pd.DataFrame):
    df = engineer_time_features(df)
    df = normalize_device_email(df)
    loc_cats = [c for c in ["addr1", "addr2", "P_emaildomain_tld"] if c in df.columns]
    loc_nums = [c for c in ["dist1", "dist2"] if c in df.columns]
    trx_nums = [c for c in ["TransactionAmt", "DT_hour", "DT_wday", "C1", "D1", "D2"] if c in df.columns]
    return loc_cats, loc_nums, trx_nums

def prepare_data(data_dir: str):
    data_path = Path(data_dir)
    if not all((data_path / f).exists() for f in REQUIRED_FILES):
        ok = try_download_kaggle_competition(str(data_path))
        if not ok:
            print("[INFO] Using synthetic IEEE-CIS-like dataset fallback (no manual steps required).", flush=True)
            generate_synthetic_ieee(str(data_path))

    train_trx = pd.read_csv(data_path / "train_transaction.csv")
    train_id = pd.read_csv(data_path / "train_identity.csv")
    train = train_trx.merge(train_id, on="TransactionID", how="left")
    train = reduce_mem(train)

    y = train["isFraud"].astype(np.int8)
    X = train.drop(columns=["isFraud"])

    loc_cats, loc_nums, trx_nums = build_features(X)

    for c in loc_nums + trx_nums:
        if c in X.columns:
            X[c] = X[c].fillna(0.0)
    for c in loc_cats:
        X[c] = X[c].fillna("unknown").astype(str)
    return X, y, loc_cats, loc_nums, trx_nums

def build_pipelines(loc_cats, loc_nums, trx_nums):
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=0.01, sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=0.01, sparse=False)

    loc_pre = ColumnTransformer(
        transformers=[
            ("cat", ohe, loc_cats),
            ("num", StandardScaler(), loc_nums),
        ],
        remainder="drop"
    )
    trx_pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), trx_nums),
        ],
        remainder="drop"
    )
    loc_clf = HistGradientBoostingClassifier(
        max_depth=8, learning_rate=0.05, max_iter=400, class_weight={0: 1, 1: 8}, random_state=42
    )
    trx_clf = HistGradientBoostingClassifier(
        max_depth=8, learning_rate=0.05, max_iter=400, class_weight={0: 1, 1: 8}, random_state=42
    )
    loc_pipe = Pipeline([("pre", loc_pre), ("clf", loc_clf)])
    trx_pipe = Pipeline([("pre", trx_pre), ("clf", trx_clf)])
    meta_clf = LogisticRegression(max_iter=300)
    return loc_pipe, trx_pipe, meta_clf

def time_based_split(X: pd.DataFrame, y: pd.Series, test_frac=0.2, seed=42):
    if "TransactionDT" in X.columns:
        order = X["TransactionDT"].rank(method="first")
        cutoff = np.quantile(order, 1 - test_frac)
        test_mask = order >= cutoff
        X_train, X_valid = X[~test_mask].copy(), X[test_mask].copy()
        y_train, y_valid = y[~test_mask].copy(), y[test_mask].copy()
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=test_frac, stratify=y, random_state=seed
        )
    return X_train, X_valid, y_train, y_valid




In [3]:
# ============================
# NEW: Visualization Function
# ============================
# FIXED: Visualization Function
import argparse

# ============================
def plot_visualizations(y_valid, loc_valid_p, trx_valid_p, meta_valid_p, best_thr,
                        auc_loc, auc_trx, auc_meta, X_valid, loc_pipe, trx_pipe, out_dir):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # --- ROC Curves ---
    fpr_loc, tpr_loc, _ = roc_curve(y_valid, loc_valid_p)
    fpr_trx, tpr_trx, _ = roc_curve(y_valid, trx_valid_p)
    fpr_meta, tpr_meta, _ = roc_curve(y_valid, meta_valid_p)

    plt.figure(figsize=(6, 5))
    plt.plot(fpr_loc, tpr_loc, label=f"Location AUC={auc_loc:.3f}")
    plt.plot(fpr_trx, tpr_trx, label=f"Transaction AUC={auc_trx:.3f}")
    plt.plot(fpr_meta, tpr_meta, label=f"Stacked AUC={auc_meta:.3f}")
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "roc_curves.png"))
    plt.close()

    # --- Confusion Matrix ---
    pred_meta = (meta_valid_p >= best_thr).astype(int)
    cm = confusion_matrix(y_valid, pred_meta)
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Legit", "Fraud"], yticklabels=["Legit", "Fraud"])
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix (Stacked)")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "confusion_matrix.png"))
    plt.close()

    # --- Transaction model feature importance ---
    trx_clf = trx_pipe.named_steps["clf"]
    trx_pre = trx_pipe.named_steps["pre"]
    result = permutation_importance(
        trx_clf, trx_pre.transform(X_valid), y_valid,
        n_repeats=10, random_state=42, n_jobs=-1
    )
    trx_feat_names = trx_pre.get_feature_names_out()
    importances = result.importances_mean

    # Sort and pick top 15
    idx = np.argsort(importances)[-15:]
    plt.figure(figsize=(8, 6))
    sns.barplot(x=importances[idx], y=trx_feat_names[idx], palette="viridis")
    plt.title("Top Transaction Features - Permutation Importance")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "feature_importance_transaction.png"))
    plt.close()

# --- Location model feature importance ---
    loc_clf = loc_pipe.named_steps["clf"]
    loc_pre = loc_pipe.named_steps["pre"]
    result_loc = permutation_importance(
        loc_clf, loc_pre.transform(X_valid), y_valid,
        n_repeats=10, random_state=42, n_jobs=-1
    )
    loc_feat_names = loc_pre.get_feature_names_out()
    importances_loc = result_loc.importances_mean

    idx_loc = np.argsort(importances_loc)[-15:]
    plt.figure(figsize=(8, 6))
    sns.barplot(x=importances_loc[idx_loc], y=loc_feat_names[idx_loc], palette="magma")
    plt.title("Top Location Features - Permutation Importance")
    plt.savefig(os.path.join(out_dir, "feature_importance_Location.png"))
    plt.close()

    plt.tight_layout()

    # --- Fraud vs Non-Fraud Distribution ---
    plt.figure(figsize=(4, 4))
    sns.countplot(x=y_valid, palette="Set2")
    plt.xticks([0, 1], ["Legit", "Fraud"])
    plt.title("Fraud vs Legit Distribution (Validation)")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "fraud_distribution.png"))
    plt.close()

    # --- Transaction Amount Distribution ---
    if "TransactionAmt" in X_valid.columns:
        plt.figure(figsize=(6, 4))
        sns.histplot(x=X_valid["TransactionAmt"], hue=y_valid, bins=50,
                     stat="density", common_norm=False, palette="Set1")
        plt.xlim(0, X_valid["TransactionAmt"].quantile(0.95))
        plt.title("Transaction Amount by Fraud Label")
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, "amount_distribution.png"))
        plt.close()

    # --- Fraud Rate by Hour of Day ---
    if "DT_hour" in X_valid.columns:
        df_tmp = pd.DataFrame({"hour": X_valid["DT_hour"], "fraud": y_valid})
        fraud_rate = df_tmp.groupby("hour")["fraud"].mean()
        plt.figure(figsize=(6, 4))
        fraud_rate.plot(kind="bar", color="coral")
        plt.ylabel("Fraud Rate")
        plt.title("Fraud Rate by Hour of Day")
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, "fraud_rate_by_hour.png"))
        plt.close()


def main(data_dir: str, out_dir: str):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    X, y, loc_cats, loc_nums, trx_nums = prepare_data(data_dir)

    X_train, X_valid, y_train, y_valid = time_based_split(X, y, test_frac=0.2, seed=42)

    loc_pipe, trx_pipe, meta_clf = build_pipelines(loc_cats, loc_nums, trx_nums)
    loc_pipe.fit(X_train, y_train)
    trx_pipe.fit(X_train, y_train)

    loc_valid_p = loc_pipe.predict_proba(X_valid)[:, 1]
    trx_valid_p = trx_pipe.predict_proba(X_valid)[:, 1]
    meta_X_valid = np.vstack([loc_valid_p, trx_valid_p]).T
    meta_clf.fit(meta_X_valid, y_valid)

    auc_loc = roc_auc_score(y_valid, loc_valid_p)
    auc_trx = roc_auc_score(y_valid, trx_valid_p)
    meta_valid_p = meta_clf.predict_proba(meta_X_valid)[:, 1]
    auc_meta = roc_auc_score(y_valid, meta_valid_p)

    fpr, tpr, thr = roc_curve(y_valid, meta_valid_p)
    best_thr = thr[np.argmax(tpr - fpr)]

    with open(os.path.join(out_dir, "metrics.txt"), "w") as f:
        f.write(f"AUC_location={auc_loc:.5f}\n")
        f.write(f"AUC_transaction={auc_trx:.5f}\n")
        f.write(f"AUC_stacked={auc_meta:.5f}\n")
        f.write(f"best_threshold={best_thr:.6f}\n")

    pred_df = pd.DataFrame({
        "TransactionID": X_valid.get("TransactionID", pd.Series(range(len(X_valid)))),
        "proba_location": loc_valid_p,
        "proba_transaction": trx_valid_p,
        "proba_stacked": meta_valid_p,
    })
    pred_df["pred_stacked"] = (pred_df["proba_stacked"] >= best_thr).astype(int)
    pred_df["isFraud_true"] = y_valid.values
    pred_df.to_csv(os.path.join(out_dir, "valid_predictions.csv"), index=False)

    print(f"AUC (location)    : {auc_loc:.4f}")
    print(f"AUC (transaction) : {auc_trx:.4f}")
    print(f"AUC (stacked)     : {auc_meta:.4f}")
    print(f"Best threshold    : {best_thr:.4f}")
    print(f"Outputs saved in {out_dir}")

    # === Visualization step ===
    plot_visualizations(
        y_valid, loc_valid_p, trx_valid_p, meta_valid_p, best_thr,
        auc_loc, auc_trx, auc_meta, X_valid, loc_pipe, trx_pipe, out_dir
    )
    print(f"Saved visualizations in {out_dir}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="./data")
    parser.add_argument("--out_dir", type=str, default="./outputs/tandl")
    args, _ = parser.parse_known_args()
    main(args.data_dir, args.out_dir)

AUC (location)    : 0.7252
AUC (transaction) : 0.7708
AUC (stacked)     : 0.8109
Best threshold    : 0.0245
Outputs saved in ./outputs/tandl
Saved visualizations in ./outputs/tandl


<Figure size 640x480 with 0 Axes>