In [1]:
# ============================================================
# GraphGuard — Inference & Batch Prediction
# - Predicts on TEST window and saves predictions.csv / predictions.jsonl
# - Writes predict_cli.py for custom runs (dataset or specific TXIDs)
# ============================================================
import os, csv, json, pickle, warnings, sys
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional pretty heatmaps if you later want them here too
try:
    import seaborn as sns
    USE_SNS = True
except Exception:
    USE_SNS = False

import tensorflow as tf

# -----------------------------
# Project paths
# -----------------------------
# Root folder of the project; OUTPUT_DIR aliases it, so every artifact path
# below resolves inside this folder.
BASE = r"C:\Users\sagni\Downloads\GraphGuard"
OUTPUT_DIR = BASE

# Input dataset files: node features, class labels and the edge list.
FEATURES_CSV = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_features.csv"
CLASSES_CSV = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_classes.csv"
EDGES_CSV = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv"

# Artifact files read later in this script, all located under OUTPUT_DIR.
# model.keras is optional (only present if you saved it).
PREPROC_PKL, H5_PATH, KERAS_PATH, THRESH_PATH, CFG_YAML = (
    os.path.join(OUTPUT_DIR, artifact_name)
    for artifact_name in (
        "preprocessor.pkl",
        "model.h5",
        "model.keras",
        "threshold.json",
        "model_config.yaml",
    )
)

# Make sure the output folder exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------------
# Robust CSV reader
# -----------------------------
def robust_read_csv(path, expected_min_cols=2):
    """Read a CSV whose encoding and delimiter are not known in advance.

    Every combination of a fixed list of encodings and delimiters is tried
    with the pandas python engine; the first parse that produces at least
    ``expected_min_cols`` columns is returned. ``csv.Sniffer`` is used on the
    first 8 KiB of the file to decide which delimiter to try first.

    Args:
        path: Location of the CSV file.
        expected_min_cols: Minimum number of columns a parse must yield to be
            accepted (a wrong delimiter typically yields a single column).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        RuntimeError: No encoding/delimiter combination produced an
            acceptable frame; the message carries the last failure reason.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
    delims = [",", ";", "\t", "|"]
    try:
        # latin1 can decode any byte sequence, so sniffing never fails on encoding.
        with open(path, "rb") as f:
            head = f.read(8192).decode("latin1", errors="ignore")
        sniffed = csv.Sniffer().sniff(head)
        if sniffed.delimiter in delims:
            # Try the sniffed delimiter first, keep the others as fallbacks.
            delims = [sniffed.delimiter] + [d for d in delims if d != sniffed.delimiter]
    except Exception:
        # Sniffing is only a best-effort hint; fall back to the default order.
        pass
    last_err = None
    for enc in encodings:
        for sep in delims:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
            except Exception as e:
                last_err = e
                continue
            if df.shape[1] >= expected_min_cols:
                return df
            # Record the shortfall so the final error does not read
            # "Last error: None" when every attempt parsed but was too narrow.
            last_err = ValueError(
                f"parsed only {df.shape[1]} column(s) with sep={sep!r}, "
                f"encoding={enc!r}; expected at least {expected_min_cols}"
            )
    raise RuntimeError(f"Could not parse {path}. Last error: {last_err}")

# -----------------------------
# Load artifacts
# -----------------------------
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a preprocessor.pkl that you produced yourself.
with open(PREPROC_PKL, "rb") as f:
    preproc = pickle.load(f)

# Unpack the preprocessing bundle: feature column order, fitted scaler,
# key column names and the time steps belonging to each split.
feature_cols = list(preproc["feature_columns"])
scaler       = preproc["scaler"]
time_col     = preproc["time_column"]
txid_col     = preproc["txid_column"]
train_steps  = set(preproc["splits"]["train_steps"])
val_steps    = set(preproc["splits"]["val_steps"])
test_steps   = set(preproc["splits"]["test_steps"])

# Decision threshold applied to the model scores.
with open(THRESH_PATH, "r", encoding="utf-8") as f:
    best_t = float(json.load(f)["best_threshold"])

# If you have a YAML config, prefer paths from there (optional)
try:
    import yaml
except ImportError:
    yaml = None  # PyYAML is optional; keep the hard-coded paths

if yaml is not None and os.path.exists(CFG_YAML):
    try:
        with open(CFG_YAML, "r", encoding="utf-8") as f:
            cfg = yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as e:
        # The override stays best-effort, but a broken config is reported
        # instead of being silently ignored.
        print("[WARN] Ignoring model_config.yaml:", e)
    else:
        cfg_paths = cfg.get("paths") if isinstance(cfg, dict) else None
        if isinstance(cfg_paths, dict):
            # `or` keeps the default when a key is absent or set to null/empty.
            FEATURES_CSV = cfg_paths.get("features_csv") or FEATURES_CSV
            CLASSES_CSV  = cfg_paths.get("classes_csv")  or CLASSES_CSV
            EDGES_CSV    = cfg_paths.get("edgelist_csv") or EDGES_CSV

# -----------------------------
# Load data and prep features
# -----------------------------
df_feat = robust_read_csv(FEATURES_CSV, expected_min_cols=3)
df_cls  = robust_read_csv(CLASSES_CSV,  expected_min_cols=2)
df_edge = robust_read_csv(EDGES_CSV,    expected_min_cols=2)

# Column names as saved during training
feat_cols = list(df_feat.columns)
tx_col_feat   = feat_cols[0]
time_col_feat = feat_cols[1]
# Explicit raises instead of `assert`: assertions are stripped under
# `python -O`, which would let a mismatched features file pass unnoticed.
if tx_col_feat != txid_col:
    raise ValueError(f"TX ID column mismatch: {tx_col_feat} vs {txid_col}")
if time_col_feat != time_col:
    raise ValueError(f"Time column mismatch: {time_col_feat} vs {time_col}")

# The classes file is read positionally: first column = tx id, second = class.
cls_cols  = list(df_cls.columns)
tx_col_cls = cls_cols[0]
class_col  = cls_cols[1]

# The edge list is read positionally: first column = source, second = target.
edge_cols = list(df_edge.columns)
src_col, dst_col = edge_cols[0], edge_cols[1]

# Force string TX IDs everywhere
df_feat[tx_col_feat] = df_feat[tx_col_feat].astype(str)
df_cls[tx_col_cls]   = df_cls[tx_col_cls].astype(str)
df_edge[src_col]     = df_edge[src_col].astype(str)
df_edge[dst_col]     = df_edge[dst_col].astype(str)

# Map labels (drop unknowns)
# NOTE(review): the public Elliptic data set description lists class "1" as
# illicit and "2" as licit, whereas this map sends "1" -> 0 and "2" -> 1 while
# label 1 is reported as "illicit" downstream. Left unchanged here because it
# must match whatever mapping was used at training time -- confirm.
df_cls[class_col] = df_cls[class_col].astype(str).str.lower().str.strip()
label_map = {"1":0, "2":1, "licit":0, "illicit":1}
df_cls["label"] = df_cls[class_col].map(label_map)
df_cls = df_cls[~df_cls["label"].isna()].copy()
df_cls["label"] = df_cls["label"].astype(int)

# Compute degrees
in_deg  = df_edge.groupby(dst_col).size().rename("in_degree")
out_deg = df_edge.groupby(src_col).size().rename("out_degree")
# Nodes that appear only as source (or only as target) get 0 for the other side.
deg_df  = pd.concat([in_deg, out_deg], axis=1).fillna(0.0).reset_index()
# After reset_index the first column holds the node id; name it like the
# features' tx-id column so the merge below can join on it.
deg_df.rename(columns={deg_df.columns[0]: tx_col_feat}, inplace=True)

# Merge & clean
df_feat[time_col_feat] = pd.to_numeric(df_feat[time_col_feat], errors="coerce")
df = df_feat.merge(deg_df, on=tx_col_feat, how="left")
# Transactions absent from the edge list have no degree entry -> 0.
df[["in_degree","out_degree"]] = df[["in_degree","out_degree"]].fillna(0.0)
# Left merge keeps unlabeled transactions; their "label" is NaN.
df = df.merge(df_cls[[tx_col_cls,"label"]], left_on=tx_col_feat, right_on=tx_col_cls, how="left")
if tx_col_cls in df.columns and tx_col_cls != tx_col_feat:
    df = df.drop(columns=[tx_col_cls])

# Keep only columns needed for features (drop NA rows)
for c in feature_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=feature_cols + [time_col_feat]).reset_index(drop=True)

# -----------------------------
# Load model
# -----------------------------
# Prefer the native .keras archive; fall back to the legacy HDF5 file.
model = None

if os.path.exists(KERAS_PATH):
    # NOTE(review): safe_mode=False relaxes Keras' deserialization safety
    # checks -- only load model files you trust.
    try:
        model = tf.keras.models.load_model(KERAS_PATH, safe_mode=False)
    except Exception as keras_err:
        # A failure here is not fatal: the HDF5 fallback is tried next.
        print("[WARN] Could not load model.keras:", keras_err)

if model is None:
    if os.path.exists(H5_PATH):
        # Errors while loading the HDF5 file are allowed to propagate.
        model = tf.keras.models.load_model(H5_PATH)
    if model is None:
        raise FileNotFoundError("No model found (model.keras / model.h5). Train first.")

# -----------------------------
# Helpers: predict on a dataframe slice
# -----------------------------
def predict_df(df_slice: pd.DataFrame) -> pd.DataFrame:
    """Score a slice of the prepared frame and return one row per transaction.

    The module-level ``scaler`` transforms the ``feature_cols`` values, the
    module-level ``model`` scores them, and scores greater than or equal to
    ``best_t`` are flagged with ``pred_label`` 1.

    Args:
        df_slice: Rows to score; must contain ``feature_cols``, ``txid_col``
            and ``time_col``. A ``label`` column is optional.

    Returns:
        A frame with ``txId``, ``timeStep``, ``prob_illicit`` and
        ``pred_label``, plus ``true_label`` when ``df_slice`` has ``label``.
    """
    scaled = scaler.transform(df_slice[feature_cols].values)
    # Flatten the model output to one score per row.
    scores = model.predict(scaled, batch_size=4096, verbose=0).ravel()
    flagged = (scores >= best_t).astype(int)

    result = pd.DataFrame({
        "txId": df_slice[txid_col].values,
        "timeStep": df_slice[time_col].values,
        "prob_illicit": scores,
        "pred_label": flagged,
    })
    if "label" not in df_slice.columns:
        return result
    # Carry the ground truth along (NaN where the transaction is unlabeled).
    result["true_label"] = df_slice["label"].values
    return result

# -----------------------------
# Prediction on TEST window & save
# -----------------------------
# Select the rows whose time step belongs to the TEST split and score them.
df_test = df[df[time_col].isin(test_steps)].copy()
pred_test = predict_df(df_test)

pred_csv  = os.path.join(OUTPUT_DIR, "predictions.csv")
pred_json = os.path.join(OUTPUT_DIR, "predictions.jsonl")

pred_test.to_csv(pred_csv, index=False)
with open(pred_json, "w", encoding="utf-8") as f:
    for _, row in pred_test.iterrows():
        # Unlabeled rows carry a NaN true_label; json.dumps would emit the
        # bare token NaN, which is not valid JSON. Write null instead.
        rec = {k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()}
        rec["pred_label_name"] = "illicit" if rec["pred_label"]==1 else "licit"
        f.write(json.dumps(rec) + "\n")

print(f"[INFO] Saved predictions.csv -> {pred_csv}")
print(f"[INFO] Saved predictions.jsonl -> {pred_json}")
print(pred_test.head())

# ============================================================
# (Optional) Write a CLI script for arbitrary prediction runs
#   Usage examples:
#     python predict_cli.py --mode dataset
#     python predict_cli.py --mode dataset --subset all
#     python predict_cli.py --mode txids --txids 1000000,1000001
# ============================================================
# Source of the standalone CLI, kept as a raw string so backslashes in the
# Windows default paths and the "\t"/"\n" escapes are written out verbatim.
# Differences from the notebook flow above: unknown-class rows are kept with
# a nullable Int64 label, and the rows to score are chosen by --mode/--subset.
cli_code = r'''#!/usr/bin/env python
import os, csv, json, pickle, argparse, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf

def robust_read_csv(path, expected_min_cols=2):
    encodings = ["utf-8","utf-8-sig","cp1252","latin1"]
    delims    = [",",";","\t","|"]
    try:
        with open(path, "rb") as f:
            head = f.read(8192).decode("latin1", errors="ignore")
        sniffed = csv.Sniffer().sniff(head)
        if sniffed.delimiter in delims:
            delims = [sniffed.delimiter] + [d for d in delims if d != sniffed.delimiter]
    except Exception:
        pass
    last_err = None
    for enc in encodings:
        for sep in delims:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
                if df.shape[1] >= expected_min_cols:
                    return df
            except Exception as e:
                last_err = e
    raise RuntimeError(f"Could not parse {path}. Last error: {last_err}")

def main():
    p = argparse.ArgumentParser(description="GraphGuard prediction CLI (Elliptic)")
    p.add_argument("--base", default=r"C:\Users\sagni\Downloads\GraphGuard")
    p.add_argument("--features_csv", default=r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_features.csv")
    p.add_argument("--classes_csv",  default=r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_classes.csv")
    p.add_argument("--edges_csv",    default=r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv")
    p.add_argument("--mode", choices=["dataset","txids"], default="dataset")
    p.add_argument("--subset", choices=["train","val","test","all"], default="test")
    p.add_argument("--txids", help="Comma-separated txIds (only for --mode txids)")
    args = p.parse_args()

    BASE = args.base
    PREPROC_PKL = os.path.join(BASE, "preprocessor.pkl")
    H5_PATH     = os.path.join(BASE, "model.h5")
    KERAS_PATH  = os.path.join(BASE, "model.keras")
    THRESH_PATH = os.path.join(BASE, "threshold.json")

    with open(PREPROC_PKL, "rb") as f:
        preproc = pickle.load(f)
    feature_cols = list(preproc["feature_columns"])
    scaler       = preproc["scaler"]
    time_col     = preproc["time_column"]
    txid_col     = preproc["txid_column"]
    train_steps  = set(preproc["splits"]["train_steps"])
    val_steps    = set(preproc["splits"]["val_steps"])
    test_steps   = set(preproc["splits"]["test_steps"])

    with open(THRESH_PATH, "r", encoding="utf-8") as f:
        best_t = float(json.load(f)["best_threshold"])

    # Load data
    df_feat = robust_read_csv(args.features_csv, expected_min_cols=3)
    df_cls  = robust_read_csv(args.classes_csv,  expected_min_cols=2)
    df_edge = robust_read_csv(args.edges_csv,    expected_min_cols=2)

    feat_cols = list(df_feat.columns)
    tx_col_feat   = feat_cols[0]
    time_col_feat = feat_cols[1]
    assert tx_col_feat == txid_col, f"TX ID column mismatch: {tx_col_feat} vs {txid_col}"
    assert time_col_feat == time_col, f"Time column mismatch: {time_col_feat} vs {time_col}"

    cls_cols  = list(df_cls.columns)
    tx_col_cls = cls_cols[0]
    class_col  = cls_cols[1]

    edge_cols = list(df_edge.columns)
    src_col, dst_col = edge_cols[0], edge_cols[1]

    # Force TXID to string
    df_feat[tx_col_feat] = df_feat[tx_col_feat].astype(str)
    df_cls[tx_col_cls]   = df_cls[tx_col_cls].astype(str)
    df_edge[src_col]     = df_edge[src_col].astype(str)
    df_edge[dst_col]     = df_edge[dst_col].astype(str)

    # Labels (optional)
    df_cls[class_col] = df_cls[class_col].astype(str).str.lower().str.strip()
    label_map = {"1":0, "2":1, "licit":0, "illicit":1}
    df_cls["label"] = df_cls[class_col].map(label_map)
    df_cls["label"] = df_cls["label"].astype("Int64")  # allow NA

    # Degrees
    in_deg  = df_edge.groupby(dst_col).size().rename("in_degree")
    out_deg = df_edge.groupby(src_col).size().rename("out_degree")
    deg_df  = pd.concat([in_deg, out_deg], axis=1).fillna(0.0).reset_index()
    deg_df.rename(columns={deg_df.columns[0]: tx_col_feat}, inplace=True)

    # Merge
    df_feat[time_col_feat] = pd.to_numeric(df_feat[time_col_feat], errors="coerce")
    df = df_feat.merge(deg_df, on=tx_col_feat, how="left")
    df[["in_degree","out_degree"]] = df[["in_degree","out_degree"]].fillna(0.0)
    df = df.merge(df_cls[[tx_col_cls,"label"]], left_on=tx_col_feat, right_on=tx_col_cls, how="left")
    if tx_col_cls in df.columns and tx_col_cls != tx_col_feat:
        df = df.drop(columns=[tx_col_cls])

    # Keep numeric features
    for c in feature_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(subset=feature_cols + [time_col_feat]).reset_index(drop=True)

    # Subset
    if args.mode == "dataset":
        if args.subset == "train":
            mask = df[time_col].isin(train_steps)
        elif args.subset == "val":
            mask = df[time_col].isin(val_steps)
        elif args.subset == "test":
            mask = df[time_col].isin(test_steps)
        else:
            mask = np.ones(len(df), dtype=bool)
        df_slice = df[mask].copy()
    else:
        # txids mode
        if not args.txids:
            raise SystemExit("--txids is required for --mode txids")
        want = set([s.strip() for s in args.txids.split(",") if s.strip()])
        df_slice = df[df[txid_col].isin(want)].copy()
        if df_slice.empty:
            raise SystemExit("None of the requested txIds were found in features CSV.")

    # Load model
    model = None
    if os.path.exists(KERAS_PATH):
        try:
            model = tf.keras.models.load_model(KERAS_PATH, safe_mode=False)
        except Exception:
            model = None
    if model is None and os.path.exists(H5_PATH):
        model = tf.keras.models.load_model(H5_PATH)
    if model is None:
        raise SystemExit("No model found (model.keras / model.h5).")

    # Predict
    X = scaler.transform(df_slice[feature_cols].values)
    prob = model.predict(X, batch_size=4096, verbose=0).ravel()
    pred = (prob >= best_t).astype(int)

    out = pd.DataFrame({
        "txId": df_slice[txid_col].values,
        "timeStep": df_slice[time_col].values,
        "prob_illicit": prob,
        "pred_label": pred
    })
    if "label" in df_slice.columns:
        out["true_label"] = df_slice["label"].astype("Int64").values

    # Save
    name = f"predictions_{args.mode}_{args.subset}.csv" if args.mode=="dataset" else "predictions_txids.csv"
    csv_path = os.path.join(BASE, name)
    out.to_csv(csv_path, index=False)
    # splitext only touches the extension (str.replace would also rewrite a ".csv" inside BASE)
    jsonl_path = os.path.splitext(csv_path)[0] + ".jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for _, row in out.iterrows():
            # Unlabeled rows hold pd.NA, which json.dumps cannot serialize; write null instead.
            rec = {k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()}
            rec["pred_label_name"] = "illicit" if rec["pred_label"]==1 else "licit"
            f.write(json.dumps(rec)+"\n")

    print(f"[OK] Wrote: {csv_path}")
    print(f"[OK] Wrote: {jsonl_path}")
    print(out.head())

if __name__ == "__main__":
    main()
'''
# write CLI
with open(os.path.join(OUTPUT_DIR, "predict_cli.py"), "w", encoding="utf-8") as f:
    f.write(cli_code)

print("[INFO] Wrote predict_cli.py ->", os.path.join(OUTPUT_DIR, "predict_cli.py"))
print("\nRun from terminal, e.g.:")
print(r'  cd "C:\Users\sagni\Downloads\GraphGuard"')
print(r'  python predict_cli.py --mode dataset --subset test')
print(r'  python predict_cli.py --mode txids --txids 230425980,230425981')




[INFO] Saved predictions.csv -> C:\Users\sagni\Downloads\GraphGuard\predictions.csv
[INFO] Saved predictions.jsonl -> C:\Users\sagni\Downloads\GraphGuard\predictions.jsonl
       txId  timeStep  prob_illicit  pred_label  true_label
0  97023208        40      0.987521           1         NaN
1  97732542        40      0.981686           1         NaN
2  97425476        40      0.999817           1         1.0
3  13344762        40      0.994055           1         1.0
4  87028631        40      0.967744           1         NaN
[INFO] Wrote predict_cli.py -> C:\Users\sagni\Downloads\GraphGuard\predict_cli.py

Run from terminal, e.g.:
  cd "C:\Users\sagni\Downloads\GraphGuard"
  python predict_cli.py --mode dataset --subset test
  python predict_cli.py --mode txids --txids 230425980,230425981
