In [2]:
# ============================================================
# GraphGuard — Elliptic Bitcoin (TX graph) baseline trainer (dtype-safe)
# Saves: preprocessor.pkl, model.h5, model_config.yaml, metrics.json, threshold.json
# ============================================================
import os, csv, json, math, pickle, warnings, random
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, accuracy_score,
    precision_recall_curve, roc_curve, brier_score_loss
)
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# -----------------------------
# Paths (YOUR EXACT FILES)
# -----------------------------
FEATURES_PATH = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_features.csv"
CLASSES_PATH  = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_classes.csv"
EDGES_PATH    = r"C:\Users\sagni\Downloads\GraphGuard\archive (1)\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv"
OUTPUT_DIR    = r"C:\Users\sagni\Downloads\GraphGuard"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------------
# Reproducibility
# -----------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

# -----------------------------
# Robust CSV reader
# -----------------------------
def robust_read_csv(path, expected_min_cols=2):
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
    delims    = [",", ";", "\t", "|"]
    try:
        with open(path, "rb") as f:
            head = f.read(8192).decode("latin1", errors="ignore")
        sniffed = csv.Sniffer().sniff(head)
        if sniffed.delimiter in delims:
            delims = [sniffed.delimiter] + [d for d in delims if d != sniffed.delimiter]
    except Exception:
        pass

    last_err = None
    for enc in encodings:
        for sep in delims:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
                if df.shape[1] >= expected_min_cols:
                    print(f"[INFO] Loaded {os.path.basename(path)} enc='{enc}', sep='{sep}', shape={df.shape}")
                    return df
            except Exception as e:
                last_err = e
    raise RuntimeError(f"Could not parse {path}. Last error: {last_err}")

# -----------------------------
# Load data
# -----------------------------
df_feat = robust_read_csv(FEATURES_PATH, expected_min_cols=3)
df_cls  = robust_read_csv(CLASSES_PATH,  expected_min_cols=2)
df_edge = robust_read_csv(EDGES_PATH,    expected_min_cols=2)

# Canonicalize column names (Elliptic uses txId, timeStep)
feat_cols = list(df_feat.columns)
if len(feat_cols) < 3:
    raise RuntimeError("Features CSV must have >=3 columns (txId, timeStep, features...)")

tx_col_feat   = feat_cols[0]  # txId in features file
time_col_feat = feat_cols[1]  # timeStep in features file
feature_cols  = feat_cols[2:] # f1..fN

# Classes file: typically [txId, class]
cls_cols = list(df_cls.columns)
tx_col_cls = cls_cols[0]
class_col  = cls_cols[1]

# Edge list: [src, dst] transaction graph (directed)
edge_cols = list(df_edge.columns)
src_col, dst_col = edge_cols[0], edge_cols[1]

# -----------------------------
# FORCE CONSISTENT DTYPE FOR TX IDs (fixes your error)
# -----------------------------
df_feat[tx_col_feat] = df_feat[tx_col_feat].astype(str)
df_cls[tx_col_cls]   = df_cls[tx_col_cls].astype(str)
df_edge[src_col]     = df_edge[src_col].astype(str)
df_edge[dst_col]     = df_edge[dst_col].astype(str)

# -----------------------------
# Map labels: licit/illicit; drop unknown
# Elliptic classes often: '1' = licit (0), '2' = illicit (1), 'unknown'
# -----------------------------
df_cls[class_col] = df_cls[class_col].astype(str).str.lower().str.strip()
label_map = {"1": 0, "2": 1, "licit": 0, "illicit": 1}
df_cls["label"] = df_cls[class_col].map(label_map)
df_cls = df_cls[~df_cls["label"].isna()].copy()  # drop unknowns
df_cls["label"] = df_cls["label"].astype(int)

# -----------------------------
# Compute (in/out) degrees from edgelist
# -----------------------------
in_deg  = df_edge.groupby(dst_col).size().rename("in_degree")
out_deg = df_edge.groupby(src_col).size().rename("out_degree")
deg_df  = pd.concat([in_deg, out_deg], axis=1).fillna(0.0).reset_index()
deg_df.rename(columns={deg_df.columns[0]: tx_col_feat}, inplace=True)  # align merge key name

# -----------------------------
# Merge: features + degrees + labels
# -----------------------------
# time column should be numeric
df_feat[time_col_feat] = pd.to_numeric(df_feat[time_col_feat], errors="coerce")

df = df_feat.merge(deg_df, on=tx_col_feat, how="left")
df[["in_degree","out_degree"]] = df[["in_degree","out_degree"]].fillna(0.0)

df = df.merge(df_cls[[tx_col_cls,"label"]], left_on=tx_col_feat, right_on=tx_col_cls, how="inner")
# Drop duplicate right key
if tx_col_cls in df.columns and tx_col_cls != tx_col_feat:
    df = df.drop(columns=[tx_col_cls])

if df.empty:
    raise RuntimeError("After merging features and labels, dataset is empty. Check txId column names and dtypes.")

# Final feature set: original f1..fN + degrees
full_features = feature_cols + ["in_degree", "out_degree"]

# Ensure numeric features
for c in full_features:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=full_features + [time_col_feat, "label"]).reset_index(drop=True)

print("[INFO] Final dataset shape:", df.shape)
print("[INFO] Class balance: ", df["label"].value_counts().to_dict())

# -----------------------------
# Time-based split (avoid leakage)
#  - Split by unique timesteps: 60% train, 20% val, 20% test
# -----------------------------
steps = sorted(pd.unique(df[time_col_feat].values).tolist())
n = len(steps)
i1 = int(0.6*n); i2 = int(0.8*n)
train_steps = set(steps[:i1])
val_steps   = set(steps[i1:i2])
test_steps  = set(steps[i2:])

train_df = df[df[time_col_feat].isin(train_steps)].copy()
val_df   = df[df[time_col_feat].isin(val_steps)].copy()
test_df  = df[df[time_col_feat].isin(test_steps)].copy()

print("[INFO] Split sizes:", { "train": len(train_df), "val": len(val_df), "test": len(test_df) })

# -----------------------------
# Scale features; save preprocessor
# -----------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[full_features].values)
y_train = train_df["label"].values.astype(int)

X_val   = scaler.transform(val_df[full_features].values)
y_val   = val_df["label"].values.astype(int)

X_test  = scaler.transform(test_df[full_features].values)
y_test  = test_df["label"].values.astype(int)

preproc = {
    "feature_columns": full_features,
    "scaler": scaler,
    "label_map": label_map,
    "time_column": time_col_feat,
    "txid_column": tx_col_feat,
    "splits": {
        "train_steps": sorted(list(train_steps)),
        "val_steps":   sorted(list(val_steps)),
        "test_steps":  sorted(list(test_steps)),
    }
}
with open(os.path.join(OUTPUT_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preproc, f)
print("[INFO] Saved preprocessor.pkl")

# -----------------------------
# Build & train a compact MLP (baseline)
# -----------------------------
input_dim = X_train.shape[1]

def build_model():
    inp = keras.Input(shape=(input_dim,))
    x = layers.BatchNormalization()(inp)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inp, out, name="elliptic_mlp_baseline")
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="binary_crossentropy",
        metrics=[
            keras.metrics.AUC(name="auc"),
            keras.metrics.AUC(name="pr_auc", curve="PR"),
            keras.metrics.BinaryAccuracy(name="accuracy")
        ]
    )
    return model

model = build_model()

# Handle imbalance with class weights (robust if a class is missing in train)
try:
    classes = np.array([0,1])
    class_weights = compute_class_weight(
        class_weight="balanced", classes=classes, y=y_train
    )
    cw = {int(c): float(w) for c, w in zip(classes, class_weights)}
except Exception:
    cw = {0: 1.0, 1: 1.0}
print("[INFO] Class weights:", cw)

early = keras.callbacks.EarlyStopping(
    monitor="val_pr_auc", mode="max", patience=5, restore_best_weights=True
)

hist = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=512,
    class_weight=cw,
    callbacks=[early],
    verbose=1
)

# -----------------------------
# Evaluate & choose threshold by F1 on validation
# -----------------------------
val_prob = model.predict(X_val, batch_size=2048, verbose=0).ravel()
test_prob= model.predict(X_test, batch_size=2048, verbose=0).ravel()

# Sweep thresholds
cand = np.linspace(0.05, 0.95, 181)
best_t, best_f1 = 0.5, -1.0
for t in cand:
    f1 = f1_score(y_val, (val_prob >= t).astype(int), zero_division=0)
    if f1 > best_f1:
        best_f1, best_t = float(f1), float(t)
print(f"[INFO] Best threshold on val: t={best_t:.3f} | F1={best_f1:.4f}")

# Final metrics on test
auc_roc = roc_auc_score(y_test, test_prob)
auc_pr  = average_precision_score(y_test, test_prob)
brier   = brier_score_loss(y_test, test_prob)
acc05   = accuracy_score(y_test, (test_prob >= 0.5).astype(int))
f105    = f1_score(y_test, (test_prob >= 0.5).astype(int), zero_division=0)
accT    = accuracy_score(y_test, (test_prob >= best_t).astype(int))
f1T     = f1_score(y_test, (test_prob >= best_t).astype(int), zero_division=0)

metrics = {
    "n_train": int(len(X_train)),
    "n_val":   int(len(X_val)),
    "n_test":  int(len(X_test)),
    "roc_auc": float(auc_roc),
    "pr_auc":  float(auc_pr),
    "brier":   float(brier),
    "acc_at_0.5": float(acc05),
    "f1_at_0.5":  float(f105),
    "acc_at_best_t": float(accT),
    "f1_at_best_t":  float(f1T),
}
with open(os.path.join(OUTPUT_DIR, "metrics.json"), "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
with open(os.path.join(OUTPUT_DIR, "threshold.json"), "w", encoding="utf-8") as f:
    json.dump({"best_threshold": best_t, "best_f1": best_f1}, f, indent=2)
print("[INFO] Saved metrics.json and threshold.json")
print(json.dumps(metrics, indent=2))

# -----------------------------
# Save model.h5 (as requested)
# -----------------------------
h5_path = os.path.join(OUTPUT_DIR, "model.h5")
model.save(h5_path)
print("[INFO] Saved model.h5 ->", h5_path)

# -----------------------------
# Save model_config.yaml
# -----------------------------
config = {
    "project": "GraphGuard — Elliptic Bitcoin Baseline",
    "paths": {
        "features_csv": FEATURES_PATH,
        "classes_csv":  CLASSES_PATH,
        "edgelist_csv": EDGES_PATH,
        "output_dir":   OUTPUT_DIR
    },
    "splits": {
        "train_steps": sorted(list(train_steps)),
        "val_steps":   sorted(list(val_steps)),
        "test_steps":  sorted(list(test_steps))
    },
    "preprocessing": {
        "scaler": "StandardScaler",
        "feature_columns": full_features,
        "extra_graph_features": ["in_degree","out_degree"]
    },
    "model": {
        "type": "MLP",
        "layers": [256, 128],
        "dropout": [0.3, 0.2],
        "batch_norm": True,
        "activation": "relu",
        "loss": "binary_crossentropy",
        "optimizer": "adam",
        "learning_rate": 1e-3,
        "metrics": ["AUC", "PR_AUC", "BinaryAccuracy"]
    },
    "training": {
        "epochs": 40,
        "batch_size": 512,
        "early_stopping": {"monitor": "val_pr_auc", "mode": "max", "patience": 5},
        "class_weight": cw
    }
}

yaml_path = os.path.join(OUTPUT_DIR, "model_config.yaml")
try:
    import yaml
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(config, f, sort_keys=False)
except Exception:
    # Minimal YAML fallback
    def to_yaml(d, indent=0):
        lines, pad = [], "  " * indent
        if isinstance(d, dict):
            for k, v in d.items():
                if isinstance(v, (dict, list)):
                    lines.append(f"{pad}{k}:")
                    lines.extend(to_yaml(v, indent+1))
                else:
                    lines.append(f"{pad}{k}: {repr(v)}")
        elif isinstance(d, list):
            for it in d:
                if isinstance(it, (dict, list)):
                    lines.append(f"{pad}-")
                    lines.extend(to_yaml(it, indent+1))
                else:
                    lines.append(f"{pad}- {repr(it)}")
        return lines
    with open(yaml_path, "w", encoding="utf-8") as f:
        f.write("\n".join(to_yaml(config)))
print("[INFO] Saved model_config.yaml ->", yaml_path)

print("\n[DONE] Artifacts saved in:", OUTPUT_DIR)
print(" - preprocessor.pkl")
print(" - model.h5")
print(" - model_config.yaml")
print(" - metrics.json")
print(" - threshold.json")


[INFO] Loaded elliptic_txs_features.csv enc='utf-8', sep=',', shape=(203768, 167)
[INFO] Loaded elliptic_txs_classes.csv enc='utf-8', sep=',', shape=(203769, 2)
[INFO] Loaded elliptic_txs_edgelist.csv enc='utf-8', sep=',', shape=(234355, 2)
[INFO] Final dataset shape: (46564, 170)
[INFO] Class balance:  {1: 42019, 0: 4545}
[INFO] Split sizes: {'train': 26381, 'val': 8999, 'test': 11184}
[INFO] Saved preprocessor.pkl
[INFO] Class weights: {0: 4.594392197840474, 1: 0.5610591237771161}
Epoch 1/40
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 33ms/step - accuracy: 0.6657 - auc: 0.8742 - loss: 0.4980 - pr_auc: 0.9783 - val_accuracy: 0.8322 - val_auc: 0.8554 - val_loss: 0.4862 - val_pr_auc: 0.9752
Epoch 2/40
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8594 - auc: 0.9735 - loss: 0.2576 - pr_auc: 0.9959 - val_accuracy: 0.9195 - val_auc: 0.9017 - val_loss: 0.3064 - val_pr_auc: 0.9831
Epoch 3/40
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━



[INFO] Best threshold on val: t=0.260 | F1=0.9710
[INFO] Saved metrics.json and threshold.json
{
  "n_train": 26381,
  "n_val": 8999,
  "n_test": 11184,
  "roc_auc": 0.8442063594278804,
  "pr_auc": 0.9871204211354372,
  "brier": 0.047455343374524,
  "acc_at_0.5": 0.9441165951359084,
  "f1_at_0.5": 0.970444980375467,
  "acc_at_best_t": 0.9533261802575107,
  "f1_at_best_t": 0.9755411863930278
}
[INFO] Saved model.h5 -> C:\Users\sagni\Downloads\GraphGuard\model.h5
[INFO] Saved model_config.yaml -> C:\Users\sagni\Downloads\GraphGuard\model_config.yaml

[DONE] Artifacts saved in: C:\Users\sagni\Downloads\GraphGuard
 - preprocessor.pkl
 - model.h5
 - model_config.yaml
 - metrics.json
 - threshold.json
