# 07_feature_matrix + baseline model

## Current “best” features (deduped) + Mutual Information (MI)
1. is_international_user — 0.063161  
2. uses_many_channels — 0.016385  
3. industry — 0.011218  
4. kyc_province — 0.006378  
5. recent_amount_ratio — 0.005571  
6. ratio_emt — 0.005094  
7. amount_cv — 0.004436  
8. debit_ratio — 0.004303  
9. channel_entropy — 0.003901  
10. total_amount_vs_finpeer — 0.003895  
11. occupation — 0.003692  
12. pct_history_before_intl — 0.003631  
13. credit_ratio — 0.003352  
14. cv_vs_peer_ratio — 0.003101  
15. max_amount — 0.002908  

## Goal of this notebook
1) Load train/test customer + transaction splits  
2) Merge feature tables from different notebooks into ONE feature matrix  
3) Train baseline model (LogReg or LightGBM if available)  
4) Evaluate ROC-AUC, PR-AUC, Top-K recall (more AML-relevant than threshold=0.5)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

REPO_ROOT = Path.cwd().parents[0]
PROCESSED = REPO_ROOT/"data/processed"
INTERIM = REPO_ROOT/"data/interim"   # if you prefer interim outputs from feature notebooks

# load training splits ONLY
customers_train = pd.read_csv(PROCESSED/"customers_train.csv")
customers_test  = pd.read_csv(PROCESSED/"customers_test.csv")

txns_train = pd.read_csv(PROCESSED/"transactions_train.csv")
txns_test  = pd.read_csv(PROCESSED/"transactions_test.csv")

# parse datetime
for df in [txns_train, txns_test]:
    if "transaction_datetime" in df.columns:
        df["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"], errors="coerce")

print("customers_train:", customers_train.shape)
print("customers_test :", customers_test.shape)
print("txns_train:", txns_train.shape)
print("txns_test :", txns_test.shape)

customers_train.head()


customers_train: (700, 18)
customers_test : (150, 18)
txns_train: (43004, 31)
txns_test : (6939, 31)


Unnamed: 0,birth_date,city,country,customer_id,customer_type,employee_count,established_date,gender,income,industry,industry_code,marital_status,occupation_code,occupation_title,onboard_date,province,sales,label
0,1972-12-07,other,CA,SYNID0108676505,individual,,,MALE,123875.0,Unknown,Unknown,Married,SELF_EMPLOYED,Unknown,2001-03-17,Unknown,,0.0
1,1964-02-20,other,CA,SYNID0104294551,individual,,,FEMALE,,Unknown,Unknown,Single,RETIRED,Unknown,2010-06-12,Unknown,,0.0
2,1986-04-08,MONTREAL,CA,SYNID0108958094,individual,,,MALE,,Unknown,Unknown,Unknown,OTHER,Unknown,2016-03-25,QC,,0.0
3,1999-04-08,KINGSTON,CA,SYNID0102414463,individual,,,FEMALE,,Unknown,Unknown,Single,SELF_EMPLOYED,Unknown,2021-08-17,ON,,0.0
4,1935-05-02,BRAMPTON,CA,SYNID0100000485,individual,,,FEMALE,19998.0,Unknown,Unknown,Widowed,RETIRED,Unknown,1997-07-12,ON,,0.0


In [3]:
def label_counts(df, name):
    vc = df["label"].value_counts(dropna=False)
    print(f"{name} label counts:\n{vc}\n")

label_counts(customers_train, "TRAIN customers")
label_counts(customers_test,  "TEST customers")


TRAIN customers label counts:
label
0.0    693
1.0      7
Name: count, dtype: int64

TEST customers label counts:
label
0.0    149
1.0      1
Name: count, dtype: int64



In [7]:
# EDIT THESE FILENAMES TO MATCH WHAT YOUR FEATURE NOTEBOOKS SAVE
FEATURE_TABLES = {
    "cash": {
        "train": INTERIM/"features_cash_train.csv",
        "test":  INTERIM/"features_cash_test.csv",
    },
    "wire": {
        "train": INTERIM/"features_wire_train.csv",
        "test":  INTERIM/"features_wire_test.csv",
    },
    "emt": {
        "train": INTERIM/"features_emt_train.csv",
        "test":  INTERIM/"features_emt_test.csv",
    },
    "geo": {
        "train": INTERIM/"features_geo_train.csv",
        "test":  INTERIM/"features_geo_test.csv",
    },
    "behavioral": {
        "train": INTERIM/"features_behavioral_train.csv",
        "test":  INTERIM/"features_behavioral_test.csv",
    },
    "profile": {
        "train": INTERIM/"features_profile_train.csv",
        "test":  INTERIM/"features_profile_test.csv",
    },
}

def load_feature_table(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, low_memory=False)
    if "customer_id" not in df.columns:
        raise ValueError(f"{path} missing customer_id column")
    df = df.drop_duplicates("customer_id")
    return df

def merge_feature_tables(base_customers: pd.DataFrame, split: str) -> pd.DataFrame:
    out = base_customers[["customer_id", "label"]].copy()
    for name, paths in FEATURE_TABLES.items():
        p = paths[split]
        if not p.exists():
            print(f"[WARN] missing {name} {split}: {p}")
            continue
        ft = load_feature_table(p)
        out = out.merge(ft, on="customer_id", how="left", validate="1:1")
    return out

feat_train = merge_feature_tables(customers_train, "train")
feat_test  = merge_feature_tables(customers_test,  "test")

print("feat_train:", feat_train.shape)
print("feat_test :", feat_test.shape)

feat_train.head()


feat_train: (700, 62)
feat_test : (150, 62)


Unnamed: 0,customer_id,label,cash_txn_count,cash_txn_ratio,cash_amount,cash_amount_ratio,cash_amount_last30d,cash_recent_spike_ratio,cash_round_100_ratio,cash_round_1000_ratio,...,activity_burstiness,industry,industry_code,occupation_title,occupation_code,province,country,city,gender,marital_status
0,SYNID0108676505,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,Unknown,Unknown,Unknown,SELF_EMPLOYED,Unknown,CA,other,MALE,Married
1,SYNID0104294551,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.016159,Unknown,Unknown,Unknown,RETIRED,Unknown,CA,other,FEMALE,Single
2,SYNID0108958094,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.890328,Unknown,Unknown,Unknown,OTHER,QC,CA,MONTREAL,MALE,Unknown
3,SYNID0102414463,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.95829,Unknown,Unknown,Unknown,SELF_EMPLOYED,ON,CA,KINGSTON,FEMALE,Single
4,SYNID0100000485,0.0,1,0.025,415.92,0.037841,415.92,415920000000.0,0.0,0.0,...,1.156476,Unknown,Unknown,Unknown,RETIRED,ON,CA,BRAMPTON,FEMALE,Widowed


In [8]:
# labeled only (your train/test splits should already be labeled, but keep safe)
train_df = feat_train[feat_train["label"].notna()].copy()
test_df  = feat_test[feat_test["label"].notna()].copy()

y_train = train_df["label"].astype(int)
y_test  = test_df["label"].astype(int)

X_train = train_df.drop(columns=["customer_id", "label"])
X_test  = test_df.drop(columns=["customer_id", "label"])

# identify categorical vs numeric
cat_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]
num_cols = [c for c in X_train.columns if c not in cat_cols]

print("num_cols:", len(num_cols))
print("cat_cols:", len(cat_cols))
cat_cols[:10], num_cols[:10]


num_cols: 51
cat_cols: 9


(['industry',
  'industry_code',
  'occupation_title',
  'occupation_code',
  'province',
  'country',
  'city',
  'gender',
  'marital_status'],
 ['cash_txn_count',
  'cash_txn_ratio',
  'cash_amount',
  'cash_amount_ratio',
  'cash_amount_last30d',
  'cash_recent_spike_ratio',
  'cash_round_100_ratio',
  'cash_round_1000_ratio',
  'cash_cents_00_ratio',
  'cash_burst_1h'])

In [14]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    confusion_matrix, classification_report,
    precision_recall_curve
)

# =========================
# Preprocess + Model
# =========================

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ]), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), cat_cols),
    ],
    remainder="drop"
)

clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    C=0.1,
    solver="liblinear"
)

model = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", clf)
])

model.fit(X_train, y_train)

p_train = model.predict_proba(X_train)[:, 1]
p_test  = model.predict_proba(X_test)[:, 1]

# =========================
# Helpers
# =========================

def precision_at_k(y, p, k=20):
    """Assumes y is array-like (numpy array, list, or pandas Series)."""
    y_arr = np.asarray(y)
    idx = np.argsort(p)[::-1][:k]
    return y_arr[idx].sum() / k

from sklearn.metrics import precision_recall_curve
import numpy as np

def find_threshold_for_precision(y, p, target_precision=0.2, min_recall=0.01):
    """
    Pick a threshold that achieves precision >= target_precision AND recall >= min_recall.
    Among those, pick the one with the highest recall (more useful than choosing the first).
    Falls back to best F1 if no threshold meets constraints.
    """
    precision, recall, thresholds = precision_recall_curve(y, p)

    # precision and recall have length n+1, thresholds has length n
    # Align thresholds to same length by appending 1.0 (degenerate "predict none" point)
    thresholds = np.append(thresholds, 1.0)

    # Valid candidates must have non-trivial recall and not be the degenerate point
    candidates = np.where(
        (precision >= target_precision) &
        (recall >= min_recall) &
        (thresholds < 1.0)
    )[0]

    if len(candidates) > 0:
        # choose the candidate with maximum recall
        best = candidates[np.argmax(recall[candidates])]
        return thresholds[best]

    # Fallback: maximize F1 over non-degenerate thresholds
    f1 = 2 * precision * recall / (precision + recall + 1e-12)
    valid = np.where(thresholds < 1.0)[0]
    best = valid[np.nanargmax(f1[valid])]
    return thresholds[best]


def eval_block(name, y, p, threshold):
    print(f"\n===== {name} =====")
    print("ROC-AUC:", round(roc_auc_score(y, p), 4))
    print("PR-AUC :", round(average_precision_score(y, p), 4))
    pred = (p >= threshold).astype(int)
    print(f"Threshold used: {threshold:.6f}")
    print("Confusion matrix:")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred, digits=4))
    for k in [10, 20, 50]:
        if k <= len(p):
            print(f"Precision@{k}: {precision_at_k(y, p, k=k):.4f}")

# =========================
# Pick threshold on TRAIN
# =========================

# Choose a target precision that makes sense for your use-case
# (try 0.1, 0.2, 0.3, etc.)
target_precision = 0.20

opt_thresh = find_threshold_for_precision(y_train, p_train, target_precision=target_precision)
print("Chosen threshold (from TRAIN) =", opt_thresh)

# =========================
# Evaluate with that threshold
# =========================

eval_block("TRAIN", y_train, p_train, threshold=opt_thresh)
eval_block("TEST",  y_test,  p_test,  threshold=opt_thresh)


Chosen threshold (from TRAIN) = 0.5000017207945822

===== TRAIN =====
ROC-AUC: 0.7817
PR-AUC : 0.0664
Threshold used: 0.500002
Confusion matrix:
[[656  37]
 [  3   4]]
              precision    recall  f1-score   support

           0     0.9954    0.9466    0.9704       693
           1     0.0976    0.5714    0.1667         7

    accuracy                         0.9429       700
   macro avg     0.5465    0.7590    0.5685       700
weighted avg     0.9865    0.9429    0.9624       700

Precision@10: 0.1000
Precision@20: 0.0500
Precision@50: 0.0800

===== TEST =====
ROC-AUC: 0.8725
PR-AUC : 0.05
Threshold used: 0.500002
Confusion matrix:
[[144   5]
 [  1   0]]
              precision    recall  f1-score   support

           0     0.9931    0.9664    0.9796       149
           1     0.0000    0.0000    0.0000         1

    accuracy                         0.9600       150
   macro avg     0.4966    0.4832    0.4898       150
weighted avg     0.9865    0.9600    0.9731       150

P

In [10]:
def topk_recall(y_true, prob, k):
    # percent of fraud caught if you investigate top-k highest risk
    idx = np.argsort(-prob)[:k]
    return y_true.iloc[idx].sum() / max(y_true.sum(), 1)

for k in [5, 10, 20, 50, 100]:
    r = topk_recall(y_test, p_test, k)
    print(f"Top-{k} recall (TEST): {r:.3f}")


Top-5 recall (TEST): 0.000
Top-10 recall (TEST): 0.000
Top-20 recall (TEST): 1.000
Top-50 recall (TEST): 1.000
Top-100 recall (TEST): 1.000


In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve

# ROC
fpr, tpr, _ = roc_curve(y_test, p_test)
roc = roc_auc_score(y_test, p_test)

plt.figure()
plt.plot(fpr, tpr)
plt.plot([0,1],[0,1], linestyle="--")
plt.title(f"ROC curve (TEST) | AUC={roc:.3f}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# PR
prec, rec, _ = precision_recall_curve(y_test, p_test)
pr = average_precision_score(y_test, p_test)

plt.figure()
plt.plot(rec, prec)
plt.title(f"PR curve (TEST) | AP={pr:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

print("TEST ROC-AUC:", roc)
print("TEST PR-AUC :", pr)


In [None]:
try:
    import lightgbm as lgb
    from sklearn.metrics import roc_auc_score, average_precision_score
    
    # Use same preprocess (OneHot) -> sparse matrix
    Xtr = preprocess.fit_transform(X_train)
    Xte = preprocess.transform(X_test)

    # scale_pos_weight helps imbalance
    pos = (y_train == 1).sum()
    neg = (y_train == 0).sum()
    spw = neg / max(pos, 1)

    lgbm = lgb.LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        scale_pos_weight=spw,
        random_state=42,
    )

    lgbm.fit(Xtr, y_train)

    p_te = lgbm.predict_proba(Xte)[:, 1]
    print("LightGBM TEST ROC-AUC:", round(roc_auc_score(y_test, p_te), 4))
    print("LightGBM TEST PR-AUC :", round(average_precision_score(y_test, p_te), 4))

except Exception as e:
    print("LightGBM not available or failed:", repr(e))
    print("If you want it: pip install lightgbm")
