# Baseline Model Evaluation (Train → Val → Test)

## Selected features (deduplicated best version per feature)
| Rank | Feature | Mutual Info (MI) |
|---:|---|---:|
| 1 | is_international_user | 0.063161 |
| 2 | uses_many_channels | 0.016385 |
| 3 | industry | 0.011218 |
| 4 | kyc_province | 0.006378 |
| 5 | recent_amount_ratio | 0.005571 |
| 6 | ratio_emt | 0.005094 |
| 7 | amount_cv | 0.004436 |
| 8 | debit_ratio | 0.004303 |
| 9 | channel_entropy | 0.003901 |
| 10 | total_amount_vs_finpeer | 0.003895 |
| 11 | occupation | 0.003692 |
| 12 | pct_history_before_intl | 0.003631 |
| 13 | credit_ratio | 0.003352 |
| 14 | cv_vs_peer_ratio | 0.003101 |
| 15 | max_amount | 0.002908 |

**Goal:** Train a baseline classifier (Logistic Regression) on the training split without any elaborate tuning, measure its performance on the validation split, and run a final check on the test split.

**Primary metric:** ROC-AUC (accuracy is not meaningful on an imbalanced AML dataset).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Processed CSVs live under <repo>/data/processed; the repo root is taken to be
# the parent of the current working directory.
REPO_ROOT = Path.cwd().parents[0]
PROCESSED = REPO_ROOT / "data" / "processed"

# The six split frames are expected to be in memory already.
# If they are not, uncomment the loaders below:
# customers_train = pd.read_csv(PROCESSED/"customers_train.csv")
# customers_val   = pd.read_csv(PROCESSED/"customers_val.csv")
# customers_test  = pd.read_csv(PROCESSED/"customers_test.csv")
# txns_train = pd.read_csv(PROCESSED/"transactions_train.csv")
# txns_val   = pd.read_csv(PROCESSED/"transactions_val.csv")
# txns_test  = pd.read_csv(PROCESSED/"transactions_test.csv")

# Sanity check: (rows, columns) of every split, customers first, then transactions.
print(*(frame.shape for frame in (customers_train, customers_val, customers_test)))
print(*(frame.shape for frame in (txns_train, txns_val, txns_test)))


In [None]:
# Model inputs, in the same order as the ranked feature table at the top of
# the notebook.
FEATURES = [
    "is_international_user",
    "uses_many_channels",
    "industry",
    "kyc_province",
    "recent_amount_ratio",
    "ratio_emt",
    "amount_cv",
    "debit_ratio",
    "channel_entropy",
    "total_amount_vs_finpeer",
    "occupation",
    "pct_history_before_intl",
    "credit_ratio",
    "cv_vs_peer_ratio",
    "max_amount",
]

TARGET = "label"
IDCOL = "customer_id"


def make_xy(df: pd.DataFrame, features=FEATURES):
    """Split a customer-level frame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``IDCOL`` and ``TARGET`` columns.
    features : iterable of str
        Candidate feature columns. Names that are absent from ``df`` are
        skipped, and a ``UserWarning`` lists them.

    Returns
    -------
    X : pd.DataFrame
        Feature columns only. Categorical columns are stripped strings with
        missing values replaced by ``"Unknown"``; numeric columns are coerced
        to numbers with missing/unparseable values replaced by ``0.0``.
    y : pd.Series
        ``TARGET`` converted to ``int``.
    cat_cols, num_cols : list of str
        Column names of ``X`` split by kind, each in column order.

    Raises
    ------
    KeyError
        If ``IDCOL`` or ``TARGET`` is not a column of ``df``.
    ValueError
        If any label is missing or cannot be parsed as a number.
    """
    import warnings

    present = [c for c in features if c in df.columns]
    missing = [c for c in features if c not in df.columns]
    if missing:
        # Keep the best-effort behavior, but make the omission visible: a
        # feature missing from one split gives that split a different X layout.
        warnings.warn(
            f"make_xy: skipping features not found in the frame: {missing}",
            stacklevel=2,
        )

    d = df[[IDCOL, TARGET] + present].copy()

    # y must be int. Check for unparseable labels explicitly so the failure
    # names the problem instead of surfacing as a NaN-to-int cast error.
    labels = pd.to_numeric(d[TARGET], errors="coerce")
    n_bad = int(labels.isna().sum())
    if n_bad:
        raise ValueError(
            f"{n_bad} row(s) have a missing or non-numeric '{TARGET}' value"
        )
    y = labels.astype(int)

    # separate X/y
    X = d.drop(columns=[IDCOL, TARGET])

    # Anything that is not numeric/bool is categorical. Testing for
    # "not numeric" rather than "== object" also catches pandas string and
    # category dtypes, which would otherwise be coerced to NaN and zero-filled.
    num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    cat_cols = [c for c in X.columns if c not in num_cols]

    # fill missing
    for c in cat_cols:
        # Cast to object first: filling a category-dtype column with a value
        # that is not one of its categories raises.
        X[c] = X[c].astype(object).fillna("Unknown").astype(str).str.strip()
    for c in num_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce").fillna(0.0)

    return X, y, cat_cols, num_cols

# The column typing (categorical vs numeric) is taken from the training split
# only; for validation and test just the matrix and the labels are kept.
X_train, y_train, cat_cols, num_cols = make_xy(customers_train)
X_val, y_val = make_xy(customers_val)[:2]
X_test, y_test = make_xy(customers_test)[:2]

# Quick summary of what will be fed to the model.
for heading, columns in (("Categorical:", cat_cols), ("Numeric:", num_cols)):
    print(heading, columns)
print("X_train:", X_train.shape, "Positive rate:", y_train.mean())


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Imported here so this cell stays runnable on its own.
from sklearn.preprocessing import StandardScaler

preprocess = ColumnTransformer(
    transformers=[
        # Categories unseen during fit encode as all-zero rows instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Standardize numeric inputs: the logistic regression is regularized,
        # so features on very different scales are penalized unevenly, and
        # ranking coefficients by magnitude is only meaningful on a common scale.
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop",
)

model = LogisticRegression(
    max_iter=500,
    class_weight="balanced",   # important for imbalanced AML
    solver="liblinear",
    random_state=42,           # liblinear uses a RNG; pin it for reproducible runs
)

# Step names "prep" and "model" are looked up by name when the fitted
# coefficients are inspected, so keep them stable.
clf = Pipeline(steps=[
    ("prep", preprocess),
    ("model", model),
])

# Last expression of the cell: renders the pipeline diagram in the notebook.
clf


In [None]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    confusion_matrix, classification_report,
    precision_recall_curve
)

# Fit the whole pipeline (preprocessing + model) on the training split only.
clf.fit(X_train, y_train)


def eval_split(name, X, y, threshold=0.5, model=None):
    """Print ranking and thresholded metrics for one split and return its scores.

    Parameters
    ----------
    name : str
        Label printed in the section header (e.g. ``"VAL"``).
    X, y
        Features and true binary labels of the split.
    threshold : float, default 0.5
        Probability cut-off used for the confusion matrix and the
        classification report. ROC-AUC and PR-AUC do not depend on it.
    model : fitted estimator with ``predict_proba``, optional
        Defaults to the notebook-level ``clf`` pipeline.

    Returns
    -------
    numpy.ndarray
        Second column of ``predict_proba`` for every row of ``X``
        (assumed to be the positive class, i.e. labels are 0/1).
    """
    estimator = clf if model is None else model
    proba = estimator.predict_proba(X)[:, 1]
    pred = (proba >= threshold).astype(int)

    roc = roc_auc_score(y, proba)
    ap = average_precision_score(y, proba)
    # Pin the label order so the matrix is always 2x2, even when the
    # predictions at this threshold contain a single class.
    cm = confusion_matrix(y, pred, labels=[0, 1])

    print(f"\n===== {name} =====")
    print("ROC-AUC:", round(roc, 4))
    print("PR-AUC :", round(ap, 4))
    print(f"Confusion matrix @{threshold}:\n", cm)
    # zero_division=0: a class with no predictions reports precision 0
    # without emitting an undefined-metric warning.
    print(classification_report(y, pred, digits=4, zero_division=0))

    return proba


p_train = eval_split("TRAIN", X_train, y_train)
p_val   = eval_split("VAL",   X_val,   y_val)
p_test  = eval_split("TEST",  X_test,  y_test)


In [None]:
from sklearn.metrics import f1_score

proba = p_val
thresholds = np.linspace(0.01, 0.99, 99)

# Score every candidate cut-off on the validation split. max() returns the
# first maximal element, so ties resolve to the lowest threshold.
scored = [
    (f1_score(y_val, (proba >= cut).astype(int)), cut)
    for cut in thresholds
]
best_f1, best_t = max(scored, key=lambda pair: pair[0])

print("Best threshold on VAL (max F1):", round(best_t, 3), "F1:", round(best_f1, 4))

# The cut-off was chosen on validation; apply it unchanged to the test split.
pred_test = (p_test >= best_t).astype(int)
test_report = classification_report(y_test, pred_test, digits=4)
test_cm = confusion_matrix(y_test, pred_test)

print("\nTEST report @best_val_threshold")
print(test_report)
print("Confusion matrix:\n", test_cm)


In [None]:
# Recover the column names produced by the one-hot encoder inside the fitted
# pipeline, then append the numeric columns in the order they were passed in.
prep_step = clf.named_steps["prep"]
ohe = prep_step.named_transformers_["cat"]
cat_feature_names = ohe.get_feature_names_out(cat_cols)
all_feature_names = np.concatenate(
    (cat_feature_names, np.asarray(num_cols, dtype=str))
)

# One coefficient per expanded column, flattened to 1-D.
coefs = np.ravel(clf.named_steps["model"].coef_)

# Rank columns by absolute coefficient, largest first.
coef_df = (
    pd.DataFrame({"feature": all_feature_names, "coef": coefs})
    .assign(abs_coef=lambda frame: frame["coef"].abs())
    .sort_values("abs_coef", ascending=False)
)

coef_df.head(25)


In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the validation scores; the third return value (the thresholds)
# is not needed for the plot.
fpr, tpr, _ = roc_curve(y_val, p_val)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve (Validation)")
plt.show()
