In [None]:
#load training table
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split

# Read the processed training table from disk.
df = pd.read_csv("data/processed/training_table.csv")

# Columns kept out of the feature matrix (the user id and the label columns,
# judging by their names). Every other column is used as a model input, in
# the order it appears in the file.
_non_feature_cols = {"user_id", "fraud_label", "is_fraud"}
feature_cols = [name for name in df.columns if name not in _non_feature_cols]

# Target is the `is_fraud` column; missing feature values are replaced by 0.
y = df["is_fraud"]
X = df[feature_cols].fillna(0)

In [None]:
# Train/test split: 20% of the rows are held out for evaluation. Stratifying
# on y keeps the class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

In [None]:
# Baseline model: logistic regression.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Second column of predict_proba = probability of the second class in
# lr.classes_ (the positive class when labels are 0/1).
lr_prob = lr.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics on the held-out set.
for metric_label, metric_fn in (("ROC-AUC:", roc_auc_score), ("PR-AUC :", average_precision_score)):
    print(metric_label, metric_fn(y_test, lr_prob))

In [None]:
# Random forest model.
# Hyperparameters are gathered in one place so they are easy to inspect/tune:
#   - 400 trees, no depth cap (growth is limited by min_samples_leaf instead)
#   - fixed seed for repeatable results, n_jobs=-1 to use every available core
rf_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Second column of predict_proba = probability of the second class in
# rf.classes_ (the positive class when labels are 0/1).
rf_prob = rf.predict_proba(X_test)[:, 1]

# Same held-out ranking metrics as reported for the baseline.
for metric_label, metric_fn in (("ROC-AUC:", roc_auc_score), ("PR-AUC :", average_precision_score)):
    print(metric_label, metric_fn(y_test, rf_prob))

In [None]:
#Precision@K fraud ops metric
def precision_at_k(y_true, y_score, k=0.01):
    """Return the mean of ``y_true`` over the top ``k`` fraction of scores.

    With 0/1 labels this is the precision obtained when only the
    highest-scored ``k`` share of cases is reviewed.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Ground-truth labels. Any 1-D array-like is accepted (list, ndarray,
        pandas Series); a Series index is ignored, rows are matched to
        ``y_score`` by position.
    y_score : array-like of shape (n,)
        Model scores, higher meaning more suspicious.
    k : float, default 0.01
        Fraction of cases to keep. At least one case is always selected.

    Returns
    -------
    float
        Mean label among the selected cases, or NaN when the input is empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` have different lengths.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score, dtype=float)

    n = len(labels)
    if n != len(scores):
        raise ValueError(
            f"y_true and y_score must have the same length (got {n} and {len(scores)})"
        )
    if n == 0:
        return float("nan")

    # n * k can land a hair below the intended integer (100 * 0.29 ==
    # 28.999999999999996); the small epsilon keeps floor semantics while
    # absorbing that representation error.
    top_n = max(1, int(np.floor(n * k + 1e-9)))

    # A stable sort makes the selection deterministic when scores are tied:
    # tied cases keep their original relative order.
    idx = np.argsort(-scores, kind="stable")[:top_n]
    return float(labels[idx].mean())

# Compare both models at several review budgets (top 1%, 2%, 5% and 10% of
# scored test rows). The positional re-index of y_test does not depend on k,
# so it is built once up front rather than on every call.
y_test_positional = y_test.reset_index(drop=True)
for k in (0.01, 0.02, 0.05, 0.10):
    lr_p_at_k = precision_at_k(y_test_positional, lr_prob, k)
    rf_p_at_k = precision_at_k(y_test_positional, rf_prob, k)
    print(k, "LR P@K:", lr_p_at_k, "RF P@K:", rf_p_at_k)

In [None]:
# Persist the chosen model together with the exact feature column order it was
# trained on, so that scoring code can rebuild the input matrix consistently.
best_model = rf  # replace if LR performs better for your data

# joblib.dump does not create missing directories; make sure the target folder
# exists so this cell also works on a fresh checkout.
model_path = Path("models/fraud_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump({"model": best_model, "features": feature_cols}, model_path)
print("Saved models/fraud_model.pkl")