In [34]:
import joblib, numpy as np, pandas as pd
from sklearn.calibration import CalibratedClassifierCV, FrozenEstimator
import numpy as np

# load model & data
model = joblib.load("models/xgb_day6.joblib")
full = pd.read_parquet("data/processed/train_full_with_day6.parquet")

In [35]:
# get booster feature names (model's training features)
booster = model.get_booster()
trained_feats = booster.feature_names
print("Model expects", len(trained_feats), "features. Sample:", trained_feats[:10])

Model expects 485 features. Sample: ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']


In [36]:
# pick same positional splits
n=len(full); te=int(0.70*n); ve=int(0.85*n)

# create X_val_raw from full (all candidate features)
# we’ll force alignment to trained_feats below
X_val_raw = full.iloc[te:ve].copy()
y_val = full.iloc[te:ve]['isFraud'].astype(int)


In [37]:
# Add missing features (that model expects but are not in X_val_raw)
miss = [c for c in trained_feats if c not in X_val_raw.columns]
if miss:
    print("Adding missing columns (filled -999):", miss[:20], "… total", len(miss))
    for c in miss:
        X_val_raw[c] = -999

In [38]:
# Drop unexpected extras (not needed but harmless)
extra = [c for c in X_val_raw.columns if c not in trained_feats]
if extra:
    print("Dropping extra columns not in model (sample):", extra[:20], "… total", len(extra))
    X_val_raw = X_val_raw.drop(columns=extra)

Dropping extra columns not in model (sample): ['TransactionID', 'isFraud', 'TransactionDT', 'P_emaildomain', 'R_emaildomain', 'DeviceInfo', 'dt'] … total 7


In [39]:
# Reorder exactly as model.feature_names
X_val_aligned = X_val_raw[trained_feats].copy()

# Ensure numeric dtypes: convert object/categorical/bool -> numeric codes or ints
for c in X_val_aligned.columns:
    if X_val_aligned[c].dtype == 'object':
        X_val_aligned[c] = X_val_aligned[c].astype('category').cat.codes
    if str(X_val_aligned[c].dtype).startswith('category'):
        X_val_aligned[c] = X_val_aligned[c].cat.codes
    if X_val_aligned[c].dtype == 'bool':
        X_val_aligned[c] = X_val_aligned[c].astype('int8')

# Fill NaNs (match training fill strategy)
X_val_aligned = X_val_aligned.fillna(-999)

In [40]:
# quick sanity
print("Final X_val shape:", X_val_aligned.shape)
non_numeric = [c for c in X_val_aligned.columns if X_val_aligned[c].dtype not in (np.int8,np.int16,np.int32,np.int64,np.float16,np.float32,np.float64)]
print("Non-numeric left:", non_numeric)

Final X_val shape: (88581, 485)
Non-numeric left: []


In [41]:
# Calibrate using FrozenEstimator (no refit risk)
frozen = FrozenEstimator(model)
cal = CalibratedClassifierCV(frozen, method='sigmoid', cv="prefit")
cal.fit(X_val_aligned, y_val)   # should run without feature mismatch now

joblib.dump(cal, "models/xgb_day7_calibrated.joblib")
print("Calibrated model saved to models/xgb_day7_calibrated.joblib")




Calibrated model saved to models/xgb_day7_calibrated.joblib


  grad = np.asarray([-g @ F, -g.sum()], dtype=np.float64)
  grad = np.asarray([-g @ F, -g.sum()], dtype=np.float64)
  grad = np.asarray([-g @ F, -g.sum()], dtype=np.float64)


In [42]:
# probs on val (from calibrated model you just saved)
probs_val = cal.predict_proba(X_val_aligned)[:,1]

# distributions & trouble flags
print("probs min/median/max:", probs_val.min(), np.median(probs_val), probs_val.max())
print("unique probs count:", np.unique(np.round(probs_val,6)).size)
print("counts of extreme probs:", ((probs_val==0).sum(), (probs_val==1).sum()))
print("val pos count:", y_val.sum(), " / ", len(y_val))


probs min/median/max: 0.00554740928962251 0.006881584425518473 0.8089915802762615
unique probs count: 24078
counts of extreme probs: (np.int64(0), np.int64(0))
val pos count: 3042  /  88581


In [43]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score, classification_report
roc_val = roc_auc_score(y_val, probs_val)
prec, rec, ths = precision_recall_curve(y_val, probs_val); pr_val = auc(rec, prec)
# best F1 on val
f1s = [( (prec_i*rec_i*2)/(prec_i+rec_i+1e-9) ) for prec_i,rec_i in zip(prec,rec)]
best_idx = np.argmax(f1s)
best_th = ths[best_idx-1] if best_idx>0 else 0.5
print("ROC-val, PR-val:", roc_val, pr_val, "best-F1-th:", best_th)


ROC-val, PR-val: 0.9201276837409074 0.5910724257294672 best-F1-th: 0.43383615224333966


In [45]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# trained_feats, full, ve already exist from earlier cells
X_test_raw = full.iloc[ve:].copy()
y_test = full.iloc[ve:]['isFraud'].astype(int)

In [46]:
# add missing cols model expects
miss = [c for c in trained_feats if c not in X_test_raw.columns]
for c in miss:
    X_test_raw[c] = -999

In [48]:
# drop extras
extra = [c for c in X_test_raw.columns if c not in trained_feats]
if extra:
    X_test_raw = X_test_raw.drop(columns=extra)

# reorder
X_test_raw = X_test_raw[trained_feats].copy()

In [49]:
# Convert object/categorical/bool -> numeric codes BEFORE fillna
for c in X_test_raw.columns:
    if X_test_raw[c].dtype == 'object':
        X_test_raw[c] = X_test_raw[c].astype('category')
    # If categorical (including those you may have created earlier), convert to codes
    if str(X_test_raw[c].dtype).startswith('category'):
        # keep -1 for missing categories (cat.codes uses -1)
        X_test_raw[c] = X_test_raw[c].cat.codes.astype('int32')
    if X_test_raw[c].dtype == 'bool':
        X_test_raw[c] = X_test_raw[c].astype('int8')

In [50]:

# Now safe to fill NaNs with sentinel
X_test_aligned = X_test_raw.fillna(-999)

# sanity check
non_numeric = [c for c in X_test_aligned.columns if X_test_aligned[c].dtype not in (np.int8,np.int16,np.int32,np.int64,np.float16,np.float32,np.float64)]
print("Non-numeric after conversion (should be empty):", non_numeric)
print("X_test_aligned shape:", X_test_aligned.shape)

Non-numeric after conversion (should be empty): []
X_test_aligned shape: (88581, 485)


In [51]:
# predict & metrics
probs_test = cal.predict_proba(X_test_aligned)[:,1]
roc_test = roc_auc_score(y_test, probs_test)
prec, rec, _ = precision_recall_curve(y_test, probs_test)
pr_test = auc(rec, prec)
print("ROC-test:", round(roc_test,4), "PR-test:", round(pr_test,4))


ROC-test: 0.8899 PR-test: 0.5159


In [52]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import numpy as np

# probs_val, probs_test, y_val, y_test, best_th already exist

# Metrics
roc_val = roc_auc_score(y_val, probs_val)
roc_test = roc_auc_score(y_test, probs_test)

# PR-AUC (robust)
pr_val = average_precision_score(y_val, probs_val)
pr_test = average_precision_score(y_test, probs_test)

print("ROC-val:", round(roc_val,4), " ROC-test:", round(roc_test,4))
print("PR-val (average_precision):", round(pr_val,4), " PR-test:", round(pr_test,4))

ROC-val: 0.9201  ROC-test: 0.8899
PR-val (average_precision): 0.5911  PR-test: 0.516


In [53]:
# Classification report at chosen threshold
yhat_test = (probs_test >= best_th).astype(int)
print("\nTest classification report (th={:.4f}):".format(best_th))
print(classification_report(y_test, yhat_test, digits=4))

# Top-1% fraud capture
k = max(1, int(0.01 * len(probs_test)))
topk_idx = np.argsort(probs_test)[-k:]
# y_test is a pandas Series (aligned); get values at indices
top1_frac = y_test.iloc[topk_idx].sum() / (y_test.sum() + 1e-9)
print("Top-1% fraud capture:", round(top1_frac,4))



Test classification report (th=0.4338):
              precision    recall  f1-score   support

           0     0.9798    0.9909    0.9853     85498
           1     0.6332    0.4340    0.5150      3083

    accuracy                         0.9716     88581
   macro avg     0.8065    0.7125    0.7502     88581
weighted avg     0.9678    0.9716    0.9690     88581

Top-1% fraud capture: 0.2562


Calibration didn’t boost ROC but smoothed probability outputs.

Fraud recall improved vs baseline at similar precision.

Model captures fraud reasonably well at low thresholds while preserving high accuracy.