In [None]:



    # NOTE(review): orphaned, indented fragment — no enclosing statement (e.g. a
    # `with` block) is visible in this cell, so it cannot run as written.
    # `model`, `X_train` and `y_train` are only assigned in later cells, and
    # fit() would presumably retrain `model` — confirm whether this cell is needed.
    # code that produces warnings
    model.fit(X_train, y_train)

In [None]:
import warnings

In [None]:
import joblib
import pandas as pd
import matplotlib.pyplot as plt

# Restore the persisted model. joblib.load unpickles the file, so only point it
# at artifacts you trust.
model = joblib.load('models/xgb_day6.joblib')

# Ask the underlying booster for its per-feature "gain" scores
# (a mapping of feature name -> score).
booster = model.get_booster()
importance = booster.get_score(importance_type='gain')

# One row per scored feature, highest gain first. The original row labels are
# kept (no index reset).
imp_df = pd.DataFrame(list(importance.items()), columns=['feature', 'gain'])
imp_df = imp_df.sort_values(by='gain', ascending=False)

# Preview the 20 highest-gain features.
imp_df.head(20)

In [None]:
# Horizontal bar chart of the 50 highest-gain features.
top = imp_df.head(50)

fig, ax = plt.subplots(figsize=(6, 12))
ax.barh(top['feature'], top['gain'])
ax.invert_yaxis()  # put the first (highest-gain) row at the top of the chart
ax.set_title("Top 50 Features (Gain Importance)")
plt.show()


In [None]:
# Read the processed training table and print its (rows, columns) shape.
full = pd.read_parquet("data/processed/train_full_with_day6.parquet")
n_rows, n_cols = full.shape
print((n_rows, n_cols))

In [None]:
# Columns excluded from the feature matrix. "isFraud" is the label used as `y`
# in the split below; the remaining entries are left out of the model inputs.
drop_cols = [
    "isFraud",
    "TransactionDT",
    "TransactionID",
    "P_emaildomain",
    "R_emaildomain",
    "DeviceInfo",
    "dt",
]

# Every remaining column, in the table's own column order.
feature_cols = [column for column in full.columns if column not in drop_cols]

# Number of features.
len(feature_cols)

In [None]:
# Feature columns stored as object/category dtype are replaced in place by
# their integer category codes (pandas encodes missing values as -1 there).
text_like_cols = [
    col
    for col in feature_cols
    if full[col].dtype == "object" or full[col].dtype == "category"
]
for col in text_like_cols:
    full[col] = full[col].astype("category").cat.codes

# Any NaN still present in the feature columns becomes the sentinel -999.
full[feature_cols] = full[feature_cols].fillna(-999)

In [None]:
# Positional 70% / 15% / 15% split in the table's existing row order.
# NOTE(review): presumably `full` is already sorted by time, making this a
# temporal split — confirm against the upstream preprocessing.
n = len(full)
te, ve = int(0.70 * n), int(0.85 * n)

train_part = full.iloc[:te]
val_part = full.iloc[te:ve]
test_part = full.iloc[ve:]

X_train, y_train = train_part[feature_cols], train_part["isFraud"]
X_val, y_val = val_part[feature_cols], val_part["isFraud"]
X_test, y_test = test_part[feature_cols], test_part["isFraud"]

In [None]:
# Draw a reproducible sample of validation rows for SHAP. The size is capped at
# len(X_val) because DataFrame.sample raises ValueError when asked for more rows
# than exist (without replacement); with >= 5000 rows the sample is unchanged.
X_val_sample = X_val.sample(n=min(5000, len(X_val)), random_state=42)

In [None]:
import shap
# Load SHAP's JavaScript so interactive plots can render in the notebook.
shap.initjs()

# Build a tree explainer for the loaded model, then compute SHAP values for
# the sampled validation rows only (row k of `shap_vals` matches
# X_val_sample.iloc[k]).
explainer = shap.TreeExplainer(model)
shap_vals = explainer.shap_values(X_val_sample)

In [None]:
# Global view: SHAP summary plot over the sampled rows, limited to 40 features.
shap.summary_plot(shap_values=shap_vals, features=X_val_sample, max_display=40)

In [None]:
# Local explanation for one fraud row that is part of the SHAP sample.
# `shap_vals` was computed from X_val_sample, so its rows are positional,
# while `y_val` is addressed by index label; the label must be converted to a
# position before indexing into `shap_vals`.

# Align the labels to the sampled rows in one vectorized lookup instead of a
# per-row Python loop of .loc calls.
sample_labels = y_val.loc[X_val_sample.index]
fraud_idxs_in_sample = sample_labels.index[sample_labels == 1].tolist()
if not fraud_idxs_in_sample:
    raise RuntimeError("No fraud rows in X_val_sample — resample with more positives.")
sample_idx = fraud_idxs_in_sample[0]         # index label of the first fraud row

# Convert the index label to its position inside the sample.
# NOTE(review): get_loc returns an int only when the index is unique — confirm.
pos = X_val_sample.index.get_loc(sample_idx)

# Force plot for that single row (last expression, so the notebook displays it).
shap.initjs()
shap.force_plot(explainer.expected_value, shap_vals[pos, :], X_val_sample.iloc[pos, :])


In [None]:
# Force plot for an arbitrary sampled row chosen by position.
# Note: this rebinds `n`, which the split cell earlier set to len(full).
n = 10   # zero-based position inside the sample, i.e. the 11th row
shap.initjs()
shap.force_plot(explainer.expected_value, shap_vals[n], X_val_sample.iloc[n])


In [None]:
# Explain the first fraud row of the full validation split (not just the
# SHAP sample) by computing SHAP values for that one row on demand.

# Fail with a clear message, in the same style as the sample-based cell, rather
# than an opaque IndexError when the validation split has no positives.
fraud_labels = y_val.index[y_val == 1]
if len(fraud_labels) == 0:
    raise RuntimeError("No fraud rows in y_val — cannot build a single-row explanation.")
idx_full = fraud_labels[0]            # index label of the first fraud row

# Double brackets keep a one-row DataFrame (column order as in X_val) instead
# of collapsing to a Series.
row_df = X_val.loc[[idx_full]]

# SHAP values for just this row; element 0 is the row's contribution vector.
shap_vals_row = explainer.shap_values(row_df)

shap.initjs()
shap.force_plot(explainer.expected_value, shap_vals_row[0], row_df.iloc[0])


In [None]:
# Score the whole validation split and attach truth, decision and probability
# to a copy of the features for error analysis.
th = 0.433836  # best F1 threshold

probs = model.predict_proba(X_val)[:, 1]   # second column of predict_proba
pred = (probs > th).astype(int)            # 1 when the probability exceeds th

# assign() returns a new frame, so X_val itself is left untouched.
val_errors = X_val.assign(
    y_true=y_val.values,
    y_pred=pred,
    proba=probs,
)

In [None]:
# False negatives: true label 1 but predicted 0, lowest probability first.
is_false_negative = (val_errors["y_true"] == 1) & (val_errors["y_pred"] == 0)
FN = val_errors.loc[is_false_negative].sort_values(by="proba")

FN.head(20)

In [None]:
# False positives: true label 0 but predicted 1, highest probability first.
is_false_positive = (val_errors["y_true"] == 0) & (val_errors["y_pred"] == 1)
FP = val_errors.loc[is_false_positive].sort_values(by="proba", ascending=False)

FP.head(20)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Train-vs-test drift check: overlay the KDE of each selected feature for the
# two splits on one figure per feature.
cols_to_check = ["TransactionAmt", "card1", "addr1", "P_emaildomain_freq"]

# Missing values in the feature columns were replaced by -999 earlier in this
# notebook. Left in, that sentinel adds an artificial spike far from the real
# values and stretches the x-axis, so it is excluded before estimating density.
missing_sentinel = -999

with warnings.catch_warnings():
    # Silence plotting-library warnings for this cell only.
    warnings.simplefilter("ignore")
    for c in cols_to_check:
        fig, ax = plt.subplots(figsize=(6, 3))
        for split_name, split_df in (("Train", X_train), ("Test", X_test)):
            observed = split_df.loc[split_df[c] != missing_sentinel, c]
            sns.kdeplot(observed, label=split_name, ax=ax)
        ax.set_title(f"Drift Check: {c}")
        ax.legend()
        plt.show()
        plt.close(fig)  # release the figure; one is created per feature


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train-vs-test drift check on log1p(TransactionAmt).
# Each curve is drawn from its own split. Plotting the full table for both
# would produce two identical curves and could never reveal drift.
with warnings.catch_warnings():
    # Silence plotting-library warnings for this cell only.
    warnings.simplefilter("ignore")
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.kdeplot(np.log1p(X_train["TransactionAmt"]), label="Train", bw_adjust=1.2, ax=ax)
    sns.kdeplot(np.log1p(X_test["TransactionAmt"]), label="Test", bw_adjust=1.2, ax=ax)
    ax.set_title("Drift Check: log(TransactionAmt)")
    ax.set_xlabel("log1p(TransactionAmt)")
    ax.legend()  # without a legend the Train/Test labels are never shown
    plt.show()


In [None]:
# Persist the gain-importance table. The output directory is created first so
# the export does not fail on a fresh checkout where `analysis/` is missing.
import os

os.makedirs("analysis", exist_ok=True)
imp_df.to_csv("analysis/day8_feature_importance.csv", index=False)

* Computed SHAP values to understand how features push predictions up/down.
* Generated global explainability: mean |SHAP| values → top important features (TransactionAmt, card1, addr1, email frequencies, V-features).
* Generated local explainability for a few fraud cases using force plots (fixed an index-alignment issue by converting each row's DataFrame index label to its positional index within the SHAP sample).
* Created drift plots (KDE, train vs. test) for key features: TransactionAmt, card1, addr1, and P_emaildomain_freq.

* Verified no harmful drift — only expected temporal differences (e.g., TransactionAmt drift only in rare high tail).

* Confirmed all features remain aligned with training schema (no missing or extra columns).

* Computed drift stats: null rates, means, stds, KS-test for numeric features.

* Documented findings: model stable, no retraining required, SHAP shows reasonable behavior.