In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv
/kaggle/input/frddtctmodel1/fraud_pipeline_final (1).pkl


In [13]:
# ------------------------------------------------------------------
# STEP 1: INSTALLATION & IMPORTS
# ------------------------------------------------------------------
# Install required libraries for explainability and graph processing

"""
End-to-end fraud pipeline with Train / Val / Test, SHAP explanation, and optional LLM
integration for human-readable explanations.

Requirements:

"""
!pip install groq
!pip install -q networkx
!pip install xgboost shap openai joblib
import os
import numpy as np
import pandas as pd
import networkx as nx
import xgboost as xgb
import shap
import joblib
from scipy.stats import randint, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import classification_report, precision_recall_curve, confusion_matrix
import warnings
#warnings.filterwarnings('default')
# Suppress pandas deprecation warnings (these don't affect functionality)
warnings.filterwarnings('ignore', message='.*is_sparse.*', category=FutureWarning)
warnings.filterwarnings('ignore', message='is_sparse is deprecated')
warnings.filterwarnings('default', category=UserWarning)  # Still show important user warnings

print("Step 1: Environment Ready.")

Step 1: Environment Ready.


In [20]:
# ------------------------------------------------------------------
# STEP 2: DATA OPTIMIZATION
# ------------------------------------------------------------------
def reduce_mem_usage(df):
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage. """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32) # float16 has lower precision, using 32
                else:
                    df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage decreased to {end_mem:.2f} MB ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df



In [18]:
# Load Dataset (Assuming file is in Kaggle input directory)
# NOTE: PaySim dataset is usually at /kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv
# Adjust path if necessary.
file_path = '/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv'

# Read CSV
df = pd.read_csv(file_path)
df = reduce_mem_usage(df)

# Rename columns for clarity
df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig', 
                        'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})

print(f"Step 2: Data Loaded. Shape: {df.shape}")
df.head(10)

Memory usage decreased to 291.26 MB (45.5% reduction)
Step 2: Data Loaded. Shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.639648,C1231006815,170136.0,160296.359375,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.280029,C1666544295,21249.0,19384.720703,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.139648,C2048537720,41554.0,29885.859375,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.709961,C90045638,53860.0,46042.289062,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77002,C154988899,183195.0,176087.234375,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.640137,C1912850431,176087.234375,168225.59375,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.360107,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77002,C712410124,41720.0,36382.230469,C195600860,41898.0,40348.789062,0,0


In [None]:
df.describe()

In [None]:
df.groupby('type')['isFraud'].sum()


In [None]:
df.sample(10)


In [None]:
df.describe(include='all')

In [22]:
# ----------------------------
# 1) FEATURE ENGINEER (Fitted inside pipeline)
# ----------------------------
class FraudFeatureEngineer(BaseEstimator, TransformerMixin):
    """Vectorized, deterministic feature transformer.
    - Builds weighted directed graph (aggregated by (origin,dest) counts)
    - Creates frequency, ratio, log and graph features
    """
    def __init__(self, pagerank_limit=None):
        self.pagerank_limit = pagerank_limit
        self.stats = {}
        self.graph_meta = {}
        self.type_map = {'TRANSFER': 0, 'CASH_OUT': 1}
        self.global_mean = 0.0

    def fit(self, X, y=None):
        X = X.copy()
        # Sort by time if available for deterministic graphs
        if 'step' in X.columns:
            X = X.sort_values('step')

        # Basic global stat
        self.global_mean = float(X['amount'].mean())
        self.global_median = float(X['amount'].median())

        # Frequency & mean
        self.stats['orig_counts'] = X['nameOrig'].value_counts().to_dict()
        self.stats['dest_counts'] = X['nameDest'].value_counts().to_dict()
        self.stats['orig_mean_amt'] = X.groupby('nameOrig')['amount'].mean().to_dict()
        self.stats['orig_median_amt'] = (X.groupby('nameOrig')['amount'].median().to_dict())
        self.stats['orig_log_median_amt'] = (X.groupby('nameOrig')['amount'].apply(lambda s: np.log1p(s).median()).to_dict())
        self.stats['last_step'] = X.groupby('nameOrig')['step'].last().to_dict()

        # Weighted graph: count transactions per (origin,dest)
        edge_weights = X.groupby(['nameOrig', 'nameDest']).size().reset_index(name='weight')
        G = nx.DiGraph()
        if not edge_weights.empty:
            edges = [(r['nameOrig'], r['nameDest'], float(r['weight'])) for _, r in edge_weights.iterrows()]
            G.add_weighted_edges_from(edges, weight='weight')

        self.graph_meta['in_degree'] = dict(G.in_degree(weight='weight'))
        self.graph_meta['out_degree'] = dict(G.out_degree(weight='weight'))

    
        # Pagerank: limit nodes if requested to save time/memory
        try:
            if G.number_of_nodes() == 0:
                self.graph_meta['pagerank'] = {}
            elif self.pagerank_limit and self.pagerank_limit < G.number_of_nodes():
                top_nodes = sorted(G.degree(weight='weight'), key=lambda x: x[1], reverse=True)[:self.pagerank_limit]
                keep = set(n for n, _ in top_nodes)
                sub = G.subgraph(keep).copy()
                self.graph_meta['pagerank'] = nx.pagerank(sub, alpha=0.85, weight='weight', tol=1e-4)
            else:
                self.graph_meta['pagerank'] = nx.pagerank(G, alpha=0.85, weight='weight', tol=1e-4)
        except Exception:
            # Pagerank failure should not break training
            self.graph_meta['pagerank'] = {}

        return self

    def transform(self, X):
        X = X.copy()

        # Time features
        X['hour'] = X['step'] % 24 if 'step' in X.columns else 0
        '''
        # Balance-check features
        X['errorBalanceOrig'] = X['newBalanceOrig'].fillna(0) + X['amount'].fillna(0) - X['oldBalanceOrig'].fillna(0)
        X['errorBalanceDest'] = X['oldBalanceDest'].fillna(0) + X['amount'].fillna(0) - X['newBalanceDest'].fillna(0)
        '''
        # Frequency mapping
        X['orig_txn_count'] = X['nameOrig'].map(self.stats.get('orig_counts', {})).fillna(0).astype(int)
        X['dest_txn_count'] = X['nameDest'].map(self.stats.get('dest_counts', {})).fillna(0).astype(int)

        # Ratio features
        user_mean = X['nameOrig'].map(self.stats.get('orig_mean_amt', {})).fillna(self.global_mean)
        X['amt_ratio_to_user_mean'] = X['amount'] / (user_mean + 1.0)
        X['amount_log1p'] = np.log1p(X['amount'])
        X['amount_over_oldBalanceOrig'] = X['amount'] / (X['oldBalanceOrig'].replace(0, np.nan).fillna(1.0))
        
        user_median = X['nameOrig'].map(self.stats.get('orig_median_amt', {})).fillna(self.global_median)
        # Apply fallback to global median for users with too few transactions
        MIN_TXNS = 3
        user_median = np.where(X['orig_txn_count'] >= MIN_TXNS, user_median, self.global_median)
        X['amt_ratio_to_user_median'] = (X['amount'] / (user_median + 1.0))
        
        user_log_median = X['nameOrig'].map(self.stats['orig_log_median_amt']).fillna(np.log1p(self.global_median))
        X['amt_log_ratio_to_user_median'] = (np.log1p(X['amount']) / (user_log_median + 1e-6))


        
        # Graph features
        X['in_degree'] = X['nameDest'].map(self.graph_meta.get('in_degree', {})).fillna(0).astype(float)
        X['out_degree'] = X['nameOrig'].map(self.graph_meta.get('out_degree', {})).fillna(0).astype(float)
        X['network_trust'] = X['nameOrig'].map(self.graph_meta.get('pagerank', {})).fillna(0.0).astype(float)

        # New/novelty flags
        X['is_new_origin'] = (X['orig_txn_count'] == 0).astype(int)
        X['is_new_dest'] = (X['dest_txn_count'] == 0).astype(int)

        # Type encoding (fast & vectorized)
        X['type_encoded'] = X['type'].map(self.type_map).fillna(-1).astype(int)

        # Drop identifiers and non-numeric columns
        for c in ['nameOrig', 'nameDest', 'type', 'isFlaggedFraud']:
            if c in X.columns:
                X.drop(columns=c, inplace=True)

        # Return numeric-only DataFrame expected by XGBoost & SHAP
        return X.select_dtypes(include=[np.number])

In [32]:
# ----------------------------
# 2) Split helpers: test temporal, train/test only (no validation)
# ----------------------------
def make_splits(df, test_frac=0.10, time_col='step', min_test_fraud=100, random_state=42):
    """
    Returns: X_train, X_test, y_train, y_test, time_used
    - Test is time-based (last test_frac proportion) if `time_col` exists.
    - Otherwise a stratified test split is used.
    - If temporal test exists but contains < min_test_fraud frauds, fallback to stratified test split.
    """
    # Temporal test split
    if time_col in df.columns:
        cutoff = df[time_col].quantile(1 - test_frac)
        train_df = df[df[time_col] <= cutoff].reset_index(drop=True)
        test_df = df[df[time_col] > cutoff].reset_index(drop=True)
    else:
        train_df = df.copy()
        test_df = pd.DataFrame(columns=df.columns)  # empty, fallback

    # If test is too small or contains too few frauds, fallback to stratified split on entire dataset
    if (test_df.empty) or (test_df['isFraud'].sum() < min_test_fraud):
        X_temp = df.drop(columns=['isFraud'])
        y_temp = df['isFraud']
        X_train, X_test, y_train, y_test = train_test_split(
            X_temp, y_temp, test_size=test_frac, stratify=y_temp, random_state=random_state
        )
        return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True), False
    else:
        X_train = train_df.drop(columns=['isFraud'])
        y_train = train_df['isFraud']
        X_test = test_df.drop(columns=['isFraud'])
        y_test = test_df['isFraud']
        return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True), True

In [35]:
# ----------------------------
# 3) Training function: train on TRAIN, test on TEST 
# ----------------------------
def train_pipeline(df,
                   pagerank_limit=100000,
                   random_state=42,
                   search_iter=3,
                   target_recall=0.95):
    """
    Full training flow adapted to only use TRAIN and TEST:
      - splits data (train / test; temporal test when possible)
      - randomized search on TRAIN only (with internal CV)
      - selects threshold using cross-validated predictions on TRAIN
      - refits final pipeline on TRAIN (no early stopping, since no val)
      - returns final_pipeline, optimal_threshold, and test set for evaluation
    """
    # Prepare splits (now returns X_train, X_test, y_train, y_test, time_used)
    X_train, X_test, y_train, y_test, time_used = make_splits(
        df, test_frac=0.05, time_col='step', random_state=random_state
    )

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}; time_split_used={time_used}")
    print("Fraud counts -> train:", int(y_train.sum()), "test:", int(y_test.sum()))


    # Build pipeline objects
    fe = FraudFeatureEngineer(pagerank_limit=pagerank_limit)
    #-----trnsOutSrch
    print("\n[Step A] Running Feature Engineering on FULL Training Set (Once)...") 
    # Fit and transform the raw X_train once
    X_train_search = fe.fit_transform(X_train, y_train)
    #-----trnsOutSrch
    xgb_base = xgb.XGBClassifier(objective='binary:logistic',
                                 tree_method="hist",
                                 device=device,
                                 n_jobs=1,  # keep job control to outer search
                                 random_state=random_state,
                                 use_label_encoder=False,
                                 verbosity=1)

    pipeline = Pipeline([('fe', fe), ('clf', xgb_base)])

    # Heuristic scale_pos_weight (from train)
    neg = (y_train == 0).sum()
    pos = (y_train == 1).sum()
    spw = int(max(1, neg / max(1, pos)))  #scale_pos_weight

    param_dist = {
        'n_estimators': randint(300, 900),
        'max_depth': randint(4, 8),
        'learning_rate': uniform(0.03, 0.12),
        'scale_pos_weight': [spw],
        'subsample': uniform(0.7, 0.3),
        'colsample_bytree': uniform(0.6, 0.4)
    }
    
    """
    param_dist = {
    # Number of trees
    'clf__n_estimators': randint(300, 900),
    # Tree depth (PaySim works best shallow–medium)
    'clf__max_depth': randint(4, 8),
    # Learning rate (avoid very small, too slow)
    'clf__learning_rate': uniform(0.03, 0.12),
    # Imbalance handling (keep fixed or narrow)
    'clf__scale_pos_weight': [spw],
    # Row sampling (important for generalization)
    'clf__subsample': uniform(0.7, 0.3),
    # Column sampling (often overlooked, very important)
    'clf__colsample_bytree': uniform(0.6, 0.4)
    }
    """
    ######
    ####
    ###-----trnsOutSrch srch optmize .  pipeline --> xgb_base  X_train --> X_train_search
    search = RandomizedSearchCV(xgb_base, param_distributions=param_dist, n_iter=search_iter,
                                scoring='average_precision', cv=3, verbose=2, n_jobs=-1, random_state=random_state,
                                refit=True)
    # Fit search on TRAIN only
    print(f"\nStarting RandomizedSearchCV: {search_iter} candidates × 3 folds = {search_iter * 3} model fits")
    print("This may take several minutes depending on dataset size. Progress will be shown below...\n")
    search.fit(X_train_search, y_train)
    print("\n\nRandom search best params:", search.best_params_)

    
    # ---------- Threshold selection using cross-validated predictions on TRAIN ----------
    # Use cross_val_predict to get out-of-fold probabilities on training data (keeps test untouched)
    try:
        train_probs_cv = cross_val_predict(
            search.best_estimator_, X_train_search, y_train, cv=3,
            method='predict_proba', n_jobs=2
        )[:, 1]
    except Exception:
        # Fallback: use the fitted pipeline's predict_proba on X_train (risk of optimistic threshold)
        train_probs_cv = search.predict_proba(X_train_search)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_train, train_probs_cv)
    idxs = np.where(recall[:-1] >= target_recall)[0]
    if len(idxs) > 0:
        optimal_threshold = float(thresholds[idxs[-1]])  # largest threshold preserving recall
    else:
        optimal_threshold = float(thresholds[0]) if len(thresholds) > 0 else 0.2

    print(f"Chosen threshold from TRAIN (CV) : {optimal_threshold:.8f} (target recall {target_recall})")

    # Optionally print training performance at chosen threshold (out-of-fold)
    train_preds_cv = (train_probs_cv >= optimal_threshold).astype(int)
    print("\nTraining (CV) performance at chosen threshold:")
    print(classification_report(y_train, train_preds_cv))
    print("Train Confusion Matrix:\n", confusion_matrix(y_train, train_preds_cv))

    # ---------- Refit final model on full TRAIN ----------
    # Fit a fresh feature engineer on entire train
    fe_final = FraudFeatureEngineer(pagerank_limit=pagerank_limit)
    fe_final.fit(X_train, y_train)

    X_train_trans = fe_final.transform(X_train)
    #X_test_trans = fe_final.transform(X_test) if len(X_test) > 0 else None
    print("\nSample of X_train_trans:")
    print(X_train_trans.sample(10))


    # Build classifier with best params (no early stopping since no val set)
    #cls_params = {k.replace('clf__', ''): v for k, v in search.best_params_.items() if k.startswith('clf__')}
    cls_params =search.best_params_
    clf_final = xgb.XGBClassifier(objective='binary:logistic',
                                  tree_method="hist",
                                  device=device,
                                  n_jobs=1,
                                  random_state=random_state,
                                  use_label_encoder=False,
                                  verbosity=2,
                                  **cls_params)

    clf_final.fit(X_train_trans, y_train)

    # Final pipeline contains fitted FE and fitted classifier
    final_pipeline = Pipeline([('fe', fe_final), ('clf', clf_final)])
    #
    ##----refit=True
    ###final_pipeline = Pipeline([('fe', fe), ('clf', search.best_estimator_)])
    
    # Save pipeline artifact
    joblib.dump(final_pipeline, 'fraud_pipeline_final.pkl')
    print("Saved final pipeline to 'fraud_pipeline_final.pkl'")

    # Return pipeline, threshold and test partitions (raw df forms for later inference)
    return final_pipeline, optimal_threshold, X_test, y_test

In [36]:
# ----------------------------
# 4) SHAP + LLM explain function and prediction wrapper
# ----------------------------
def predict_and_explain(pipeline, transaction_df, threshold, shap_background=None, topk=6, llm_api_key=None):
    """
    - pipeline: fitted sklearn Pipeline with steps 'fe' and 'clf'
    - transaction_df: raw transaction(s) DataFrame (same columns used in training)
    - threshold: numeric threshold chosen from validation
    - shap_background: optional background DataFrame for SHAP explainer (raw, not transformed).
                       If None, the function will attempt to sample from pipeline.fe.stats if available.
    - topk: how many top contributors to include in the explanation
    - llm_api_key: if provided, function will call Groq API to generate a textual explanation.

    Returns: dict with keys:
      'probabilities', 'decisions', 'shap_table' (DataFrame), 'llm_explanation' (str or None)
    """
    # Predict probabilities & binary decision
    probs = pipeline.predict_proba(transaction_df)[:, 1]
    decisions = (probs >= threshold).astype(int)

    # Prepare features for SHAP (transformed numeric features)
    fe = pipeline.named_steps['fe']
    clf = pipeline.named_steps['clf']

    X_trans = fe.transform(transaction_df)  # numeric matrix with columns in fixed order
    # Determine background for SHAP: prefer provided, else use fe.stats to synthesize small background
    if shap_background is None:
        # If we have access to saved stats, try to build a tiny synthetic background sample.
        # Fallback: use the transaction itself repeated (not ideal but safe).
        try:
            # if fe.stats contains 'orig_counts', we might not have raw rows; fallback above
            shap_background_trans = X_trans.iloc[[0]].copy()
        except Exception:
            shap_background_trans = X_trans.iloc[[0]].copy()
    else:
        # transform provided background raw df
        shap_background_trans = fe.transform(shap_background)

    # Create SHAP explainer
    try:
        # shap.Explainer is model-agnostic and often handles sklearn wrappers well
        explainer = shap.Explainer(clf, shap_background_trans, feature_names=X_trans.columns.tolist())
        shap_exp = explainer(X_trans)  # Explanation object
        # shap_exp.values shape: (n_samples, n_features)
        shap_vals = shap_exp.values[0] if shap_exp.values.ndim == 2 else shap_exp.values
        feature_names = X_trans.columns.tolist()
    except Exception:
        # Last-resort fallback using TreeExplainer on the raw booster - works for xgboost
        try:
            explainer = shap.TreeExplainer(clf)
            shap_vals = explainer.shap_values(X_trans)[0]
            feature_names = X_trans.columns.tolist()
        except Exception:
            # Unable to compute SHAP; return empty table
            shap_vals = np.zeros(X_trans.shape[1])
            feature_names = X_trans.columns.tolist()

    # Build a DataFrame of feature contributions
    feat_df = pd.DataFrame({
        'feature': feature_names,
        'value': X_trans.iloc[0].values,
        'shap_abs': np.abs(shap_vals),
        'shap': shap_vals
    })
    feat_df = feat_df.sort_values('shap_abs', ascending=False).reset_index(drop=True)

    # Prepare LLM prompt summarizing top contributors
    top_feats = feat_df.head(topk)[['feature', 'value', 'shap']].copy()
    top_lines = []
    for _, row in top_feats.iterrows():
        top_lines.append(f"- {row['feature']}: value={row['value']:.6g}, shap={row['shap']:.6g}")

    llm_explanation_text = None
    if llm_api_key:
        try:
            from groq import Groq  # <--- CHANGED: Import Groq instead of openai
            
            client = Groq(api_key=llm_api_key) # <--- CHANGED: Initialize Groq client
            
            system_prompt = (
                "You are a concise fraud-analytics assistant. "
                "Given feature contributions (SHAP values) and their observed values for a single transaction, "
                "produce a short (3-6 sentences) human-readable explanation why the model assigned the given fraud probability. "
                "Mention which factors increase or decrease risk, and a brief recommended action (e.g., block / review / allow)."
            )
            user_prompt = (
                f"Model fraud probability: {probs[0]:.4f}\n"
                f"Threshold for blocking: {threshold:.4f}\n"
                f"Top contributing features (feature: value, shap):\n" + "\n".join(top_lines) +
                "\n\nWrite the short explanation now."
            )

            # <--- CHANGED: Call Groq's chat.completions with a free Llama 3 model
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model = "llama-3.1-8b-instant"  , # <--- CHANGED: Use free 'llama3-8b-8192' model
                temperature=0.0,
                max_tokens=250
            )
            
            # <--- CHANGED: Access response content (structure is same as OpenAI)
            llm_explanation_text = chat_completion.choices[0].message.content.strip()
            
        except Exception as e:
            llm_explanation_text = f"(LLM generation failed: {str(e)})"
    # Return structured outputs
    return {
        'probabilities': probs,
        'decisions': decisions,
        'shap_table': feat_df,
        'llm_explanation': llm_explanation_text
    }


In [39]:
# ----------------------------
# 5) Example usage: train, then predict and explain on one sample
# ----------------------------
if __name__ == "__main__":
    # Load dataset (adjust path as needed)
    # Adjust path if necessary.
    file_path = '/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv'

    # Read CSV
    df = pd.read_csv(file_path)
    df = reduce_mem_usage(df)
    df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig',
                            'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})
    df = df[df['type'].isin(['TRANSFER', 'CASH_OUT'])].reset_index(drop=True)

    # Train pipeline (this will take some time depending on data size)
    pipeline, threshold, X_test_raw, y_test = train_pipeline(df, pagerank_limit=100000,
                                                            random_state=42, search_iter=6,
                                                            target_recall=0.95)

    # Example single new transaction (raw columns same as training raw df)
    new_transaction = pd.DataFrame([{
        'step': df['step'].max() + 1,
        'type': 'CASH_OUT',
        'amount': 500000.0,
        'nameOrig': 'C12345_NEW_USER',
        'oldBalanceOrig': 500000.0,
        'newBalanceOrig': 0.0,
        'nameDest': 'C99999_EXISTING_BAD',
        'oldBalanceDest': 0.0,
        'newBalanceDest': 0.0,
        'isFlaggedFraud': 0
    }])

    # Optionally supply a small background sample for SHAP (raw rows from training)
    # Here we sample 200 rows from the original training area if available
    shap_bg = None
    try:
        # if we have a test partition raw DataFrame, use some rows from it as background (or from df earlier)
        shap_bg = df.sample(n=200, random_state=42).drop(columns=['isFraud'])
    except Exception:
        shap_bg = None

    # If you want LLM textual explanations, provide OPENAI_API_KEY environment variable or pass into function
    GROQ_API_KEY = os.getenv('GROQ_API_KEY', "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")

    # Predict and explain
    # <--- CHANGED: Pass 'llm_api_key' instead of 'openai_api_key'
    result = predict_and_explain(pipeline, new_transaction, threshold, shap_background=shap_bg,
                                 topk=6, llm_api_key=GROQ_API_KEY)
    
    print(f"Fraud probability: {result['probabilities'][0]:.4f}")
    print(f"Decision (threshold {threshold:.4f}): {'BLOCK' if result['decisions'][0] == 1 else 'ALLOW'}")
    print("\nTop SHAP contributors:")
    print(result['shap_table'].head(10).to_string(index=False))

    if result['llm_explanation']:
        print("\nLLM Explanation:")
        print(result['llm_explanation'])
    else:
        print("\nNo LLM explanation generated (provide GROQ_API_KEY environment variable to enable).")

Memory usage decreased to 291.26 MB (45.5% reduction)
Train size: 2632630, Test size: 137779; time_split_used=True
Fraud counts -> train: 5275 test: 2938

[Step A] Running Feature Engineering on FULL Training Set (Once)...

Starting RandomizedSearchCV: 6 candidates × 3 folds = 18 model fits
This may take several minutes depending on dataset size. Progress will be shown below...

Fitting 3 folds for each of 6 candidates, totalling 18 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.






Random search best params: {'colsample_bytree': 0.7599443886861021, 'learning_rate': 0.03559987958563385, 'max_depth': 7, 'n_estimators': 489, 'scale_pos_weight': 498, 'subsample': 0.7271819303598462}
[CV] END colsample_bytree=0.749816047538945, learning_rate=0.14408571676918994, max_depth=6, n_estimators=371, scale_pos_weight=498, subsample=0.8795975452591109; total time=  22.9s
[CV] END colsample_bytree=0.6624074561769746, learning_rate=0.048719342440344315, max_depth=6, n_estimators=758, scale_pos_weight=498, subsample=0.9598528437324805; total time=  31.1s
[CV] END colsample_bytree=0.9329770563201687, learning_rate=0.055480693281393136, max_depth=7, n_estimators=576, scale_pos_weight=498, subsample=0.8852444528883149; total time=  28.0s
[CV] END colsample_bytree=0.8446612641953124, learning_rate=0.030847956626366087, max_depth=4, n_estimators=860, scale_pos_weight=498, subsample=0.8574323980775167; total time=  29.1s
[CV] END colsample_bytree=0.6624074561769746, learning_rate=0.0

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Chosen threshold from TRAIN (CV) : 0.12684587 (target recall 0.95)

Training (CV) performance at chosen threshold:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2627355
           1       0.52      0.95      0.67      5275

    accuracy                           1.00   2632630
   macro avg       0.76      0.97      0.83   2632630
weighted avg       1.00      1.00      1.00   2632630

Train Confusion Matrix:
 [[2622668    4687]
 [    263    5012]]

Sample of X_train_trans:
         step        amount  oldBalanceOrig  newBalanceOrig  oldBalanceDest  \
241177     21  2.254583e+05        0.000000        0.000000    9.688909e+05   
813357    163  9.987603e+04   121968.679688    22092.650391    8.517879e+05   
898952    181  2.177635e+05     2144.000000        0.000000    0.000000e+00   
1115321   206  3.778693e+05        0.000000        0.000000    1.038409e+06   
277956     35  2.280544e+05   324550.000000    96495.562500    2.216491e+

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fraud probability: 0.9999
Decision (threshold 0.1268): BLOCK

Top SHAP contributors:
                   feature         value  shap_abs      shap
amount_over_oldBalanceOrig      1.000000 11.421242 11.421242
            oldBalanceOrig 500000.000000  3.794964  3.794964
            oldBalanceDest      0.000000  2.329231  2.329231
                      hour      0.000000  1.415768  1.415768
                      step    744.000000  0.591295  0.591295
            newBalanceOrig      0.000000  0.524133  0.524133
                    amount 500000.000000  0.509880  0.509880
            dest_txn_count      0.000000  0.166954  0.166954
              type_encoded      1.000000  0.114061  0.114061
              amount_log1p     13.122365  0.081734  0.081734

LLM Explanation:
The model assigned a high fraud probability of 0.9999, indicating a strong likelihood of a fraudulent transaction. The top contributing factors are:

- A large transaction amount (1) compared to the old balance (500,000), incr

In [None]:
import joblib
X_train, X_test_raw, y_train, y_test, time_used = make_splits(
        df, test_frac=0.05, time_col='step', random_state=random_state)
GROQ_API_KEY = os.getenv('GROQ_API_KEY', "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")
pipeline = joblib.load("/kaggle/input/frddtctmodel1/fraud_pipeline_final (1).pkl")

threshold = 0.0793  # reuse training threshold


In [46]:
result = predict_and_explain(pipeline, X_test_raw, threshold, shap_background=None,topk=4, llm_api_key=GROQ_API_KEY)
print("\nTEST performance at TRAIN-chosen threshold:")
print(classification_report(y_test, result['decisions']))
print("Test Confusion Matrix:\n",confusion_matrix(y_test, result['decisions']))




TEST performance at TRAIN-chosen threshold:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    134841
           1       0.92      1.00      0.96      2938

    accuracy                           1.00    137779
   macro avg       0.96      1.00      0.98    137779
weighted avg       1.00      1.00      1.00    137779

Test Confusion Matrix:
 [[134586    255]
 [     0   2938]]


In [None]:
!ls -lh /kaggle/working
