# RL Agents for Fraud Detection (Hybrid: Embeddings + Structured Data)

This notebook trains RL agents (PPO, A2C, DQN) on a hybrid dataset consisting of:
1.  **Embeddings**: Attention-pooled embeddings from a DistilBERT model.
2.  **Structured Data**: Original features from the CreditCard dataset (Time, V1-V28, Amount).

The combined data goes through a preprocessing pipeline:
-   **Scaling**: StandardScaler applied to the concatenated vector.
-   **PCA**: Dimensionality reduction on the scaled combined vector.
-   **CTGAN**: Data augmentation that adds synthetic fraud samples to the training set to reduce class imbalance.

In [None]:
# Install required packages
!pip install gymnasium numpy pandas torch stable-baselines3 scikit-learn matplotlib seaborn ctgan

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import torch
import os

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed
# A single shared constant seeds NumPy's global RNG and PyTorch, so code that
# draws from those generators is repeatable between runs. SEED is also passed
# as `random_state` to the train/test split further down.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## 1. Data Loading and Alignment

In [None]:
# Load Embeddings
# NOTE(security): unpickling can execute arbitrary code; only load
# "attention_pooled_embeddings.pkl" when it comes from a trusted source.
print("Loading embeddings...")
pkl_data = pd.read_pickle("attention_pooled_embeddings.pkl")
embeddings = pkl_data['embeddings']
labels = np.array(pkl_data['labels'])
uids = pkl_data['uids']

print(f"Embeddings shape: {embeddings.shape}")
print(f"Labels shape: {labels.shape}")
print(f"UIDs shape: {len(uids)}")

# Every embedding row needs exactly one label and one UID. Fail here with a
# readable message rather than with a shape error several steps later.
if not (len(embeddings) == len(labels) == len(uids)):
    raise ValueError(
        "Embeddings, labels and UIDs must have the same length, got "
        f"{len(embeddings)}, {len(labels)} and {len(uids)}"
    )

# Load Structured Data
print("\nLoading structured data...")
creditcard_df = pd.read_csv("ablation_study/creditcard.csv")

# Filter structured data to match the UIDs in the embeddings file.
# `.loc` looks the UIDs up as index labels of the freshly loaded CSV
# (presumably the original row numbers -- TODO confirm against the script
# that produced the pickle). Rows come back in the order given by `uids`.
structured_data = creditcard_df.loc[uids].copy()

# Sanity check: the labels stored next to the embeddings should agree with the
# `Class` column of the selected rows. A disagreement suggests the two sources
# are misaligned; it is reported but not fatal because the pickle labels are
# the ones used as ground truth below.
csv_labels = structured_data['Class'].to_numpy()
pkl_labels = np.ravel(labels)
if csv_labels.shape == pkl_labels.shape:
    n_label_mismatch = int(np.count_nonzero(csv_labels != pkl_labels))
    if n_label_mismatch:
        print(
            f"WARNING: {n_label_mismatch} pickle labels differ from the 'Class' "
            "column of the selected rows; check the UID alignment."
        )

# Drop Class column to get features
structured_features = structured_data.drop('Class', axis=1).values
print(f"Structured features shape: {structured_features.shape}")

# Concatenate Embeddings and Structured Data (column-wise, one row per sample)
# We use the labels from the pickle file as the ground truth
X = np.hstack([embeddings, structured_features])
y = labels

print(f"\nCombined Data Shape: {X.shape}")

## 2. Data Preprocessing Pipeline
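1. Split into stratified train/test sets (80/20)
2. Scale with StandardScaler (fitted on the training split only)
3. Reduce dimensionality with PCA (retaining 99% of the variance)
4. Augment the training split with CTGAN-generated fraud samples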

In [None]:
# 1. Split
# Stratified on y so both partitions receive the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

# 2. Scale
# The scaler's statistics come from the training split only; the test split is
# transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. PCA
# A fractional n_components keeps as many components as needed to explain
# 99% of the variance, similar to the original notebook.
pca = PCA(whiten=True, n_components=0.99)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

n_components_kept = X_train_pca.shape[1]
print(f"\nPCA reduced dimensions from {X_train.shape[1]} to {n_components_kept}")

# 4. CTGAN Augmentation
from ctgan import CTGAN

# CTGAN is fitted on the fraud rows of the training split only, expressed in
# PCA space with one column per retained component.
pc_columns = [f'pc{i}' for i in range(n_components_kept)]
train_df_pca = pd.DataFrame(X_train_pca, columns=pc_columns)
train_df_pca['label'] = y_train

is_fraud = train_df_pca['label'] == 1
fraud_df = train_df_pca.loc[is_fraud].drop(columns='label')
print(f"Fraud samples for CTGAN: {len(fraud_df)}")

# Train CTGAN
ctgan = CTGAN(epochs=200, batch_size=64, pac=1, verbose=True)
ctgan.fit(fraud_df)

# Generate one synthetic row per real fraud row, i.e. double the fraud samples
n_synthetic = len(fraud_df)
synthetic_fraud = ctgan.sample(n_synthetic)
X_synthetic = synthetic_fraud.to_numpy().astype(np.float32)
y_synthetic = np.full(n_synthetic, 1, dtype=np.int64)

# Augment Training Set: synthetic rows are appended after the real ones
X_train_aug = np.concatenate([X_train_pca, X_synthetic], axis=0)
y_train_aug = np.concatenate([y_train, y_synthetic])

print(f"Augmented Train Shape: {X_train_aug.shape}")

## 3. RL Environment

In [None]:
class FraudDetectionEnv(gym.Env):
    """
    Classification-as-RL environment that follows the gymnasium interface.

    One episode is a single pass over all samples in shuffled order. Each
    observation is one feature vector; the agent answers with action 0
    (not fraud) or 1 (fraud) and is rewarded according to the confusion-matrix
    outcome (TP / FP / FN / TN).

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        labels: array-like with one 0/1 label per sample.
        reward_config: optional mapping with any of the keys 'TP', 'FP', 'FN',
            'TN'. Missing keys fall back to the defaults below.

    Raises:
        ValueError: if `features` is not 2-D, is empty, or its length differs
            from the length of `labels`.
    """

    # Rewards used for every outcome the caller does not override.
    _DEFAULT_REWARDS = {
        'TP': 10.0,
        'FP': -5.0,
        'FN': -20.0,
        'TN': 1.0,
    }

    # Reward-table key for each (action, true_label) pair.
    _OUTCOME_KEYS = {
        (1, 1): 'TP',
        (1, 0): 'FP',
        (0, 1): 'FN',
        (0, 0): 'TN',
    }

    def __init__(self, features, labels, reward_config=None):
        super().__init__()

        self.features = np.asarray(features, dtype=np.float32)
        self.labels = np.asarray(labels, dtype=np.int64)

        if self.features.ndim != 2:
            raise ValueError(
                f"features must be 2-D (n_samples, n_features), got ndim={self.features.ndim}"
            )
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels differ in length: {len(self.features)} vs {len(self.labels)}"
            )
        if len(self.features) == 0:
            raise ValueError("features must contain at least one sample")

        self.n_samples = len(self.features)
        self.input_dim = self.features.shape[1]

        # Define action and observation space
        # Action: 0 (Not Fraud), 1 (Fraud)
        self.action_space = spaces.Discrete(2)

        # Observation: Feature vector
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.input_dim,),
            dtype=np.float32
        )

        # Caller-supplied rewards override the defaults key by key, so a
        # partial configuration no longer fails with a KeyError in step().
        self.reward_config = {**self._DEFAULT_REWARDS, **(reward_config or {})}

        self.current_step = 0
        self.indices = np.arange(self.n_samples)
        # Becomes True once reset() has been given an explicit seed; from then
        # on the environment's own RNG drives the shuffling.
        self._use_env_rng = False

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(first_observation, info)``.

        Without a seed the sample order is shuffled in place with NumPy's
        global RNG. When ``seed`` is given, the order is rebuilt from scratch
        with the environment's own seeded RNG, so the same seed always yields
        the same episode; later unseeded resets keep using that RNG.
        """
        super().reset(seed=seed)
        self.current_step = 0

        if seed is not None:
            self._use_env_rng = True
            # Start from the canonical order so the seed alone fixes the result.
            self.indices = np.arange(self.n_samples)

        rng = self.np_random if self._use_env_rng else np.random
        rng.shuffle(self.indices)
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the feature vector of the sample at the current position."""
        idx = self.indices[self.current_step]
        return self.features[idx]

    def step(self, action):
        """Score ``action`` against the current sample and move to the next one.

        Returns:
            ``(next_obs, reward, terminated, truncated, info)``. ``terminated``
            is True after the last sample, in which case ``next_obs`` is an
            all-zero placeholder. ``truncated`` is always False. ``info``
            holds the integer 'true_label' and 'pred_label' of this step.
        """
        idx = self.indices[self.current_step]
        true_label = int(self.labels[idx])
        # Accept Python ints as well as NumPy scalars / size-1 arrays.
        pred_label = int(np.asarray(action).item())

        # Pairs outside the 2x2 confusion matrix earn no reward.
        outcome = self._OUTCOME_KEYS.get((pred_label, true_label))
        reward = float(self.reward_config[outcome]) if outcome is not None else 0.0

        self.current_step += 1
        terminated = self.current_step >= self.n_samples
        truncated = False

        info = {
            'true_label': true_label,
            'pred_label': pred_label
        }

        if terminated:
            next_obs = np.zeros(self.input_dim, dtype=np.float32)
        else:
            next_obs = self._get_obs()

        return next_obs, reward, terminated, truncated, info

## 4. Training and Evaluation

In [None]:
# Define a learning rate schedule
def linear_schedule(initial_value):
    """Build a schedule proportional to the remaining training progress.

    The returned callable takes ``progress_remaining`` and returns
    ``progress_remaining * initial_value``.
    """
    def scaled_by_progress(progress_remaining):
        current_value = progress_remaining * initial_value
        return current_value

    return scaled_by_progress

def train_evaluate_agent(agent_name, X_train, y_train, X_test, y_test, total_timesteps=10000):
    """Train one RL agent on the training data and report its test metrics.

    The agent is trained in a `FraudDetectionEnv` built from the training data,
    then run deterministically for one full episode over the test data.
    Accuracy, precision, recall and F1 are printed and the confusion matrix is
    shown as a heatmap.

    Args:
        agent_name: 'PPO', 'A2C' or 'DQN'.
        X_train, y_train: features and labels the agent is trained on.
        X_test, y_test: features and labels used for evaluation.
        total_timesteps: number of environment steps passed to `model.learn`.

    Raises:
        ValueError: if `agent_name` is not one of the supported agents.
    """
    # Validate first: a misspelled name used to build an environment and then
    # return silently without training or evaluating anything.
    supported_agents = ('PPO', 'A2C', 'DQN')
    if agent_name not in supported_agents:
        raise ValueError(
            f"Unsupported agent_name {agent_name!r}; expected one of {supported_agents}"
        )

    print(f"\nTraining {agent_name}...")

    # Create env
    env = DummyVecEnv([lambda: FraudDetectionEnv(X_train, y_train)])

    if agent_name == 'PPO':
        # PPO was not in the reference notebook, using default parameters
        model = PPO('MlpPolicy', env, verbose=0)
    elif agent_name == 'A2C':
        # Parameters from RL2.0_ATT.ipynb
        model = A2C(
            "MlpPolicy",
            env,
            learning_rate=1e-4,
            gamma=0.99,
            n_steps=5,
            ent_coef=0.01,
            vf_coef=0.5,
            max_grad_norm=0.5,
            verbose=0,
            device="auto"
        )
    else:  # 'DQN'
        # Parameters from RL2.0_ATT.ipynb
        model = DQN(
            "MlpPolicy",
            env,
            learning_rate=linear_schedule(1e-4),
            buffer_size=100000,
            learning_starts=1000,
            batch_size=512,
            gamma=0.99,
            train_freq=1,
            gradient_steps=1,
            target_update_interval=500,
            exploration_fraction=0.1,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.05,
            max_grad_norm=10,
            verbose=0,
            device="auto"
        )

    model.learn(total_timesteps=total_timesteps)

    # Evaluate: one deterministic pass over every test sample
    print(f"Evaluating {agent_name}...")
    test_env = FraudDetectionEnv(X_test, y_test)
    obs, _ = test_env.reset()

    y_true = []
    y_pred = []

    terminated = truncated = False
    while not (terminated or truncated):
        action, _ = model.predict(obs, deterministic=True)
        obs, _reward, terminated, truncated, info = test_env.step(action)

        # Store plain ints; `predict` hands back a NumPy value for the action.
        y_true.append(int(info['true_label']))
        y_pred.append(int(action))

    # Metrics
    # zero_division=0 reports 0.0 without a warning when a score is undefined,
    # e.g. when the agent never predicts fraud.
    print(f"--- {agent_name} Results ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, zero_division=0):.4f}")

    # Fixing the label order keeps the matrix 2x2 even if only one class
    # shows up among the true and predicted labels.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{agent_name} Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

# Train Agents
# Each agent is trained on the same augmented training set and evaluated on
# the same PCA-transformed test split, in the order PPO, A2C, DQN.
# Note: PPO parameters are default as it was not present in the reference notebook.
for agent_name in ('PPO', 'A2C', 'DQN'):
    train_evaluate_agent(agent_name, X_train_aug, y_train_aug, X_test_pca, y_test)