In [1]:
# Final RL Reserving Framework with Regime Breakdown, Stress Testing, and Regime-Aware Curriculum

import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from sklearn.preprocessing import MinMaxScaler
import torch
import random

# ========== Data Loading ==========
def load_data(file_path):
    """Read a loss CSV and return a cleaned, min-max scaled frame.

    The file must contain one of three known naming schemes for the
    incurred / paid / reserve columns; the first scheme whose three columns
    are all present is renamed to ``IncurredLosses``, ``PaidLosses`` and
    ``Reserves``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts.

    Returns:
        A DataFrame with ``AccidentYear``, ``DevelopmentYear`` and the three
        loss columns, rows containing NaN removed and the three loss columns
        rescaled with ``MinMaxScaler``.

    Raises:
        KeyError: If none of the known column naming schemes is present.
    """
    canonical = ("IncurredLosses", "PaidLosses", "Reserves")
    known_schemes = (
        ("IncurLoss_D", "CumPaidLoss_D", "PostedReserve97_D"),
        ("IncurLoss_h1", "CumPaidLoss_h1", "PostedReserve97_h1"),
        canonical,
    )

    raw = pd.read_csv(file_path)

    # Pick the first scheme (in declaration order) that is fully present.
    matched = next(
        (scheme for scheme in known_schemes if set(scheme).issubset(raw.columns)),
        None,
    )
    if matched is None:
        raise KeyError("No valid columns found.")

    renamed = raw.rename(columns=dict(zip(matched, canonical)))
    cleaned = renamed[["AccidentYear", "DevelopmentYear", *canonical]].dropna()

    # Only the loss amounts are rescaled; the year columns are left as read.
    loss_cols = list(canonical)
    cleaned[loss_cols] = MinMaxScaler().fit_transform(cleaned[loss_cols])
    return cleaned

# ========== Environment ==========
class ReservingEnv(gym.Env):
    """Episodic reserving environment that replays resampled loss rows.

    Each episode draws rows (with replacement) from the input frame, adds
    Gaussian noise to the three loss columns, derives ``Volatility``,
    ``Capital`` and ``MacroShock`` columns, and then walks through the rows in
    order.  At every step the agent picks one of seven reserve adjustments in
    ``[-0.1, 0.1]`` and receives a negative reward made of shortfall, a
    running shortfall percentile, a capital gap and a regulatory penalty.

    Apart from the violation memory, the next observation is simply the next
    sampled row; the action does not alter the underlying data.

    Args:
        df: Frame with ``Reserves``, ``PaidLosses``, ``IncurredLosses`` and
            ``DevelopmentYear`` columns.  It is copied, never modified.
        level: Difficulty level; raises sample size, noise, episode length
            and the macro-shock distribution.
        max_level: Highest level, used to normalise ``level``.
        seed: Seed for the environment's private random generator.  A random
            seed in ``[0, 9999]`` is drawn when omitted.
        fixed_shock: If given, every row uses this ``MacroShock`` value
            instead of a random draw.
    """

    def __init__(self, df, level=0, max_level=3, seed=None, fixed_shock=None):
        super().__init__()
        self.df_original = df.copy()
        self.level = level
        self.max_level = max_level
        self.seed_value = seed if seed is not None else random.randint(0, 9999)
        self.fixed_shock = fixed_shock
        # Private generator: keeps the environment reproducible without
        # re-seeding the global NumPy RNG that other libraries share.
        self._rng = np.random.default_rng(self.seed_value)
        self._setup_env()
        self.action_space = spaces.Discrete(7)
        self.action_mapping = np.linspace(-0.1, 0.1, 7)
        # Observations are not confined to [0, 1]: MacroShock is clipped to
        # [0.8, 2.0], DevelopmentYear is unscaled and the added noise can push
        # the loss columns outside the unit interval, so the space is unbounded.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(9,), dtype=np.float32
        )
        self.violation_memory = 0.0

    def _setup_env(self):
        """Build a fresh episode frame and reset the per-episode counters."""
        rng = self._rng
        frac = 0.4 + 0.6 * (self.level / self.max_level)
        n = int(len(self.df_original) * frac)

        # Bootstrap sample of row positions (with replacement).
        picks = rng.choice(len(self.df_original), size=n, replace=True)
        self.df = self.df_original.iloc[picks].reset_index(drop=True)

        noise = rng.normal(0, 0.01 + 0.01 * self.level, size=(n, 3))
        self.df[["Reserves", "PaidLosses", "IncurredLosses"]] += noise
        self.df["Volatility"] = self.df[["PaidLosses", "IncurredLosses"]].std(axis=1)
        self.df["Capital"] = 1 - (self.df["Reserves"] - self.df["IncurredLosses"]).abs()

        if self.fixed_shock is not None:
            self.df["MacroShock"] = self.fixed_shock
        else:
            # (mean, std) of the shock distribution per level.
            regime_map = {0: (1.0, 0.1), 1: (1.2, 0.2), 2: (1.5, 0.3), 3: (1.8, 0.4)}
            mean, std = regime_map.get(self.level, (1.0, 0.1))
            shock = rng.normal(loc=mean, scale=std, size=n)
            self.df["MacroShock"] = np.clip(shock, 0.8, 2.0)

        self.df["Volatility"] *= self.df["MacroShock"]
        self.max_steps = min(60 + 10 * self.level, len(self.df))
        self.index = 0
        self.shortfall_history = []
        self.violations = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: When given, re-seeds the environment's generator so the
                following episodes are reproducible from this seed.  When
                omitted, the generator continues its stream, so successive
                episodes differ while the overall sequence stays determined
                by the last seed.
            options: Unused.

        Returns:
            ``(observation, info)`` with an empty info dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            self.seed_value = seed
            self._rng = np.random.default_rng(seed)
        self._setup_env()
        self.violation_memory = 0.0
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the 9-element observation for the current row.

        Order: Reserves, PaidLosses, IncurredLosses, DevelopmentYear,
        Volatility, Capital, violation memory, normalised level, MacroShock.
        """
        # After the final step ``index`` may equal ``len(df)`` (when the
        # episode spans the whole frame); reuse the last row for the terminal
        # observation instead of raising IndexError.
        position = min(self.index, len(self.df) - 1)
        row = self.df.iloc[position]
        return np.array([
            row["Reserves"], row["PaidLosses"], row["IncurredLosses"],
            row["DevelopmentYear"], row["Volatility"], row["Capital"],
            self.violation_memory, self.level / self.max_level, row["MacroShock"]
        ], dtype=np.float32)

    def step(self, action):
        """Apply a reserve adjustment to the current row and advance one row.

        Args:
            action: Index into ``action_mapping`` (0..6).

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``terminated`` becomes True once ``max_steps`` rows were consumed,
            ``truncated`` is always False and ``info`` is empty.
        """
        row = self.df.iloc[self.index]
        reserve_adj = self.action_mapping[action]
        reserve = np.clip(row["Reserves"] + reserve_adj, 0, 1)
        incurred = row["IncurredLosses"]
        volatility = row["Volatility"]

        # Minimum reserve grows with volatility; a small buffer above it is
        # penalised more mildly than an outright breach.
        min_reg = 0.4 + 0.2 * volatility
        buffer_zone = min_reg + 0.05
        shortfall = max(0, incurred - reserve)
        self.shortfall_history.append(shortfall)

        # NOTE(review): despite the name this is an empirical percentile of
        # the episode's shortfalls, not a tail mean.  The rank is clamped so
        # an extreme ``fixed_shock`` cannot push it outside [0, 100].
        cvar_threshold = int(np.clip(90 + 5 * (volatility + 0.1), 0, 100))
        cvar = np.percentile(self.shortfall_history, cvar_threshold)

        breached = reserve < min_reg
        reg_penalty = 0
        if breached:
            reg_penalty = (min_reg - reserve) * 100.0
            self.violations += 1
        elif reserve < buffer_zone:
            reg_penalty = (buffer_zone - reserve) * 10.0

        norm_shortfall = np.clip(shortfall / 0.2, 0, 1)
        norm_cvar = np.clip(cvar / 0.2, 0, 1)
        norm_capital = 1 - row["Capital"]

        reward = -(
            2.0 * norm_shortfall + 4.0 * norm_cvar + 1.0 * norm_capital + reg_penalty
        )

        # Exponentially decayed indicator of recent breaches (fed back via obs).
        self.violation_memory = 0.95 * self.violation_memory + 0.05 * (1 if breached else 0)
        self.index += 1
        terminated = self.index >= self.max_steps
        return self._get_obs(), float(reward), terminated, False, {}

# ========== Evaluation ==========
def evaluate_agent(env, model, episodes=5):
    """Roll out a policy and summarise reward, shortfall and violations per regime.

    The per-step metrics are computed from the reserve *after* the agent's
    adjustment (row reserve plus ``env.action_mapping[action]``, clipped to
    ``[0, 1]``), so they reflect the policy rather than the raw data.

    Args:
        env: Environment exposing ``reset``, ``step``, ``df``, ``index`` and
            ``action_mapping`` (as ``ReservingEnv`` does).
        model: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``.
        episodes: Number of episodes to roll out.

    Returns:
        DataFrame with one row per regime label and the mean of ``Reward``
        (episode return), ``Shortfall`` (mean step shortfall), ``CVaR95``
        (mean of the worst 5% of step shortfalls, at least one step) and
        ``ViolationRate`` (share of steps below the minimum reserve).
    """
    rewards, shortfalls, cvars, violations, regime_stats = [], [], [], [], []
    for _ in range(episodes):
        obs, _ = env.reset()
        done = False
        total = 0
        sf_list = []
        v_count = 0
        # Regime label comes from the MacroShock of the first observation.
        regime = "High" if obs[-1] > 1.5 else "Moderate" if obs[-1] > 1.0 else "Low"

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            action = int(action)
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # ``env.index`` was already advanced, so the row just acted on is
            # one position back.
            row = env.df.iloc[env.index - 1]
            reserve = np.clip(row["Reserves"] + env.action_mapping[action], 0, 1)
            sf_list.append(max(0, row["IncurredLosses"] - reserve))
            if reserve < (0.4 + 0.2 * row["Volatility"]):
                v_count += 1
            total += reward

        step_shortfalls = np.sort(np.asarray(sf_list, dtype=float))
        # Worst 5% of steps, but never fewer than one so the mean is defined.
        tail_size = max(1, int(np.ceil(step_shortfalls.size / 20)))

        rewards.append(total)
        shortfalls.append(step_shortfalls.mean())
        cvars.append(step_shortfalls[-tail_size:].mean())
        violations.append(v_count / len(sf_list))
        regime_stats.append(regime)

    df = pd.DataFrame({
        "Reward": rewards, "Shortfall": shortfalls, "CVaR95": cvars,
        "ViolationRate": violations, "Regime": regime_stats
    })
    return df.groupby("Regime").mean().reset_index()

# ========== Training ==========
def _prepend_columns(frame, columns):
    """Insert ``columns`` (name -> value) at the front of ``frame``, in order."""
    for name, value in reversed(list(columns.items())):
        frame.insert(0, name, value)
    return frame


def train_curriculum(df, seeds, lob_name, max_level=3, timesteps_per_level=15000,
                     stress_shocks=(0.8, 1.0, 1.5, 2.0), warm_start=False):
    """Train and evaluate PPO agents for every seed and difficulty level.

    For each ``(seed, level)`` pair a PPO agent is trained on a
    ``ReservingEnv``, evaluated on a matching environment, and then
    stress-tested under each fixed macro shock.  All result rows are
    concatenated and written to ``final_regime_results_{lob_name}.csv``.

    Args:
        df: Prepared loss frame passed to ``ReservingEnv``.
        seeds: Iterable of integer seeds; must not be empty.
        lob_name: Line-of-business label used in logs, rows and the file name.
        max_level: Highest difficulty level; levels ``0..max_level`` are run.
        timesteps_per_level: PPO training timesteps spent on each level.
        stress_shocks: Fixed ``MacroShock`` values used for stress tests.
        warm_start: When False (default) a fresh agent is trained from
            scratch at every level.  When True the agent trained on the
            previous level of the same seed is carried over and trained
            further, so the levels form an actual curriculum.

    Returns:
        The concatenated results DataFrame (also written to CSV).

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError("seeds must contain at least one seed.")

    all_logs = []
    for seed in seeds:
        model = None
        for level in range(max_level + 1):
            print(f"Training {lob_name} | Seed {seed} | Level {level}...")
            # Bind the loop variables as defaults so the factory does not
            # depend on late-binding closure semantics.
            env = DummyVecEnv([
                lambda level=level, seed=seed: ReservingEnv(
                    df, level=level, max_level=max_level, seed=seed
                )
            ])
            if warm_start and model is not None:
                # Continue training the previous level's policy on the new level.
                model.set_env(env)
                model.learn(total_timesteps=timesteps_per_level, reset_num_timesteps=False)
            else:
                model = PPO("MlpPolicy", env, verbose=0, seed=seed)
                model.learn(total_timesteps=timesteps_per_level)

            # ``max_level`` must match training, otherwise the sample fraction
            # and the normalised-level observation differ from what the agent saw.
            eval_env = ReservingEnv(df, level=level, max_level=max_level, seed=seed)
            level_results = evaluate_agent(eval_env, model, episodes=5)
            all_logs.append(_prepend_columns(
                level_results, {"LOB": lob_name, "Seed": seed, "Level": level}
            ))

            # Stress test: same agent, every row forced to one macro shock.
            for shock in stress_shocks:
                stress_env = ReservingEnv(
                    df, level=level, max_level=max_level, seed=seed, fixed_shock=shock
                )
                stress_stats = evaluate_agent(stress_env, model, episodes=3)
                all_logs.append(_prepend_columns(stress_stats, {
                    "LOB": lob_name, "Seed": seed, "Level": level,
                    "StressTest": True, "MacroShock": shock,
                }))

    final_df = pd.concat(all_logs, ignore_index=True)
    final_df.to_csv(f"final_regime_results_{lob_name}.csv", index=False)
    return final_df

# ========== Entry Point ==========
if __name__ == "__main__":
    # Load both lines of business up front, then run the full
    # train / evaluate / stress-test sweep for each with the same seeds.
    df_wk, df_oth = (load_data(path) for path in ("wkcomp_pos.csv", "othliab_pos.csv"))

    for frame, lob in ((df_wk, "WorkersComp"), (df_oth, "OtherLiability")):
        train_curriculum(frame, seeds=[0, 42], lob_name=lob)


Training WorkersComp | Seed 0 | Level 0...
Training WorkersComp | Seed 0 | Level 1...
Training WorkersComp | Seed 0 | Level 2...
Training WorkersComp | Seed 0 | Level 3...
Training WorkersComp | Seed 42 | Level 0...
Training WorkersComp | Seed 42 | Level 1...
Training WorkersComp | Seed 42 | Level 2...
Training WorkersComp | Seed 42 | Level 3...
Training OtherLiability | Seed 0 | Level 0...
Training OtherLiability | Seed 0 | Level 1...
Training OtherLiability | Seed 0 | Level 2...
Training OtherLiability | Seed 0 | Level 3...
Training OtherLiability | Seed 42 | Level 0...
Training OtherLiability | Seed 42 | Level 1...
Training OtherLiability | Seed 42 | Level 2...
Training OtherLiability | Seed 42 | Level 3...
