In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import gymnasium as gym
from copy import deepcopy
import py_vollib_vectorized

class DQN(nn.Module):
    """Fully connected Q-network with two hidden ReLU layers of width 128.

    Takes a batch of state vectors of size ``state_dim`` and produces one
    Q-value per action (``action_dim`` outputs per row).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_width = 128
        # Layers are created in input -> output order and stored under
        # ``self.net`` so parameter names stay ``net.<i>.weight`` / ``net.<i>.bias``.
        layers = [
            nn.Linear(state_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, action_dim),  # one Q-value per action
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values the network assigns to ``x``."""
        q_values = self.net(x)
        return q_values

In [84]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a target network and a uniform replay buffer.

    The online network (``self.model``) is trained with an MSE loss against
    one-step targets computed from a separate target network
    (``self.target_model``), which only changes when
    ``update_target_network`` is called.

    Parameters:
        state_dim (int): Size of the state vector fed to the network.
        action_dim (int): Number of discrete actions (network outputs).
        lr (float): Adam learning rate.
        gamma (float): Discount factor used in the one-step target.
        batch_size (int): Number of transitions sampled per training step.
        memory_size (int): Maximum number of transitions kept in the buffer;
            the oldest transition is dropped once this is exceeded.
    """

    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99,
                 batch_size=32, memory_size=10_000):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DQN(state_dim, action_dim).to(self.device)
        self.target_model = DQN(state_dim, action_dim).to(self.device)
        # The target network starts as an exact copy and is never trained directly.
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = 1.0  # start with full exploration
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = []  # replay buffer, oldest transition first
        self.memory_size = memory_size
        self.batch_size = batch_size

    def select_action(self, state, valid_actions):
        """Pick an action epsilon-greedily, restricted to ``valid_actions``.

        Parameters:
            state (array-like): Current state vector.
            valid_actions (list[int]): Indices of the actions allowed right now.

        Returns:
            int: The chosen action index (always one of ``valid_actions``).
        """
        if random.random() < self.epsilon:
            return int(random.choice(valid_actions))  # explore: random valid action

        state_tensor = torch.tensor(
            np.asarray(state, dtype=np.float32), dtype=torch.float32
        ).unsqueeze(0).to(self.device)
        # Pure inference: no autograd graph is needed for action selection.
        with torch.no_grad():
            q_values = self.model(state_tensor).cpu().numpy().reshape(-1)

        # Invalid actions get -inf so argmax can never select them.
        allowed = list(valid_actions)
        masked_q_values = np.full_like(q_values, -np.inf)
        masked_q_values[allowed] = q_values[allowed]

        return int(np.argmax(masked_q_values))

    def store_transition(self, transition):
        """Append a ``(s, a, r, s', done)`` tuple, evicting the oldest when full."""
        self.memory.append(transition)
        if len(self.memory) > self.memory_size:
            self.memory.pop(0)

    def train(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Epsilon is decayed once per completed training step (not per episode).

        NOTE(review): the bootstrap target takes the max over *all* actions of
        the next state; actions that are invalid there are not masked because
        the stored transition does not carry the next state's valid-action set.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack through NumPy first: building a tensor straight from a tuple of
        # ndarrays is the slow path PyTorch warns about.
        states = torch.tensor(np.asarray(states, dtype=np.float32), dtype=torch.float32).to(self.device)
        actions = torch.tensor(np.asarray(actions, dtype=np.int64), dtype=torch.long).to(self.device)
        rewards = torch.tensor(np.asarray(rewards, dtype=np.float32), dtype=torch.float32).to(self.device)
        next_states = torch.tensor(np.asarray(next_states, dtype=np.float32), dtype=torch.float32).to(self.device)
        dones = torch.tensor(np.asarray(dones, dtype=np.float32), dtype=torch.float32).to(self.device)

        # Q(s, a) of the online network for the actions that were actually taken.
        q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # One-step target from the target network; no bootstrap when done == 1.
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = nn.MSELoss()(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration, never going below epsilon_min.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

In [71]:
def apply_quadratic_volatility_model(strikes, spot, atm_vol, slope, quadratic_term, texp_years,
                                     min_vol=0.05, max_vol=0.4):
    """
    Evaluate a quadratic-in-log-moneyness volatility smile and clip the result.

    The fitted volatility is
    ``atm_vol + slope * k + quadratic_term * k**2`` with
    ``k = log(strikes) - log(spot)``, clipped to ``[min_vol, max_vol]``.
    Inputs broadcast against each other the way NumPy/pandas arithmetic does.

    Parameters:
        strikes (array-like or float): Strike price(s).
        spot (array-like or float): Spot price(s).
        atm_vol (array-like or float): At-the-money volatility.
        slope (array-like or float): Coefficient of the linear term.
        quadratic_term (array-like or float): Coefficient of the quadratic term.
        texp_years (array-like or float): Time to expiration in years.
            Accepted for interface compatibility; the current formula does
            not use it.
        min_vol (float): Lower clip bound for the returned volatilities.
        max_vol (float): Upper clip bound for the returned volatilities.

    Returns:
        array-like: Clipped fitted volatilities.
    """
    log_moneyness = np.log(strikes) - np.log(spot)
    fitted_vols = atm_vol + slope * log_moneyness + quadratic_term * log_moneyness**2
    return np.clip(fitted_vols, min_vol, max_vol)

In [55]:
# pandas is only imported in a later cell of this notebook; import it here so
# this cell also runs on a fresh kernel (Restart -> Run All).
import pandas as pd

# Load the volatility-surface table and parse the 'minute' column into datetimes.
df = pd.read_csv("./algo_data/vol_surfaces2.csv")
df['minute'] = pd.to_datetime(df['minute'])
#for each row find 16:17:00 and compute years to maturity where maturity is 16:17:00 for each row

def get_years_to_maturity(row):
    """
    Return the time from ``row['minute']`` to 16:17:00 of the same day, in years.

    Parameters:
        row (mapping): Anything indexable by ``'minute'`` that yields a
            ``pandas.Timestamp`` (tz-aware or naive), e.g. a DataFrame row.

    Returns:
        float: Seconds until 16:17:00 divided by the seconds in a 365.25-day
        year. Rows stamped after 16:17:00 return 0.0.
    """
    now = row['minute']
    # Set the wall-clock time directly instead of adding a Timedelta to
    # midnight: on DST-change days midnight + 16h17m is not 16:17 local time.
    maturity = now.replace(hour=16, minute=17, second=0, microsecond=0, nanosecond=0)
    # total_seconds() is signed; `.seconds` is only the seconds *component*
    # and wraps to almost a full day once `now` is past the maturity time.
    seconds_left = (maturity - now).total_seconds()
    return max(seconds_left, 0.0) / (365.25 * 24 * 60 * 60)

# Time to the 16:17 maturity, computed row by row.
df['years_to_maturity'] = df.apply(get_years_to_maturity, axis=1)

# Blank the whole set of surface-fit columns on rows whose implied spot or ATM
# vol is at or below its floor. The checks run in order, so the second one
# sees the NaNs written by the first.
_fit_columns = ['implied_spot', 'atm_vol', 'slope', 'quadratic_term', 'scaled_slope', 'scaled_quadratic']
for _column, _floor in (('implied_spot', .07), ('atm_vol', .03)):
    df.loc[df[_column] <= _floor, _fit_columns] = np.nan
del _fit_columns, _column, _floor

# Replace the blanked values with the last valid observation above them.
df = df.ffill().infer_objects(copy=False)

  df['minute'] = pd.to_datetime(df['minute'])


In [82]:
import gymnasium as gym  # ✅ Use gymnasium instead of gym
import numpy as np
import pandas as pd
from gymnasium import spaces

class SimEnv(gym.Env):
    """
    Custom options-trading environment for reinforcement learning.

    Each episode picks a random day and a random start minute between 09:30
    and 12:45, and lets the agent trade a single straddle struck at the spot
    observed when the position is opened.

    Action encoding (the one ``step`` and ``valid_actions`` implement):
        0 = open a position, 1 = close the position, 2 = hold / wait one step.

    The observation is an 11-element float32 vector, see ``_get_state``.

    The frame is expected to provide the columns ``date``, ``minute``,
    ``implied_spot``, ``atm_vol``, ``slope``, ``quadratic_term``,
    ``scaled_slope``, ``scaled_quadratic`` and ``years_to_maturity``.

    NOTE(review): time is advanced by adding minutes to the row label
    (``global_index``), which assumes the frame has a contiguous integer
    index with one row per minute -- confirm against the data.
    """

    ACTION_OPEN = 0
    ACTION_CLOSE = 1
    ACTION_HOLD = 2

    def __init__(self, env_config):
        """
        Parameters:
            env_config (dict or pandas.DataFrame): Either a config mapping
                holding the market data under the key ``"df"``, or the
                DataFrame itself.

        Raises:
            ValueError: If no DataFrame is provided.
        """
        super().__init__()

        # Accept both the config-dict form and a bare DataFrame.
        if isinstance(env_config, pd.DataFrame):
            frame = env_config
        elif env_config is not None:
            frame = env_config.get("df")
        else:
            frame = None
        if frame is None:
            raise ValueError("Error: `df` must be provided in env_config!")

        # Market data
        self.df = frame
        self.df_today = None  # per-episode copy of the selected day's rows

        # Index tracking
        self.global_index = 0  # row label in self.df / self.df_today
        self.daily_index = 0   # position of that row inside self.df_today
        self.start_index = 0   # row label at which the episode started
        self.max_steps = 62    # max minutes per episode

        # Trading variables
        self.position = 0
        self.entry_price = 0
        self.position_open_time = None

        # Capital & PnL
        self.capital = 100
        self.pnl = 0
        self.position_value = 0

        # Episode state
        self.done = False
        self.current_row = None
        self.last_pnl = 0

        # Action & observation space
        self.action_space = spaces.Discrete(3)  # 0: Open, 1: Close, 2: Hold
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(11,), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """
        Start a new episode on a randomly chosen day and start minute.

        Parameters:
            seed (int, optional): Seeds the environment's random generator
                (``self.np_random``) used to pick the day and start time.
            options (dict, optional): Accepted for Gymnasium API
                compatibility; not used.

        Returns:
            tuple: ``(observation, info)`` where ``info["action_mask"]`` is
            the 0/1 mask of currently valid actions.
        """
        super().reset(seed=seed)

        self.global_index = self.pick_episode_start()
        self.daily_index = self.df_today.index.get_loc(self.global_index)
        self.start_index = self.global_index
        self.done = False
        self.position = 0
        self.pnl = 0
        self.capital = 100

        # Price the at-the-money straddle for every row of the day up front.
        self.df_today["daily_straddle_prices"] = self.compute_daily_atm_straddle_prices()
        self.df_today["open_straddle_pnl"] = 0

        obs = self._get_state()
        info = {"action_mask": self.compute_action_mask()}
        return obs, info

    def compute_action_mask(self):
        """Return a 0/1 array over the action space; 1 marks a valid action.

        Derived from ``valid_actions`` so the mask and ``step`` always agree.
        """
        action_mask = np.zeros(self.action_space.n, dtype=int)
        action_mask[self.valid_actions()] = 1
        return action_mask

    def _realize_pnl_change(self):
        """Update ``self.pnl`` to the position's current PnL and return the change."""
        current_pnl = self.df_today["open_straddle_pnl"].loc[self.global_index]
        change = current_pnl - self.pnl
        self.pnl = current_pnl
        return change

    def step(self, action):
        """
        Execute ``action`` and advance the episode.

        An invalid action ends the episode immediately with reward -1000 and
        ``truncated=True``. Opening advances time by 60 minutes, holding by 1
        minute; closing ends the episode. The reward of a close/hold with an
        open position is the change in position PnL since the last step.

        Returns:
            tuple: ``(observation, reward, done, truncated, info)``.
        """
        reward = 0.0

        if action not in self.valid_actions():
            reward = -1000  # strong penalty for an invalid action
            self.done = True
            return self._get_state(), reward, self.done, True, {"action_mask": self.compute_action_mask()}

        if action == self.ACTION_OPEN and self.position == 0:
            self.open_position()
            self.update_time_step(60)

        elif action == self.ACTION_CLOSE and self.position > 0:
            reward = self._realize_pnl_change()
            self.position = 0
            self.done = True

        elif action == self.ACTION_HOLD:
            if self.position > 0:
                reward = self._realize_pnl_change()
            self.update_time_step(1)

        # End the episode once the step budget is used up ...
        if self.global_index - self.start_index >= self.max_steps:
            self.done = True
        # ... or when the day's data is exhausted and time can no longer advance.
        if self.global_index >= self.df_today.index.max():
            self.done = True

        return self._get_state(), reward, self.done, False, {"action_mask": self.compute_action_mask()}

    def pick_random_day(self, burn_days=5):
        """
        Pick a random trading day, skipping the first ``burn_days`` days and
        the last day of the data set.

        Raises:
            ValueError: If no day is left after the exclusions.
        """
        all_days = sorted(self.df['date'].unique())
        candidates = all_days[burn_days:-1]
        if not candidates:
            raise ValueError("Not enough distinct days in `df` to pick an episode day.")
        return candidates[int(self.np_random.integers(len(candidates)))]

    @staticmethod
    def _times_of_day(frame):
        """Return the time-of-day of every ``minute`` value in ``frame``."""
        return frame['minute'].apply(lambda stamp: stamp.time())

    def pick_random_timestep(self, df):
        """
        Pick a random time of day between 09:30 and 12:45 (inclusive) among
        the times present in ``df`` (falls back to ``self.df`` when ``df`` is
        None).

        Raises:
            ValueError: If ``df`` has no row in that window.
        """
        frame = self.df if df is None else df
        earliest_time = pd.Timestamp('9:30').time()
        latest_time = pd.Timestamp('12:45').time()
        in_window = sorted(
            {t for t in self._times_of_day(frame) if earliest_time <= t <= latest_time}
        )
        if not in_window:
            raise ValueError("No rows between 09:30 and 12:45 to start an episode from.")
        return in_window[int(self.np_random.integers(len(in_window)))]

    def pick_episode_start(self):
        """
        Select the episode's day and start minute.

        Stores an independent copy of the day's rows in ``self.df_today`` and
        returns the row label of the first row at the chosen start time.
        """
        start_day = self.pick_random_day()
        # Copy so the per-episode columns added later never touch self.df.
        self.df_today = self.df[self.df['date'] == start_day].copy()
        # Sampling from today's own times guarantees the lookup below finds a row.
        start_time = self.pick_random_timestep(self.df_today)
        at_start = (self._times_of_day(self.df_today) == start_time).to_numpy()
        return self.df_today.index[at_start][0]

    def compute_spot_vols(self, strike):
        """
        Evaluate the quadratic volatility model at ``strike`` for every row of
        today's data.

        Parameters:
            strike (float or array-like): Strike(s) to evaluate the smile at.

        Returns:
            array-like: Fitted volatilities, as returned by
            ``apply_quadratic_volatility_model``.
        """
        today = self.df_today
        return apply_quadratic_volatility_model(
            strike,
            today['implied_spot'],
            today['atm_vol'],
            today['slope'],
            today['quadratic_term'],
            today['years_to_maturity'],
        )

    def compute_daily_atm_straddle_prices(self):
        """
        Price, for every row of today's data, the straddle struck at that
        row's own spot using that row's ATM volatility.

        Returns:
            The call price plus the put price from ``price_instrument``.
        """
        spot = self.df_today['implied_spot']
        texp = self.df_today['years_to_maturity']
        vol = self.df_today['atm_vol']
        return self.price_instrument('c', spot, spot, texp, vol) + self.price_instrument('p', spot, spot, texp, vol)

    def compute_straddle_prices(self, strike):
        """
        Price the straddle with a fixed ``strike`` for every row of today's
        data, using the smile volatility at that strike.

        Side effect: writes the inputs and prices to ``straddle_prices.csv``
        in the working directory on every call.
        NOTE(review): this looks like a debugging dump -- confirm it is still
        wanted, since it runs each time a position is opened.

        Parameters:
            strike (float): Strike of the straddle.

        Returns:
            The call price plus the put price from ``price_instrument``.
        """
        texp = self.df_today['years_to_maturity']
        spot = self.df_today['implied_spot']
        vols = self.compute_spot_vols(strike)
        straddle_prices = self.price_instrument('c', strike, spot, texp, vols) + self.price_instrument('p', strike, spot, texp, vols)

        df_output = pd.DataFrame()
        df_output["spot"] = spot
        df_output["texp"] = texp
        df_output["vols"] = vols
        df_output["strike"] = strike
        df_output["straddle_prices"] = straddle_prices
        df_output.to_csv("straddle_prices.csv")

        return straddle_prices

    def price_instrument(self, cp, strike, spot, texp, vol):
        """
        Price an option with ``py_vollib_vectorized``'s vectorized Black-Scholes,
        passing 0 as the interest-rate argument and requesting NumPy output.

        Parameters:
            cp (str): Option flag, ``'c'`` or ``'p'``.
            strike, spot, texp, vol: Strike, spot, time to expiry and
                volatility (scalars or array-likes).
        """
        return py_vollib_vectorized.models.vectorized_black_scholes(cp, spot, strike, texp, 0, vol, return_as="numpy")

    def update_time_step(self, minutes=1):
        """
        Advance the clock by ``minutes`` rows, never past the end of today's data.

        ``global_index`` is clamped to today's last row label so that the
        label-based lookups into ``self.df_today`` stay valid.
        """
        self.global_index = min(self.global_index + minutes, self.df_today.index.max())
        self.daily_index = min(self.daily_index + minutes, len(self.df_today) - 1)

    def get_current_time(self):
        """Return the ``minute`` timestamp of the current row."""
        return self.get_current_row()['minute']

    def get_current_row(self):
        """Return the row of ``self.df`` at the current row label."""
        # global_index is a row *label* (it comes from `.index[...]`), so use .loc.
        return self.df.loc[self.global_index]

    def open_position(self):
        """
        Open a straddle struck at the current spot, sized so that its cost
        equals ``self.capital``.

        Writes the straddle's price path and its PnL path (relative to the
        entry price, scaled by the position size) into ``self.df_today``.

        Returns:
            The cost of the position (``position * straddle_price``).
        """
        spot = self.get_current_row()['implied_spot']
        straddle_price = self.df_today['daily_straddle_prices'].loc[self.global_index]
        if (straddle_price == 0):
            print(f"eror: straddle_price={straddle_price}. at time={self.get_current_time()}")
        self.position = self.capital / straddle_price
        self.strike = spot
        self.straddle_prices = self.compute_straddle_prices(self.strike)
        self.df_today["open_straddle_prices"] = self.straddle_prices
        self.df_today["open_straddle_pnl"] = (self.df_today["open_straddle_prices"] - straddle_price) * self.position
        self.position_open_time = self.global_index
        return self.position * straddle_price

    def _get_state(self):
        """Return the current observation as an 11-element float32 array."""
        row = self.get_current_row()
        steps_taken = self.global_index - self.start_index
        steps_remaining = self.max_steps - steps_taken

        state = np.array([
            row['implied_spot'],       # current spot price
            row['atm_vol'],            # ATM implied volatility
            row['scaled_slope'],       # volatility skew slope
            row['scaled_quadratic'],   # volatility skew curvature
            steps_taken,               # minutes elapsed in this episode
            steps_remaining,           # minutes left before timeout
            self.position,             # position size (0: no position)
            int(self.position > 0),    # binary position flag
            self.pnl,                  # cumulative PnL
            self.df_today["daily_straddle_prices"].loc[self.global_index],  # current ATM straddle price
            self.df_today["open_straddle_pnl"].loc[self.global_index],      # PnL of the open position
        ], dtype=np.float32)

        return state

    def valid_actions(self):
        """Return the action indices allowed in the current position state.

        Flat: open (0) or hold (2). In a position: close (1) or hold (2).
        """
        if self.position == 0:
            return [self.ACTION_OPEN, self.ACTION_HOLD]
        return [self.ACTION_CLOSE, self.ACTION_HOLD]

    def render(self, mode="human"):
        """Print the current time, position and PnL for debugging."""
        print(f"Time: {self.get_current_time()}, Position: {self.position}, PnL: {self.pnl}")

    def close(self):
        """Nothing to release."""
        pass

In [85]:
# Build the trading environment and a DQN agent sized to its spaces.
env = SimEnv({"df": df})
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = DQNAgent(state_dim, action_dim)

num_episodes = 1000
update_target_every = 50  # sync the target network every 50 episodes

for episode in range(num_episodes):
    state, info = env.reset()
    done = False
    truncated = False
    episode_reward = 0

    # A Gymnasium episode is over when it is either terminated or truncated.
    while not (done or truncated):
        valid_actions = env.valid_actions()
        action = agent.select_action(state, valid_actions)  # masked epsilon-greedy choice
        next_state, reward, done, truncated, info = env.step(action)

        # `done` is stored as the terminal flag that switches off bootstrapping.
        agent.store_transition((state, action, reward, next_state, done))
        agent.train()

        state = next_state
        episode_reward += reward

    if episode % update_target_every == 0:
        agent.update_target_network()

    print(f"Episode {episode}, Reward: {episode_reward}, Epsilon: {agent.epsilon:.2f}")

Episode 0, Reward: -9.191996805577897, Epsilon: 1.00
Episode 1, Reward: -0.3128137749859505, Epsilon: 1.00
Episode 2, Reward: -8.191066900460113, Epsilon: 1.00
Episode 3, Reward: -14.376436552661081, Epsilon: 1.00
Episode 4, Reward: -4.680182134762129, Epsilon: 1.00
Episode 5, Reward: -8.328714255619682, Epsilon: 1.00
Episode 6, Reward: 15.13748828396314, Epsilon: 1.00
Episode 7, Reward: -10.275232323484124, Epsilon: 1.00
Episode 8, Reward: 0.0, Epsilon: 1.00
Episode 9, Reward: -8.469761039302318, Epsilon: 1.00
Episode 10, Reward: -11.24579658689321, Epsilon: 1.00
Episode 11, Reward: 0.0, Epsilon: 0.96
Episode 12, Reward: -15.183636931852321, Epsilon: 0.95
Episode 13, Reward: 0.0, Epsilon: 0.92
Episode 14, Reward: -12.529637165691472, Epsilon: 0.91
Episode 15, Reward: -11.519170177676004, Epsilon: 0.90
Episode 16, Reward: -14.295216988519385, Epsilon: 0.89
Episode 17, Reward: -19.133734551607755, Epsilon: 0.88
Episode 18, Reward: 23.82630765261593, Epsilon: 0.87
Episode 19, Reward: 0.0