In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

import gym

from torch.distributions import Categorical

In [23]:
# Load the cleaned dataset and convert it to a plain NumPy matrix.
df = pd.read_csv('cleaned_data.csv')
data = df.to_numpy()

# First cut: hold out 30% of the rows; the remaining 70% form the training set.
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
)

# Second cut: halve the held-out rows into validation and test (15% each overall).
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Report the shape of every split.
for split_label, split_array in (
    ("Train shape:", train_data),
    ("Validation shape:", val_data),
    ("Test shape:", test_data),
):
    print(split_label, split_array.shape)

Train shape: (28111, 31)
Validation shape: (6024, 31)
Test shape: (6024, 31)


In [24]:
class ActorCritic(nn.Module):
    """Two-headed network: a softmax policy head and a scalar value head.

    The two heads share no parameters; each one is its own MLP with a single
    hidden ReLU layer applied to the observation vector.

    Args:
        obs_dim: Size of the last dimension of the input observations.
        action_dim: Number of discrete actions scored by the policy head.
        hidden_size: Width of the hidden layer in each head.
    """

    def __init__(self, obs_dim, action_dim, hidden_size=64):
        super().__init__()

        # Policy head: observation -> one probability per action.
        policy_layers = [
            nn.Linear(obs_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*policy_layers)

        # Value head: observation -> a single state-value estimate.
        value_layers = [
            nn.Linear(obs_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

    def forward(self, x):
        """Return ``(action_probs, state_value)`` computed from ``x``."""
        return self.actor(x), self.critic(x)


In [25]:
# Smoke test for the ActorCritic module defined above.

# 28 input features; 3 is a temporary action count (Buy, Hold, Sell).
obs_dim, action_dim = 28, 3

model = ActorCritic(obs_dim, action_dim)

# One random observation with a leading batch dimension of 1.
dummy_input = torch.randn(1, obs_dim)

# Single forward pass through both heads.
action_probs, state_value = model(dummy_input)

# The probabilities along the last dimension should add up to 1 per sample.
prob_total = action_probs.sum(dim=-1)

print("Action probabilities:", action_probs)
print("Sum of probabilities:", prob_total)
print("State value:", state_value)


Action probabilities: tensor([[0.4410, 0.3227, 0.2362]], grad_fn=<SoftmaxBackward0>)
Sum of probabilities: tensor([1.], grad_fn=<SumBackward1>)
State value: tensor([[-0.0309]], grad_fn=<AddmmBackward0>)


In [26]:
def compute_returns_and_advantages(rewards, values, dones, gamma=0.99, lam=0.95,
                                   last_value=0):
    """Compute return targets and advantages with Generalized Advantage Estimation.

    The trajectory is walked backwards, accumulating

        delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}

    and the return target for each step is ``A_t + V_t``.

    Args:
        rewards: Sequence of per-step rewards.
        values: Sequence of value estimates, indexed like ``rewards``.
        dones: Sequence of per-step flags (1/True when the episode ended at
            that step). A set flag stops both the bootstrap from the next
            value and the accumulation of later advantages.
        gamma: Discount factor.
        lam: GAE lambda.
        last_value: Value estimate for the state that follows the final step.
            It is only used when the final step is not flagged as done. The
            default of 0 keeps the previous behaviour, where a rollout cut
            off mid-episode was bootstrapped from zero.

    Returns:
        ``(returns, advantages)``: two lists with one entry per step.
    """
    num_steps = len(rewards)
    # Preallocate and fill by index: inserting at the front of a list on every
    # iteration is quadratic in the rollout length.
    returns = [None] * num_steps
    advantages = [None] * num_steps

    gae = 0
    next_value = last_value
    for step in reversed(range(num_steps)):
        not_done = 1 - dones[step]
        delta = rewards[step] + gamma * next_value * not_done - values[step]
        gae = delta + gamma * lam * not_done * gae
        advantages[step] = gae
        returns[step] = gae + values[step]
        # The current step's value is the "next value" for the step before it.
        next_value = values[step]
    return returns, advantages

In [27]:
# Test for compute_returns_and_advantages, defined in the cell above
def test_single_step():
    """
    Check GAE on a trajectory of length one that ends the episode.

    Inputs:
      rewards = [5], values = [2], dones = [1]

    Worked example:
      delta  = 5 + 0.99 * 0 * (1 - 1) - 2 = 3
      gae    = delta = 3   (nothing follows the only step)
      return = gae + value = 3 + 2 = 5
    """
    returns, advantages = compute_returns_and_advantages(
        [5],    # rewards
        [2],    # values
        [1],    # dones: the episode ends on this step
        gamma=0.99,
        lam=0.95,
    )

    assert len(returns) == 1
    # The advantage should come out at 3 and the return at 5, within tolerance.
    assert abs(advantages[0] - 3) < 1e-5, f"Expected advantage 3, got {advantages[0]}"
    assert abs(returns[0] - 5) < 1e-5, f"Expected return 5, got {returns[0]}"
    print("test_single_step passed.")



if __name__ == "__main__":
    test_single_step()


test_single_step passed.


In [28]:
# Run PPO updates on a batch of collected data
def ppo_update(model, optimizer, states, actions, log_probs_old, returns, advantages,
               clip_param=0.2, epochs=4, batch_size=64):
    """Run several epochs of clipped-surrogate PPO updates on one batch of data.

    Args:
        model: Callable module returning ``(action_probs, state_values)`` for a
            batch of states.
        optimizer: Optimizer that updates ``model``'s parameters.
        states: Sequence of observations (converted through ``np.array``).
        actions: Sequence of integer actions, one per state.
        log_probs_old: Log-probabilities of ``actions`` recorded when the data
            was collected.
        returns: Regression targets for the value head.
        advantages: Advantage estimates, one per state.
        clip_param: Clipping range for the probability ratio.
        epochs: Number of passes over the data.
        batch_size: Minibatch size.

    Returns:
        None. ``model`` is updated in place through ``optimizer``.
    """
    # Convert our inputs into tensors
    states = torch.tensor(np.array(states), dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int64)
    returns = torch.tensor(returns, dtype=torch.float32)
    advantages = torch.tensor(advantages, dtype=torch.float32)
    log_probs_old = torch.tensor(log_probs_old, dtype=torch.float32)

    # Initialize dataset and data loader
    dataset = torch.utils.data.TensorDataset(states, actions, log_probs_old, returns, advantages)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Built once instead of on every minibatch.
    value_loss_fn = nn.MSELoss()

    # Iterate over the dataset for multiple epochs to update the model
    for _ in range(epochs):
        for batch_states, batch_actions, batch_log_probs_old, batch_returns, batch_advantages in loader:
            # Current policy's action probabilities and state-value estimates
            action_probs, state_values = model(batch_states)

            dist = torch.distributions.Categorical(action_probs)
            log_probs = dist.log_prob(batch_actions)

            # Clipped surrogate objective on the new/old probability ratio
            ratio = torch.exp(log_probs - batch_log_probs_old)
            surr1 = ratio * batch_advantages
            surr2 = torch.clamp(ratio, 1 - clip_param, 1 + clip_param) * batch_advantages
            actor_loss = -torch.min(surr1, surr2).mean()

            # Drop only the trailing value dimension. A bare squeeze() would
            # also drop the batch dimension of a final minibatch of size 1,
            # giving MSELoss a 0-d input against a 1-d target.
            critic_loss = value_loss_fn(state_values.squeeze(-1), batch_returns)

            entropy = dist.entropy().mean()

            # Actor loss + weighted critic loss - entropy bonus
            loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

            # Clear previous gradients, backpropagate the loss and update model params
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [33]:
def main(num_updates=1000, steps_per_update=2048, gamma=0.99, lam=0.95):
    """Train an ActorCritic policy on CartPole-v1 with PPO and save its weights.

    Each update collects ``steps_per_update`` environment steps with the
    current policy, computes GAE returns/advantages, normalises the
    advantages, and runs ``ppo_update`` on the batch. The model's state dict
    is written to ``ppo_actor_critic.pth`` once all updates have finished.

    Args:
        num_updates: Number of PPO updates (rollout + optimisation rounds).
        steps_per_update: Environment steps collected per update.
        gamma: Discount factor passed to the advantage computation.
        lam: GAE lambda passed to the advantage computation.
    """
    print("Starting training")
    env = gym.make("CartPole-v1")
    try:
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        model = ActorCritic(obs_dim, action_dim)
        optimizer = optim.Adam(model.parameters(), lr=3e-4)

        # Unpack observation from reset
        state, _ = env.reset()

        # Tracked outside the update loop: an episode can straddle two
        # rollouts, and resetting the tally per update would under-report it.
        episode_reward = 0

        for _ in range(num_updates):
            states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []

            # Collect trajectories for this update
            for _ in range(steps_per_update):
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                # Only plain numbers are stored from the rollout, so no
                # autograd graph is needed here.
                with torch.no_grad():
                    action_probs, value = model(state_tensor)
                dist = Categorical(action_probs)
                action = dist.sample().item()
                log_prob = dist.log_prob(torch.tensor(action)).item()

                next_state, reward, terminated, truncated, _ = env.step(action)
                # NOTE(review): truncation is treated like termination, so no
                # value is bootstrapped past a time-limit cut-off.
                done = terminated or truncated

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                dones.append(done)
                log_probs.append(log_prob)
                values.append(value.item())

                # Track reward for the episode
                episode_reward += reward

                state = next_state
                if done:
                    print(f"Episode finished. Total reward: {episode_reward}")
                    episode_reward = 0  # Reset for next episode
                    state, _ = env.reset()

            # Compute returns and advantages for the collected trajectory
            returns, advantages = compute_returns_and_advantages(rewards, values, dones, gamma, lam)
            # Normalise advantages to zero mean / unit std for the update.
            advantages = (np.array(advantages) - np.mean(advantages)) / (np.std(advantages) + 1e-8)
            ppo_update(model, optimizer, states, actions, log_probs, returns, advantages)

        torch.save(model.state_dict(), "ppo_actor_critic.pth")
        print("Training completed and model saved.")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()



if __name__ == "__main__":
    main()



Starting training
Episode finished. Total reward: 22.0
Episode finished. Total reward: 16.0
Episode finished. Total reward: 46.0
Episode finished. Total reward: 37.0
Episode finished. Total reward: 11.0
Episode finished. Total reward: 25.0
Episode finished. Total reward: 37.0
Episode finished. Total reward: 37.0
Episode finished. Total reward: 30.0
Episode finished. Total reward: 32.0
Episode finished. Total reward: 55.0
Episode finished. Total reward: 37.0
Episode finished. Total reward: 14.0
Episode finished. Total reward: 14.0
Episode finished. Total reward: 16.0
Episode finished. Total reward: 19.0
Episode finished. Total reward: 20.0
Episode finished. Total reward: 35.0
Episode finished. Total reward: 16.0
Episode finished. Total reward: 31.0
Episode finished. Total reward: 13.0
Episode finished. Total reward: 15.0
Episode finished. Total reward: 28.0
Episode finished. Total reward: 12.0
Episode finished. Total reward: 18.0
Episode finished. Total reward: 15.0
Episode finished. To

KeyboardInterrupt: 

<function __main__.main()>