**Step 1: Data Preprocessing**

In [None]:
import pandas as pd

# Load the market data
data_file = '/content/xnas-itch-20230703.tbbo.csv'
market_data = pd.read_csv(data_file)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
market_data.info()
print(market_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59271 entries, 0 to 59270
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ts_recv        59271 non-null  int64 
 1   ts_event       59271 non-null  int64 
 2   rtype          59271 non-null  int64 
 3   publisher_id   59271 non-null  int64 
 4   instrument_id  59271 non-null  int64 
 5   action         59271 non-null  object
 6   side           59271 non-null  object
 7   depth          59271 non-null  int64 
 8   price          59271 non-null  int64 
 9   size           59271 non-null  int64 
 10  flags          59271 non-null  int64 
 11  ts_in_delta    59271 non-null  int64 
 12  sequence       59271 non-null  int64 
 13  bid_px_00      59271 non-null  int64 
 14  ask_px_00      59271 non-null  int64 
 15  bid_sz_00      59271 non-null  int64 
 16  ask_sz_00      59271 non-null  int64 
 17  bid_ct_00      59271 non-null  int64 
 18  ask_ct_00      59271 non-n

In [None]:
# Count the missing values in every column of the loaded frame
missing_per_column = market_data.isna().sum()
print(missing_per_column)

ts_recv          0
ts_event         0
rtype            0
publisher_id     0
instrument_id    0
action           0
side             0
depth            0
price            0
size             0
flags            0
ts_in_delta      0
sequence         0
bid_px_00        0
ask_px_00        0
bid_sz_00        0
ask_sz_00        0
bid_ct_00        0
ask_ct_00        0
symbol           0
dtype: int64


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Columns fed to the model: five raw columns plus one derived indicator
features = ['price', 'bid_px_00', 'ask_px_00', 'bid_sz_00', 'ask_sz_00', 'moving_average']

# Create technical indicators: 5-row rolling mean of the price column
market_data['moving_average'] = market_data['price'].rolling(window=5).mean()

# Forward-fill gaps. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
# Forward fill cannot repair the first rows of the rolling mean (nothing
# precedes them), so those rows are reported and dropped below.
market_data = market_data.ffill()

# Check for NaN values in the data
nan_columns = market_data.columns[market_data.isna().any()].tolist()
print(f"Columns with NaN values: {nan_columns}")

if market_data.isna().sum().sum() > 0:
    print("Handling remaining NaN values...")
    # Drop the rows that still contain NaN values
    market_data = market_data.dropna()

# Verify no NaNs are left
if market_data.isna().sum().sum() > 0:
    raise ValueError("NaN values found in the market data after preprocessing")

# Normalize the features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test-period statistics leak into training.
# NOTE(review): market_data is overwritten in place; re-running this cell
# re-scales already scaled columns.
scaler = StandardScaler()
market_data[features] = scaler.fit_transform(market_data[features])

# Create sequences
import numpy as np

def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` covers rows ``i`` to ``i + seq_length - 1``. As in the original
    loop, ``len(data) - seq_length`` windows are produced (start indices
    ``0 .. len(data) - seq_length - 1``).

    Args:
        data: pandas DataFrame/Series, or any array-like ``np.asarray`` accepts.
        seq_length: number of rows per window; must be positive.

    Returns:
        np.ndarray of shape ``(num_windows, seq_length, *row_shape)``. When the
        input has too few rows for a single window, an empty array with that
        same trailing shape is returned so downstream shape handling still works.

    Raises:
        ValueError: if ``seq_length`` is not positive.
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer")

    # Convert once up front instead of calling .iloc[...].values per window
    values = data.values if hasattr(data, "iloc") else np.asarray(data)

    num_windows = len(values) - seq_length
    if num_windows <= 0:
        return np.empty((0, seq_length) + values.shape[1:], dtype=values.dtype)

    return np.stack([values[start:start + seq_length] for start in range(num_windows)])

# Every model input is a window of this many consecutive rows
seq_length = 30

# Build the overlapping windows from the selected feature columns
feature_frame = market_data[features]
sequences = create_sequences(feature_frame, seq_length)

# Ordered 80/20 split: the first 80% of the windows train, the rest test
train_size = int(0.8 * len(sequences))
train_sequences, test_sequences = sequences[:train_size], sequences[train_size:]

print("Train sequences shape:", train_sequences.shape)
print("Test sequences shape:", test_sequences.shape)


Columns with NaN values: ['moving_average']
Handling remaining NaN values...
Train sequences shape: (47389, 30, 6)
Test sequences shape: (11848, 30, 6)


**Step 2: Define the Transformer-Based PPO Model**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.distributions import Categorical
import numpy as np
import copy

# Device configuration: use CUDA when a GPU is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class ActorCriticWithTransformer(nn.Module):
    """Actor-critic network that encodes a feature sequence with a Transformer.

    The input is embedded, passed through a Transformer encoder, averaged over
    the sequence axis and fed to two linear heads: an actor head producing
    action probabilities and a critic head producing a state value.

    Args:
        input_dim: number of features per time step.
        model_dim: width of the embedding / Transformer model.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        action_dim: number of discrete actions.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, action_dim):
        super(ActorCriticWithTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        # batch_first=True is required: forward() averages over dim=1 as the
        # sequence axis, i.e. inputs are (batch, seq, features). With the
        # default (seq-first) layout attention would run across the batch axis
        # and mix unrelated samples.
        # The layer stays registered as an attribute so the module's
        # state_dict keys are unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc_actor = nn.Linear(model_dim, action_dim)
        self.fc_critic = nn.Linear(model_dim, 1)

    @staticmethod
    def _report_nan(tensor, message):
        """Print ``message`` and the tensor when it contains any NaN (debug aid)."""
        if torch.any(torch.isnan(tensor)):
            print(message)
            print(tensor)

    def forward(self, state):
        """Compute action probabilities and the state value.

        Args:
            state: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tuple ``(action_probs, state_value)`` with shapes
            (batch, action_dim) and (batch, 1).
        """
        self._report_nan(state, "NaN detected in input state")

        x = self.embedding(state)
        self._report_nan(x, "NaN detected after embedding")

        x = self.transformer_encoder(x)
        self._report_nan(x, "NaN detected after transformer_encoder")

        x = x.mean(dim=1)  # Aggregate sequence output
        action_logits = self.fc_actor(x)
        self._report_nan(action_logits, "NaN detected in action_logits")

        # torch.softmax avoids depending on the `F` alias, which this cell
        # never imports.
        action_probs = torch.softmax(action_logits, dim=-1)
        self._report_nan(action_probs, "NaN detected in action_probs")

        state_value = self.fc_critic(x)
        self._report_nan(state_value, "NaN detected in state_value")

        return action_probs, state_value

# Example usage: network hyper-parameters
input_dim = len(features)  # one input per selected feature column
model_dim, num_heads, num_layers = 512, 8, 6
action_dim = 3  # Buy, Sell, Hold

# Build the network, then move its parameters to the selected device
model = ActorCriticWithTransformer(input_dim, model_dim, num_heads, num_layers, action_dim)
model = model.to(device)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from torch.distributions import Categorical
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import copy

# Define the ActorCriticWithTransformer class
class ActorCriticWithTransformer(nn.Module):
    """Transformer encoder with an actor head and a critic head.

    Both heads read the encoder output at the last sequence position.

    NOTE(review): this redefinition replaces the earlier class of the same name
    and takes its arguments in a different order
    (state_dim, action_dim, embedding_dim, num_layers, num_heads).
    """

    def __init__(self, state_dim, action_dim, embedding_dim=512, num_layers=6, num_heads=8):
        super().__init__()
        self.embedding = nn.Linear(state_dim, embedding_dim)
        # Inputs are laid out (batch, seq, features), hence batch_first=True
        layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.fc_actor = nn.Linear(embedding_dim, action_dim)
        self.fc_critic = nn.Linear(embedding_dim, 1)

    def forward(self, state):
        """Return ``(action_probs, state_value)`` for a batch of sequences."""
        encoded = self.transformer_encoder(self.embedding(state))
        last_step = encoded[:, -1, :]  # summary = final position of each sequence

        action_logits = self.fc_actor(last_step)
        state_value = self.fc_critic(last_step)

        if torch.isnan(action_logits).any():
            print("NaN detected in action_logits")
            print(action_logits)

        action_probs = torch.softmax(action_logits, dim=-1)

        if torch.isnan(action_probs).any():
            print("NaN detected in action_probs")

        return action_probs, state_value

# Define the PPO class
class PPO:
    """Proximal Policy Optimization agent built on ``ActorCriticWithTransformer``.

    Keeps a trainable ``policy`` and a snapshot ``policy_old`` that is
    refreshed from ``policy`` at the end of every ``update`` call.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip):
        """Build both networks and the optimizer.

        Args:
            state_dim: number of input features per time step.
            action_dim: number of discrete actions.
            lr_actor: learning rate for the encoder and the actor head.
            lr_critic: learning rate for the critic head.
            gamma: discount factor for the returns.
            K_epochs: optimisation passes per ``update`` call.
            eps_clip: clipping range of the probability ratio.
        """
        self.policy = ActorCriticWithTransformer(state_dim, action_dim).to(device)
        self.policy_old = ActorCriticWithTransformer(state_dim, action_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        # A parameter may appear in only one optimizer group (torch raises a
        # ValueError otherwise). The critic head gets lr_critic; everything
        # else (shared encoder + actor head) gets lr_actor.
        critic_params = list(self.policy.fc_critic.parameters())
        critic_ids = {id(p) for p in critic_params}
        actor_params = [p for p in self.policy.parameters() if id(p) not in critic_ids]
        self.optimizer = optim.Adam([
            {'params': actor_params, 'lr': lr_actor},
            {'params': critic_params, 'lr': lr_critic}
        ])
        self.MseLoss = nn.MSELoss()
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

    def update(self, memory):
        """Run ``K_epochs`` clipped-surrogate updates on the stored transitions.

        Args:
            memory: object exposing ``states``, ``actions``, ``logprobs``,
                ``rewards`` and ``is_terminals`` lists of equal length.
        """
        # Discounted returns, accumulated backwards; a terminal flag resets
        # the running sum. Append + reverse avoids quadratic insert(0, ...).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        for _ in range(self.K_epochs):
            action_probs, state_values = self.policy(old_states)
            dist = Categorical(action_probs)
            logprobs = dist.log_prob(old_actions)
            dist_entropy = dist.entropy()
            # squeeze(-1) keeps the batch axis even for a single transition;
            # a bare squeeze() would collapse (1, 1) to a 0-d tensor.
            state_values = state_values.squeeze(-1)

            ratios = torch.exp(logprobs - old_logprobs.detach())
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Snapshot the updated weights for the next rollout
        self.policy_old.load_state_dict(self.policy.state_dict())

# Assume Memory class and other necessary components are defined elsewhere

# Initialize the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the PPO agent.
# PPO is a plain Python class, not an nn.Module, so it has no .to() method;
# its __init__ already moves both networks to `device`.
ppo = PPO(
    state_dim=6,        # Number of input features
    action_dim=3,       # Number of output actions (Buy, Hold, Sell)
    lr_actor=0.0003,    # Learning rate for actor
    lr_critic=0.0003,   # Learning rate for critic
    gamma=0.99,         # Discount factor
    K_epochs=4,         # Number of epochs
    eps_clip=0.2        # Clipping parameter for PPO
)

# Assume train_loader and test_loader are already defined and loaded with data


**Step 3: Initialize Weights**

In [None]:
import torch.nn.init as init

def init_weights(m):
    """Kaiming-uniform initialise a module's weight and set its bias to 0.01.

    Handles ``nn.Linear`` (weight / bias) and ``nn.TransformerEncoderLayer``
    (the self-attention input projection). Any other module is left untouched.
    Intended for use with ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        weight, bias = m.weight, m.bias
    elif isinstance(m, nn.TransformerEncoderLayer):
        attention = m.self_attn
        weight, bias = attention.in_proj_weight, attention.in_proj_bias
    else:
        return

    init.kaiming_uniform_(weight, nonlinearity='relu')
    if bias is not None:
        bias.data.fill_(0.01)

# Run init_weights on the model and on each of its submodules
model.apply(fn=init_weights)

ActorCriticWithTransformer(
  (embedding): Linear(in_features=6, out_features=512, bias=True)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out

**Step 4: Define the PPO Class**

In [None]:
class PPOWithTransformer:
    """PPO agent whose actor-critic is a Transformer over feature windows.

    Keeps a trainable ``policy`` and a snapshot ``policy_old`` that is
    refreshed from ``policy`` at the end of every ``update`` call.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, action_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, K_epochs=4):
        """Build both networks and the optimizer.

        Args:
            input_dim: number of features per time step.
            model_dim: Transformer model width.
            num_heads: attention heads per encoder layer.
            num_layers: number of encoder layers.
            action_dim: number of discrete actions.
            lr: Adam learning rate.
            gamma: discount factor for the returns.
            eps_clip: clipping range of the probability ratio.
            K_epochs: optimisation passes per ``update`` call.
        """
        # Keyword arguments on purpose: this notebook defines
        # ActorCriticWithTransformer twice with different parameter orders.
        # Passing by name fails loudly if the wrong definition is live,
        # instead of silently binding values to the wrong parameters.
        network_kwargs = dict(
            input_dim=input_dim,
            model_dim=model_dim,
            num_heads=num_heads,
            num_layers=num_layers,
            action_dim=action_dim,
        )
        self.policy = ActorCriticWithTransformer(**network_kwargs).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.policy_old = ActorCriticWithTransformer(**network_kwargs).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

    def update(self, memory):
        """Run ``K_epochs`` clipped-surrogate updates on the stored transitions.

        Args:
            memory: object exposing ``states``, ``actions``, ``logprobs``,
                ``rewards`` and ``is_terminals`` lists of equal length.
        """
        # Discounted returns, accumulated backwards; a terminal flag resets
        # the running sum. Append + reverse avoids quadratic insert(0, ...).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalize rewards. The std of a single value is NaN, which would
        # poison the loss, so one-sample batches are left unnormalised.
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            action_probs, state_values = self.policy(old_states)
            # squeeze(-1) keeps the batch axis even for a single transition;
            # a bare squeeze() would collapse (1, 1) to a 0-d tensor.
            state_values = state_values.squeeze(-1)
            dist = Categorical(action_probs)
            new_logprobs = dist.log_prob(old_actions)
            dist_entropy = dist.entropy()

            ratios = torch.exp(new_logprobs - old_logprobs)

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=0.5)
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())


**Step 5: Define Memory Class**

In [None]:
class Memory:
    """Rollout buffer: five parallel lists, one entry per stored transition."""

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for buffer in (self.actions, self.states, self.logprobs,
                       self.rewards, self.is_terminals):
            buffer.clear()

    def push(self, state, action, logprob, reward, is_terminal):
        """Append one transition, one field per list."""
        fields = (
            (self.actions, action),
            (self.states, state),
            (self.logprobs, logprob),
            (self.rewards, reward),
            (self.is_terminals, is_terminal),
        )
        for buffer, value in fields:
            buffer.append(value)


**Step 6: Training Loop**

In [None]:
import numpy as np
import copy
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# Convert sequences to float32 PyTorch tensors on the training device.
# torch.as_tensor accepts NumPy arrays and tensors alike, so re-running this
# cell (when the names already hold tensors) no longer triggers the
# copy-construct warning that torch.tensor(<tensor>) emits.
train_sequences = torch.as_tensor(train_sequences, dtype=torch.float32, device=device)
test_sequences = torch.as_tensor(test_sequences, dtype=torch.float32, device=device)

# Create DataLoader for batching
batch_size = 32
train_loader = DataLoader(TensorDataset(train_sequences), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(test_sequences), batch_size=batch_size, shuffle=False)

# Training parameters
num_epochs = 20
ppo = PPOWithTransformer(input_dim, model_dim, num_heads, num_layers, action_dim)
memory = Memory()

# Apply weight initialization, then re-sync the rollout policy: policy_old was
# copied from policy inside the constructor, i.e. before this re-initialisation,
# so without the copy the two networks would start with different weights.
ppo.policy.apply(init_weights)
ppo.policy_old.load_state_dict(ppo.policy.state_dict())


# Define early stopping criteria
class EarlyStopping:
    """Track the best validation loss and flag when training should stop.

    A call counts as an improvement when ``-val_loss`` is at least
    ``best_score + delta``; the model's ``state_dict`` is then deep-copied
    into ``best_model``. After ``patience`` consecutive calls without
    improvement ``early_stop`` becomes True.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        # Defined up front so reading it before the first call yields None
        # instead of raising AttributeError.
        self.best_model = None
        self.early_stop = False
        self.counter = 0

    def __call__(self, val_loss, model):
        """Record one validation result.

        Args:
            val_loss: validation loss of the current epoch (lower is better).
            model: object exposing ``state_dict()``.
        """
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.best_model = copy.deepcopy(model.state_dict())
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model = copy.deepcopy(model.state_dict())
            self.counter = 0

# Stop after five epochs in a row whose validation loss is not at least 0.01
# below the best one seen so far
early_stopping = EarlyStopping(delta=0.01, patience=5)
# Multiply the learning rate by 0.1 every five scheduler steps
scheduler = StepLR(optimizer=ppo.optimizer, gamma=0.1, step_size=5)

# Training parameters
best_val_loss = float('inf')
num_epochs = 50  # Increased number of epochs for more training

def calculate_accuracy(predictions, targets):
    """Return the fraction of positions where ``predictions`` equals ``targets``.

    Args:
        predictions: tensor of predicted class indices.
        targets: tensor of reference class indices, same shape.

    Returns:
        float in [0, 1]; 0.0 when ``targets`` is empty (instead of raising
        ZeroDivisionError).
    """
    total = len(targets)
    if total == 0:
        return 0.0
    correct = (predictions == targets).sum().item()
    return correct / total

# Training loop with early stopping and learning rate scheduling.
# NOTE: rewards (and the validation targets) are simulated with random noise,
# so the printed loss/accuracy only exercise the pipeline end to end.
for epoch in range(num_epochs):
    ppo.policy.train()
    for batch in train_loader:
        states = batch[0]

        if torch.any(torch.isnan(states)):
            print("NaN detected in states during training")
            print(states)

        # Rollout with the old policy. No autograd graph is needed here:
        # update() detaches everything it reads back from memory.
        with torch.no_grad():
            action_probs, state_values = ppo.policy_old(states)

        if torch.any(torch.isnan(action_probs)):
            print("NaN detected in action_probs during training")
            print(action_probs)

        if torch.any(torch.isnan(state_values)):
            print("NaN detected in state_value during training")
            print(state_values)

        dist = Categorical(action_probs)
        actions = dist.sample()
        logprobs = dist.log_prob(actions)

        # Simulate rewards and terminals (this should be replaced with real logic)
        rewards = torch.randn(states.size(0)).to(device)  # Use states.size(0) to handle last batch size
        is_terminals = torch.zeros(states.size(0)).to(device)

        for i in range(states.size(0)):  # Use states.size(0) to handle last batch size
            memory.push(states[i], actions[i], logprobs[i], rewards[i], is_terminals[i])

        ppo.update(memory)
        memory.clear_memory()

    # Validation
    ppo.policy.eval()
    val_rewards = []
    val_loss = 0
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for batch in test_loader:
            states = batch[0]

            if torch.any(torch.isnan(states)):
                print("NaN detected in states during validation")
                print(states)

            action_probs, state_values = ppo.policy(states)

            if torch.any(torch.isnan(action_probs)):
                print("NaN detected in action_probs during validation")
                print(action_probs)

            dist = Categorical(action_probs)
            actions = dist.sample()
            val_rewards.extend(actions.cpu().numpy())

            # Previously this compared (batch, 1) values against `rewards`
            # left over from the last training batch (29 samples), which
            # raised the size-mismatch RuntimeError. Flatten the values and
            # draw targets of the matching size for this batch instead.
            state_values = state_values.view(-1)
            val_targets = torch.randn(state_values.size(0)).to(device)
            val_loss += ppo.MseLoss(state_values, val_targets).item()

            # Calculate accuracy: accumulate the COUNT of matches so that the
            # division by total_samples below yields a proper fraction.
            predictions = torch.argmax(action_probs, dim=1)
            correct_predictions += (predictions == actions).sum().item()
            total_samples += len(actions)

    val_loss /= len(test_loader)
    accuracy = correct_predictions / total_samples
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss}, Validation Rewards: {np.mean(val_rewards)}, Accuracy: {accuracy * 100:.2f}%')

    # Early stopping
    early_stopping(val_loss, ppo.policy)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Step the scheduler
    scheduler.step()

# Load the best model
ppo.policy.load_state_dict(early_stopping.best_model)

# Save the trained model
torch.save(ppo.policy.state_dict(), 'ppo_transformer_model.pth')

  train_sequences = torch.tensor(train_sequences, dtype=torch.float32).clone().detach().to(device)
  test_sequences = torch.tensor(test_sequences, dtype=torch.float32).clone().detach().to(device)
  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (32) must match the size of tensor b (29) at non-singleton dimension 0

In [None]:
import numpy as np
import copy
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# Convert sequences to float32 PyTorch tensors on the training device.
# torch.as_tensor accepts NumPy arrays and tensors alike, so re-running this
# cell (when the names already hold tensors) no longer triggers the
# copy-construct warning that torch.tensor(<tensor>) emits.
train_sequences = torch.as_tensor(train_sequences, dtype=torch.float32, device=device)
test_sequences = torch.as_tensor(test_sequences, dtype=torch.float32, device=device)

# Create DataLoader for batching
batch_size = 32
train_loader = DataLoader(TensorDataset(train_sequences), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(test_sequences), batch_size=batch_size, shuffle=False)

# Training parameters
num_epochs = 50  # Increased number of epochs for more training
ppo = PPOWithTransformer(input_dim, model_dim, num_heads, num_layers, action_dim)
memory = Memory()

# Apply weight initialization, then re-sync the rollout policy: policy_old was
# copied from policy inside the constructor, i.e. before this re-initialisation,
# so without the copy the two networks would start with different weights.
ppo.policy.apply(init_weights)
ppo.policy_old.load_state_dict(ppo.policy.state_dict())

# Define early stopping criteria
class EarlyStopping:
    """Track the best validation loss and flag when training should stop.

    A call counts as an improvement when ``-val_loss`` is at least
    ``best_score + delta``; the model's ``state_dict`` is then deep-copied
    into ``best_model``. After ``patience`` consecutive calls without
    improvement ``early_stop`` becomes True.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        # Defined up front so reading it before the first call yields None
        # instead of raising AttributeError.
        self.best_model = None
        self.early_stop = False
        self.counter = 0

    def __call__(self, val_loss, model):
        """Record one validation result.

        Args:
            val_loss: validation loss of the current epoch (lower is better).
            model: object exposing ``state_dict()``.
        """
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.best_model = copy.deepcopy(model.state_dict())
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model = copy.deepcopy(model.state_dict())
            self.counter = 0

# Stop after five epochs in a row whose validation loss is not at least 0.01
# below the best one seen so far
early_stopping = EarlyStopping(delta=0.01, patience=5)
# Multiply the learning rate by 0.1 every five scheduler steps
scheduler = StepLR(optimizer=ppo.optimizer, gamma=0.1, step_size=5)

# Function to calculate accuracy
def calculate_accuracy(predictions, targets):
    """Return the fraction of positions where ``predictions`` equals ``targets``.

    Args:
        predictions: tensor of predicted class indices.
        targets: tensor of reference class indices, same shape.

    Returns:
        float in [0, 1]; 0.0 when ``targets`` is empty (instead of raising
        ZeroDivisionError).
    """
    total = len(targets)
    if total == 0:
        return 0.0
    correct = (predictions == targets).sum().item()
    return correct / total

# Training loop with early stopping and learning rate scheduling.
# NOTE: rewards (and the validation targets) are simulated with random noise,
# so the printed loss/accuracy only exercise the pipeline end to end; the
# "accuracy" is the share of sampled actions that equal the argmax action.
for epoch in range(num_epochs):
    ppo.policy.train()
    for batch in train_loader:
        states = batch[0]

        if torch.any(torch.isnan(states)):
            print("NaN detected in states during training")
            print(states)

        # Rollout with the old policy. No autograd graph is needed here:
        # update() detaches everything it reads back from memory.
        with torch.no_grad():
            action_probs, state_values = ppo.policy_old(states)

        if torch.any(torch.isnan(action_probs)):
            print("NaN detected in action_probs during training")
            print(action_probs)

        if torch.any(torch.isnan(state_values)):
            print("NaN detected in state_value during training")
            print(state_values)

        dist = Categorical(action_probs)
        actions = dist.sample()
        logprobs = dist.log_prob(actions)

        # Simulate rewards and terminals (this should be replaced with real logic)
        rewards = torch.randn(states.size(0)).to(device)  # Use states.size(0) to handle last batch size
        is_terminals = torch.zeros(states.size(0)).to(device)

        for i in range(states.size(0)):  # Use states.size(0) to handle last batch size
            memory.push(states[i], actions[i], logprobs[i], rewards[i], is_terminals[i])

        ppo.update(memory)
        memory.clear_memory()

    # Validation
    ppo.policy.eval()
    val_rewards = []
    val_loss = 0
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for batch in test_loader:
            states = batch[0]

            if torch.any(torch.isnan(states)):
                print("NaN detected in states during validation")
                print(states)

            action_probs, state_values = ppo.policy(states)

            if torch.any(torch.isnan(action_probs)):
                print("NaN detected in action_probs during validation")
                print(action_probs)

            dist = Categorical(action_probs)
            actions = dist.sample()
            val_rewards.extend(actions.cpu().numpy())
            state_values = state_values.view(-1)  # Flatten state_values to match the target shape
            # Separate name so the training-loop `rewards` variable is not clobbered
            val_targets = torch.randn(state_values.size(0)).to(device)  # Simulate rewards for validation
            val_loss += ppo.MseLoss(state_values, val_targets).item()  # Ensure matching sizes

            # Calculate accuracy
            predictions = torch.argmax(action_probs, dim=1)
            correct_predictions += (predictions == actions).sum().item()
            total_samples += len(actions)

    val_loss /= len(test_loader)
    accuracy = correct_predictions / total_samples
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss}, Validation Rewards: {np.mean(val_rewards)}, Accuracy: {accuracy * 100:.2f}%')

    # Early stopping
    early_stopping(val_loss, ppo.policy)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Step the scheduler
    scheduler.step()

# Load the best model
ppo.policy.load_state_dict(early_stopping.best_model)

# Save the trained model
torch.save(ppo.policy.state_dict(), 'ppo_transformer_model.pth')


  train_sequences = torch.tensor(train_sequences, dtype=torch.float32).clone().detach().to(device)
  test_sequences = torch.tensor(test_sequences, dtype=torch.float32).clone().detach().to(device)


Epoch [1/50], Validation Loss: 1.018490174227969, Validation Rewards: 0.99957798784605, Accuracy: 99.96%
Epoch [2/50], Validation Loss: 1.0200101827996761, Validation Rewards: 0.9983119513841998, Accuracy: 99.83%
Epoch [3/50], Validation Loss: 0.980665816370689, Validation Rewards: 1.0, Accuracy: 100.00%
Epoch [4/50], Validation Loss: 1.0023081301839525, Validation Rewards: 0.99983119513842, Accuracy: 99.98%
Epoch [5/50], Validation Loss: 1.0079804384162163, Validation Rewards: 1.9983963538149898, Accuracy: 99.92%
Epoch [6/50], Validation Loss: 0.9919212429992593, Validation Rewards: 1.9964550979068196, Accuracy: 99.82%
Epoch [7/50], Validation Loss: 1.005018877533247, Validation Rewards: 1.287390276839973, Accuracy: 64.24%
Epoch [8/50], Validation Loss: 0.989236916412883, Validation Rewards: 0.5072586090479406, Accuracy: 69.45%
Early stopping


**Step 7: Evaluation**

In [None]:
def evaluate_model(ppo, test_loader):
    """Sample one action per test sequence and print RMSE / MAE against zero.

    The "errors" are the sampled action indices measured against an all-zero
    reference, so both metrics describe the distribution of chosen actions.
    Prints the two metrics and returns None.
    """
    ppo.policy.eval()
    sampled_actions = []

    with torch.no_grad():
        for batch in test_loader:
            states = batch[0]

            if torch.isnan(states).any():
                print("NaN detected in states during evaluation")
                print(states)

            action_probs = ppo.policy(states)[0]

            if torch.isnan(action_probs).any():
                print("NaN detected in action_probs during evaluation")
                print(action_probs)

            drawn = Categorical(action_probs).sample()
            sampled_actions.extend(drawn.cpu().numpy())

    # Calculate metrics like RMSE, MAE, etc.
    action_array = np.array(sampled_actions)
    errors = action_array - np.zeros_like(action_array)
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))

    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')

# Print the evaluation metrics for the trained agent on the test split
evaluate_model(ppo=ppo, test_loader=test_loader)


RMSE: 1.0
MAE: 1.0


In [None]:
# Function to map numerical predictions to text labels
def map_prediction_to_text(prediction):
    """Translate an action index into its label.

    0 -> "Hold", 1 -> "Buy", 2 -> "Sell"; anything else -> "Unknown".
    """
    labels = ((0, "Hold"), (1, "Buy"), (2, "Sell"))
    # Compared with == in order, so NumPy / tensor scalars behave like ints
    for code, label in labels:
        if prediction == code:
            return label
    return "Unknown"

# Function to evaluate the model and print predictions vs actual results
def evaluate_model_and_display_results(ppo, test_loader):
    """Print RMSE / MAE of the argmax actions and one line per sample.

    The "actual" labels are drawn uniformly at random from {0, 1, 2} inside
    this function (simulated), so the metrics do not measure real accuracy.
    """
    ppo.policy.eval()
    predicted, actual, sampled = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            states = batch[0]
            batch_len = states.size(0)
            labels = torch.randint(0, 3, (batch_len,))  # Simulate actual labels

            if torch.isnan(states).any():
                print("NaN detected in states during evaluation")
                print(states)

            action_probs = ppo.policy(states)[0]

            if torch.isnan(action_probs).any():
                print("NaN detected in action_probs during evaluation")
                print(action_probs)

            # Sampled actions are collected but not used by the metrics below
            drawn = Categorical(action_probs).sample()
            sampled.extend(drawn.cpu().numpy())

            # Store predictions and actual results
            best_actions = torch.argmax(action_probs, dim=1)
            predicted.extend(best_actions.cpu().numpy())
            actual.extend(labels.cpu().numpy())

    # Calculate RMSE and MAE
    errors = np.array(predicted) - np.array(actual)
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))

    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')

    # Display predictions vs actual results
    for idx in range(len(predicted)):
        prediction_text = map_prediction_to_text(predicted[idx])
        actual_text = map_prediction_to_text(actual[idx])
        print(f'Prediction: {prediction_text}, Actual: {actual_text}')

# Evaluate the model and display results
# Print the metrics and the per-sample prediction/actual pairs for the test split
evaluate_model_and_display_results(ppo=ppo, test_loader=test_loader)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Sell
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Hold
Prediction: Buy, Actual: Buy
Prediction: Buy, Actual: Buy
Prediction: Bu