In [11]:
from typing import *
import yfinance as yf
import pandas as pd
import datetime
import time
import torch
from torch import nn

%load_ext memory_profiler

In [2]:
main_base: pd.DataFrame = pd.read_parquet(r"C:\Users\Ran\dev\vibe-trader\vibe_trader\notebooks\main_base_2000.parquet")
main_base.head()

Unnamed: 0,Close,High,Low,Open,Volume,ticker,Adj Close
2025-03-10 15:30:00+00:00,10.03,10.03,10.03,10.03,0.0,AACBU,
2025-03-12 16:30:00+00:00,10.04,10.04,10.04,10.04,24993.0,AACBU,
2025-03-12 17:30:00+00:00,10.1,10.1,10.04,10.04,405445.0,AACBU,
2025-03-13 13:30:00+00:00,10.0401,10.0479,10.04,10.0422,0.0,AACBU,
2025-03-13 19:30:00+00:00,10.06,10.06,10.06,10.06,0.0,AACBU,


In [3]:
def reset_index_as_ts(df: pd.DataFrame) -> pd.DataFrame:
    return df.reset_index().rename(columns={"index": "timestamp"})


def standardize_col_names(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [c.lower().replace(" ", "_") for c in df.columns]
    return df
        

In [4]:
main_base = main_base.pipe(reset_index_as_ts).pipe(standardize_col_names)

In [5]:
main_base.head()

Unnamed: 0,timestamp,close,high,low,open,volume,ticker,adj_close
0,2025-03-10 15:30:00+00:00,10.03,10.03,10.03,10.03,0.0,AACBU,
1,2025-03-12 16:30:00+00:00,10.04,10.04,10.04,10.04,24993.0,AACBU,
2,2025-03-12 17:30:00+00:00,10.1,10.1,10.04,10.04,405445.0,AACBU,
3,2025-03-13 13:30:00+00:00,10.0401,10.0479,10.04,10.0422,0.0,AACBU,
4,2025-03-13 19:30:00+00:00,10.06,10.06,10.06,10.06,0.0,AACBU,


In [6]:
# state: balance, stock in portfolio, all past close, high, low, open, volumes from all tickers, timestamps

In [7]:
import gymnasium as gym
import numpy as np

class TradingEnv(gym.Env):
    def __init__(self, data, initial_balance=10000, max_stocks=10, window_size=8):
        super().__init__()
        self.data = data  # Shape: [num_steps, num_tickers, features]
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.max_stocks = max_stocks
        self.current_step = window_size
        self.num_tickers = data.shape[1]

        self.action_space = gym.spaces.Discrete(self.num_tickers * 3)  # Buy, Hold, Sell for each stock
        obs_shape = (self.num_tickers, data.shape[2], window_size)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)

        self.reset()

    def reset(self):
        self.balance = self.initial_balance
        self.previous_balance = self.balance
        self.portfolio = np.zeros(self.num_tickers)
        self.current_step = self.window_size
        return self._get_observation()

    def _get_observation(self):
        window_data = self.data[self.current_step - self.window_size:self.current_step]
        obs = np.transpose(window_data, (1, 2, 0))  # [tickers, features, window]
        return obs.astype(np.float32)

    def _update_balance(self) -> None:
        """Update balance according to price changes"""
        self.previous_balance = self.balance
        prices = self.data[self.current_step, :, 1]  # close price of each stock, TODO: need to add a cash dummy ticker
        if self.current_step - self.window_size >= 0:
            previous_prices = self.data[self.current_step - self.window_size, :, 1] # this will be used to calculate gain/loss for each step
        else:
            previous_prices = prices
        shares = self.portfolio / previous_prices
        self.balance = sum(shares * prices)

    def step(self, q_values):
        """
        q_values: [num_tickers] (raw output from the model)
        Converted to allocation weights using softmax
        """
        weights = torch.softmax(torch.tensor(q_values), dim=0).numpy()

        self._update_balance()
    
        # Apply balance: how much to spend on each stock
        self.portfolio = weights * self.balance  # how much $ to spend per stock
    
        # Update env state
        self.current_step += 1
    
        # Reward = change in total portfolio value
        portfolio_value = self.balance + np.sum(self.portfolio * self.data[self.current_step - 1, :, 3])
        reward = self.balance - self.previous_balance
    
        obs = self._get_observation()
        done = self.current_step >= len(self.data)
    
        return obs, reward, done, {}



In [8]:
class TransformerTradingQNet(nn.Module):
    def __init__(self, num_tickers, num_features, window_size, hidden_dim=128, num_heads=4, num_layers=2):
        super().__init__()

        self.num_tickers = num_tickers
        self.num_actions = 3  # Buy, Hold, Sell

        self.input_proj = nn.Linear(num_features * window_size, hidden_dim)
        self.stock_embeddings = nn.Embedding(num_tickers, hidden_dim)
        self.pos_encoder = PositionalEncoding(hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.q_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, self.num_actions)
        )

    def forward(self, x):
        # x shape: [batch, num_tickers, num_features, window_size]
        batch_size = x.size(0)

        x = x.view(batch_size, self.num_tickers, -1)  # flatten time series: [batch, tickers, features*window]
        x = self.input_proj(x)  # project into hidden space

        # Add learned stock embeddings
        stock_ids = torch.arange(self.num_tickers, device=x.device).unsqueeze(0).repeat(batch_size, 1)
        stock_embed = self.stock_embeddings(stock_ids)
        x = x + stock_embed  # [batch, tickers, hidden_dim]

        x = self.pos_encoder(x)

        # Transformer encoder
        x = self.transformer_encoder(x)  # [batch, tickers, hidden_dim]

        # Get Q-values per stock
        q_values = self.q_head(x)  # [batch, tickers, actions]
        return q_values

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = torch.exp(-torch.arange(0, d_model, 2).float() * (torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x


In [9]:
main_base.columns

Index(['timestamp', 'close', 'high', 'low', 'open', 'volume', 'ticker',
       'adj_close'],
      dtype='object')

In [10]:
FEATURE_LIST = ['Close', 'High', 'Low', 'Open', 'Volume']
N_tickers: int = main_base.ticker.nunique()
N_ts: int = main_base["timestamp"].nunique()
N_features: int = len(FEATURE_LIST)
WINDOW_SIZE = 40

In [13]:
%memit 
tf_qnet = TransformerTradingQNet(
    num_tickers=N_tickers + 1, 
    num_features=N_features * N_tickers, 
    window_size=WINDOW_SIZE,
    hidden_dim=128, 
    num_heads=4, 
    num_layers=2
)

peak memory: 781.70 MiB, increment: 0.09 MiB


In [15]:
tf_qnet.eval()
with torch.no_grad():
    %memit
    dummy_out = tf_qnet(torch.zeros(10, N_tickers + 1, N_features * N_tickers, WINDOW_SIZE))
    print(dummy_out.shape)

peak memory: 957.94 MiB, increment: 0.01 MiB
torch.Size([10, 1667, 3])


In [8]:
from collections import deque
import random

def train_dqn(env, num_episodes=100, batch_size=32, gamma=0.99, epsilon=1.0, epsilon_decay=0.995):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    model = DQN(*obs_shape, num_actions).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    memory = deque(maxlen=10000)

    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                q_values = model(state_tensor)
                action = torch.argmax(q_values).item() # action = [ticker_count + 1], each represent % of $ allocated on each stock.

            next_state, reward, done, _ = env.step(action)
            memory.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward

            if len(memory) >= batch_size:
                batch = random.sample(memory, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                states = torch.tensor(states, dtype=torch.float32).to(device)
                actions = torch.tensor(actions).unsqueeze(1).to(device)
                rewards = torch.tensor(rewards).to(device)
                next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
                dones = torch.tensor(dones).to(device)

                q_values = model(states).gather(1, actions)
                next_q_values = model(next_states).max(1)[0].detach()
                expected_q_values = rewards + gamma * next_q_values * (~dones)

                loss = F.mse_loss(q_values.squeeze(), expected_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        epsilon *= epsilon_decay
        print(f"Episode {episode+1}: Total Reward = {total_reward:.2f}, Epsilon = {epsilon:.3f}")
