In [1]:
from typing import *
import yfinance as yf
import pandas as pd
import datetime
import time
import torch
from torch import nn

In [2]:
# Load the base price table from a local parquet file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# will only run on the original author's machine — consider a configurable data directory.
main_base: pd.DataFrame = pd.read_parquet(r"C:\Users\Ran\dev\vibe-trader\vibe_trader\notebooks\main_base_2000.parquet")
# Preview the first rows of the loaded frame.
main_base.head()

Unnamed: 0,Close,High,Low,Open,Volume,ticker,Adj Close
2025-03-10 15:30:00+00:00,10.03,10.03,10.03,10.03,0.0,AACBU,
2025-03-12 16:30:00+00:00,10.04,10.04,10.04,10.04,24993.0,AACBU,
2025-03-12 17:30:00+00:00,10.1,10.1,10.04,10.04,405445.0,AACBU,
2025-03-13 13:30:00+00:00,10.0401,10.0479,10.04,10.0422,0.0,AACBU,
2025-03-13 19:30:00+00:00,10.06,10.06,10.06,10.06,0.0,AACBU,


In [3]:
def reset_index_as_ts(df: pd.DataFrame) -> pd.DataFrame:
    """Move the frame's index into a regular column named ``timestamp``.

    Args:
        df: Frame whose index should become a column. It is not modified.

    Returns:
        A new frame with a fresh integer index and the former index exposed
        as the ``timestamp`` column.
    """
    # ``reset_index`` names the new column after the index; it only falls back
    # to "index" when the index is unnamed. Look the name up so a named index
    # (e.g. "Datetime") is renamed too instead of being silently left as-is.
    index_col = df.index.name if df.index.name is not None else "index"
    return df.reset_index().rename(columns={index_col: "timestamp"})


def standardize_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with lower-cased, underscore-separated column names.

    Each column label is lower-cased and its spaces are replaced by ``_``
    (e.g. ``"Adj Close"`` becomes ``"adj_close"``).

    Args:
        df: Frame with string column labels. It is not modified.

    Returns:
        A new frame with the normalized column labels.
    """
    # Build the mapping up front and rename on a new frame: assigning to
    # ``df.columns`` would rename the caller's frame in place, which makes the
    # function unsafe to re-run or to use inside a ``.pipe`` chain.
    renamed = {c: c.lower().replace(" ", "_") for c in df.columns}
    return df.rename(columns=renamed)
        

In [4]:
# Chain the two helpers: expose the index as a column, then normalize column names.
# The result is only displayed here; it is not assigned to a variable.
main_base.pipe(reset_index_as_ts).pipe(standardize_col_names)

Unnamed: 0,timestamp,close,high,low,open,volume,ticker,adj_close
0,2025-03-10 15:30:00+00:00,10.0300,10.0300,10.0300,10.0300,0.0,AACBU,
1,2025-03-12 16:30:00+00:00,10.0400,10.0400,10.0400,10.0400,24993.0,AACBU,
2,2025-03-12 17:30:00+00:00,10.1000,10.1000,10.0400,10.0400,405445.0,AACBU,
3,2025-03-13 13:30:00+00:00,10.0401,10.0479,10.0400,10.0422,0.0,AACBU,
4,2025-03-13 19:30:00+00:00,10.0600,10.0600,10.0600,10.0600,0.0,AACBU,
...,...,...,...,...,...,...,...,...
2410397,2025-04-04 15:30:00+00:00,3.5200,3.8200,3.5000,3.8200,95246.0,LHSW,
2410398,2025-04-04 16:30:00+00:00,3.2008,3.6000,3.2000,3.5280,122228.0,LHSW,
2410399,2025-04-04 17:30:00+00:00,3.0327,3.2084,2.8500,3.2084,297333.0,LHSW,
2410400,2025-04-04 18:30:00+00:00,2.9600,3.3000,2.9421,3.0500,37176.0,LHSW,


In [5]:
# State: balance, stocks held in the portfolio, all past close/high/low/open/volume values for every ticker, and timestamps.

In [6]:
import gymnasium as gym
import numpy as np

class TradingEnv(gym.Env):
    """One-share-per-step trading environment over a fixed price array.

    ``data`` is indexed as ``[step, ticker, feature]``. An action is a single
    integer that encodes both a ticker (``action // 3``) and an operation
    (``action % 3``: 0 = buy, 1 = hold, 2 = sell). At most one share is traded
    per step, at the value stored in feature column ``price_index`` of the
    current step.

    NOTE: ``reset`` returns only the observation and ``step`` returns the
    4-tuple ``(obs, reward, done, info)``. This differs from the Gymnasium
    convention (``reset -> (obs, info)``, ``step -> 5-tuple``) and is kept so
    that existing callers unpacking these shapes keep working.

    Args:
        data: Array with shape ``[num_steps, num_tickers, features]``.
        initial_balance: Cash available at the start of every episode.
        max_stocks: A buy is rejected once this many tickers have a non-zero
            holding. NOTE(review): the check counts distinct tickers, so it
            also blocks adding to an already-held ticker at the limit —
            confirm this is intended.
        window_size: Number of past steps included in each observation.
        price_index: Feature column used as the trade/valuation price.
            Defaults to 3, the column the original code treated as the close
            price — verify against the feature ordering of ``data``.

    Raises:
        ValueError: If ``data`` is not 3-dimensional, has too few steps to
            take a single step after the initial window, or ``price_index``
            is outside the feature axis.
    """

    def __init__(self, data, initial_balance=10000, max_stocks=10, window_size=10, price_index=3):
        super().__init__()
        if len(data.shape) != 3:
            raise ValueError(
                f"data must have shape [num_steps, num_tickers, features], got {tuple(data.shape)}"
            )
        if data.shape[0] <= window_size:
            raise ValueError(
                f"data has {data.shape[0]} steps but window_size={window_size} "
                "requires at least window_size + 1"
            )
        if not 0 <= price_index < data.shape[2]:
            raise ValueError(
                f"price_index={price_index} is out of range for {data.shape[2]} features"
            )

        self.data = data  # Shape: [num_steps, num_tickers, features]
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.max_stocks = max_stocks
        self.price_index = price_index
        self.current_step = window_size
        self.num_tickers = data.shape[1]

        # One discrete action per (ticker, operation) pair: buy, hold, sell.
        self.action_space = gym.spaces.Discrete(self.num_tickers * 3)
        obs_shape = (self.num_tickers, data.shape[2], window_size)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)

        self.reset()

    def reset(self):
        """Restore the starting balance and empty portfolio; return the first observation."""
        self.balance = self.initial_balance
        self.portfolio = np.zeros(self.num_tickers)
        # Start after the first full window so an observation is always available.
        self.current_step = self.window_size
        return self._get_observation()

    def _get_observation(self):
        """Return the last ``window_size`` steps as a float32 ``[tickers, features, window]`` array."""
        window_data = self.data[self.current_step - self.window_size:self.current_step]
        obs = np.transpose(window_data, (1, 2, 0))  # [tickers, features, window]
        return obs.astype(np.float32)

    def step(self, action):
        """Apply one buy/hold/sell action and advance the clock by one step.

        Args:
            action: Integer in ``[0, num_tickers * 3)``.

        Returns:
            ``(obs, reward, done, info)`` where ``reward`` is the total
            portfolio value (cash plus holdings) minus ``initial_balance``,
            i.e. cumulative profit so far rather than a per-step change,
            ``done`` is True once the last row of ``data`` has been consumed,
            and ``info`` is an empty dict.
        """
        ticker = action // 3
        action_type = action % 3  # 0: buy, 1: hold, 2: sell

        price = self.data[self.current_step][ticker][self.price_index]
        # A NaN/inf price would turn the balance into NaN for the rest of the
        # episode, so a trade at a non-finite price is treated as a hold.
        can_trade = bool(np.isfinite(price))

        if can_trade and action_type == 0:  # buy
            has_cash = self.balance >= price
            has_room = np.count_nonzero(self.portfolio) < self.max_stocks
            if has_cash and has_room:
                self.portfolio[ticker] += 1
                self.balance -= price
        elif can_trade and action_type == 2:  # sell
            if self.portfolio[ticker] > 0:
                self.portfolio[ticker] -= 1
                self.balance += price

        self.current_step += 1
        done = self.current_step >= len(self.data)

        # Value holdings at the prices of the step that was just traded.
        # Only held tickers are included: ``0 * nan`` is ``nan``, so a missing
        # price on a ticker that is not held must not contaminate the reward.
        prices = self.data[self.current_step - 1, :, self.price_index]
        held = self.portfolio > 0
        portfolio_value = self.balance + np.sum(self.portfolio[held] * prices[held])
        reward = portfolio_value - self.initial_balance  # reward = profit
        obs = self._get_observation()

        return obs, reward, done, {}


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Small convolutional Q-network over a ``[tickers, features, window]`` observation.

    The feature axis is used as the convolution channel axis, so a single
    unpadded 3x3 convolution slides over the (tickers, window) plane. The
    flattened result feeds a two-layer MLP that emits one Q-value per action.

    Args:
        num_tickers: Size of the ticker axis of the observation.
        num_features: Size of the feature axis (convolution input channels).
        window_size: Size of the time-window axis of the observation.
        num_actions: Number of Q-values produced per sample.
    """

    def __init__(self, num_tickers, num_features, window_size, num_actions):
        super().__init__()

        feature_layers = [
            nn.Conv2d(num_features, 32, kernel_size=3),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # Without padding, a 3x3 kernel shrinks each spatial axis by 2.
        flat_dim = (num_tickers - 2) * (window_size - 2) * 32

        head_layers = [
            nn.Linear(flat_dim, 256),
            nn.ReLU(),
            nn.Linear(256, num_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a ``[batch, tickers, features, window]`` tensor to ``[batch, num_actions]`` Q-values."""
        # Conv2d expects channels second: [batch, features, tickers, window].
        channels_first = x.permute(0, 2, 1, 3)
        flattened = self.conv(channels_first)
        return self.fc(flattened)


In [8]:
from collections import deque
import random

def train_dqn(env, num_episodes=100, batch_size=32, gamma=0.99, epsilon=1.0, epsilon_decay=0.995,
              lr=1e-4, memory_size=10000):
    """Train a ``DQN`` on ``env`` with epsilon-greedy exploration and experience replay.

    The same network is used for both the current and the bootstrapped target
    Q-values (there is no separate target network).

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n`` / ``action_space.sample()``, a ``reset()`` that
            returns an observation, and a ``step(action)`` that returns
            ``(obs, reward, done, info)``.
        num_episodes: Number of full episodes to run.
        batch_size: Replay samples per optimisation step; training starts once
            the buffer holds at least this many transitions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after every
            episode.
        lr: Adam learning rate.
        memory_size: Maximum number of transitions kept in the replay buffer.

    Returns:
        The trained ``DQN`` model (on the device used for training).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    model = DQN(*obs_shape, num_actions).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    memory = deque(maxlen=memory_size)

    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                # Action selection needs no gradients; skip building the graph.
                with torch.no_grad():
                    state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                    q_values = model(state_tensor)
                action = torch.argmax(q_values).item()

            next_state, reward, done, _ = env.step(action)
            # Store plain Python scalars so batches convert to tensors with
            # predictable dtypes regardless of what the environment returned.
            memory.append((state, int(action), float(reward), next_state, bool(done)))
            state = next_state
            total_reward += reward

            if len(memory) >= batch_size:
                batch = random.sample(memory, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Stack into one ndarray first: building a tensor from a tuple
                # of arrays converts element by element and is very slow.
                states = torch.as_tensor(np.stack(states), dtype=torch.float32, device=device)
                actions = torch.as_tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
                # Explicit float32 so the target matches the network output
                # dtype instead of whatever is inferred from the raw rewards.
                rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
                next_states = torch.as_tensor(np.stack(next_states), dtype=torch.float32, device=device)
                dones = torch.as_tensor(dones, dtype=torch.bool, device=device)

                # squeeze(1) rather than squeeze(): keeps the batch axis even
                # when batch_size == 1.
                q_values = model(states).gather(1, actions).squeeze(1)
                with torch.no_grad():
                    next_q_values = model(next_states).max(1)[0]
                # Terminal transitions contribute no bootstrapped value.
                expected_q_values = rewards + gamma * next_q_values * (~dones)

                loss = F.mse_loss(q_values, expected_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        epsilon *= epsilon_decay
        print(f"Episode {episode+1}: Total Reward = {total_reward:.2f}, Epsilon = {epsilon:.3f}")

    # Hand the trained network back; previously it was discarded on return.
    return model
