In [1]:
import torch
import torch.nn as nn

In [2]:
# Deep Q-Network (DQN) model
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers of width 64.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of values produced per input (one per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        # Attribute names are kept as fc1..fc3 so state_dict keys stay stable.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the raw (un-activated) output of the final linear layer."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [4]:
import torch.optim as optim
from collections import deque
import numpy as np
import random

# Deep Q-Network (DQN) agent
class DQNAgent:
    """Epsilon-greedy DQN agent with experience replay and a target network.

    Two ``DQN`` networks of identical shape are kept: ``policy_net`` is the one
    being trained, ``target_net`` is a copy used to compute bootstrap targets
    and is only refreshed by :meth:`update_target_network`.

    Args:
        input_dim: Number of features in a state vector.
        output_dim: Number of discrete actions.
        lr: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the next-state value.
        epsilon: Initial probability of choosing a random action.
        epsilon_min: Floor for epsilon; decay never goes below it.
        epsilon_decay: Factor epsilon is multiplied by after each training step.
        batch_size: Number of transitions sampled per training step.
        replay_buffer_size: Maximum number of stored transitions (oldest are
            discarded first).
    """

    def __init__(self, input_dim, output_dim, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, batch_size=64, replay_buffer_size=10000):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN(input_dim, output_dim).to(self.device)
        self.target_net = DQN(input_dim, output_dim).to(self.device)
        # Start both networks from identical weights; the target is never trained directly.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def select_action(self, state):
        """Return an action index in ``[0, output_dim)`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value from ``policy_net``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.output_dim)  # Explore: choose a random action

        # Exploit: choose the action with the highest predicted Q-value.
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
            q_values = self.policy_net(state)
            return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def _batch_tensor(self, values, dtype):
        """Stack a sequence of per-transition values into one tensor on ``self.device``."""
        # Going through a single ndarray avoids torch's slow element-by-element
        # path for sequences of ndarrays.
        return torch.as_tensor(np.asarray(values, dtype=dtype), device=self.device)

    def replay(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        After the update, epsilon is decayed by ``epsilon_decay`` (never below
        ``epsilon_min``).
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = self._batch_tensor(states, np.float32)
        actions = self._batch_tensor(actions, np.int64).unsqueeze(-1)
        rewards = self._batch_tensor(rewards, np.float32).unsqueeze(-1)
        next_states = self._batch_tensor(next_states, np.float32)
        # Stored as float (1.0 = terminal): subtracting a bool tensor from a
        # number raises RuntimeError in PyTorch.
        dones = self._batch_tensor(dones, np.float32).unsqueeze(-1)

        current_q_values = self.policy_net(states).gather(1, actions)
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(dim=1, keepdim=True)[0]
            # Terminal transitions contribute only their immediate reward.
            target_q_values = rewards + (1.0 - dones) * self.gamma * next_q_values

        loss = self.loss_fn(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        """Copy the current ``policy_net`` weights into ``target_net``."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

In [None]:
import pandas as pd

class TradingEnvironment:
    """Minimal episodic environment that steps through the rows of a CSV file.

    Each call to :meth:`step` advances ``num_days_to_invest`` rows. The
    buy/sell/hold branches and the reward are still placeholders, so every
    step currently yields a reward of 0.

    Args:
        csv_file: Anything accepted by ``pandas.read_csv`` (path or file-like).
    """

    INITIAL_PORTFOLIO_VALUE = 1000000
    INITIAL_PORTFOLIO_WEIGHTS = {'BTC': 0.4, 'ETH': 0.3, 'LTC': 0.3}

    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.current_index = 0
        self.done = False
        self.portfolio_value = self.INITIAL_PORTFOLIO_VALUE  # Initial portfolio value
        # Copy so per-instance changes never leak into the class-level default.
        self.portfolio_weights = dict(self.INITIAL_PORTFOLIO_WEIGHTS)
        self.num_days_to_invest = 30  # Number of rows advanced per step

    def reset(self):
        """Rewind to the first row, restore the portfolio, and return the initial state."""
        self.current_index = 0
        self.done = False
        self.portfolio_value = self.INITIAL_PORTFOLIO_VALUE
        self.portfolio_weights = dict(self.INITIAL_PORTFOLIO_WEIGHTS)
        return self.get_state()

    def step(self, action):
        """Apply ``action`` and advance one investment period.

        Args:
            action: ``'buy'``, ``'sell'``, or anything else (treated as hold).

        Returns:
            ``(state, reward, done, info)``. Once the episode has finished,
            further calls return ``(None, 0, True, None)``.
        """
        if self.done:
            return None, 0, True, None  # Return None for state and info when episode is done

        # The episode ends when the next jump would reach or pass the last row.
        if self.current_index + self.num_days_to_invest >= len(self.data):
            self.done = True

        # Execute action (buy, sell, hold).
        # NOTE(review): DQNAgent.select_action returns an integer index, which
        # matches neither string below and therefore falls into the hold branch.
        if action == 'buy':
            # Implement buying logic based on portfolio weights and available cash
            # Adjust portfolio value and weights accordingly
            pass
        elif action == 'sell':
            # Implement selling logic based on current portfolio holdings
            # Adjust portfolio value and weights accordingly
            pass
        else:  # action == 'hold'
            pass

        # Move to the next time step, clamped to the last row so the terminal
        # observation is still a valid row instead of an out-of-bounds lookup.
        self.current_index = min(
            self.current_index + self.num_days_to_invest,
            len(self.data) - 1,
        )

        state = self.get_state()  # Get the new state
        reward = self.calculate_reward()  # Calculate reward
        done = self.done  # Check if episode is done
        info = {}  # Additional information (optional)

        return state, reward, done, info

    def get_state(self):
        """Return the row at ``current_index`` as a NumPy array (all CSV columns)."""
        return self.data.iloc[self.current_index].values

    def calculate_reward(self):
        """Return the reward for the current step (placeholder: always 0)."""
        # Implement reward calculation based on trading performance
        return 0  # Placeholder for now

# Example usage: run one episode, issuing the same action at every step.
env = TradingEnvironment('your_data.csv')
state, done = env.reset(), False
while not done:
    # Example action, replace with your own action selection logic
    action = 'buy'
    next_state, reward, done, _ = env.step(action)
    state = next_state

In [None]:
# Hyperparameters

# Network dimensions
input_dim = 4   # Size of the state space (OHLC data)
output_dim = 3  # Size of the action space (buy, sell, hold)

# Optimisation
lr = 1e-3     # Learning rate
gamma = 0.99  # Discount factor

# Epsilon-greedy exploration schedule
epsilon = 1.0          # Initial exploration rate
epsilon_min = 0.01     # Minimum exploration rate
epsilon_decay = 0.995  # Exploration decay rate

# Experience replay
batch_size = 64              # Mini-batch size for replay buffer
replay_buffer_size = 10_000  # Size of the replay buffer

In [None]:
# Build the environment and the agent used by the training loop.
# `Environment` is not defined anywhere in this notebook; the environment class
# is `TradingEnvironment`, which loads its data from a CSV file.
# Placeholder path — point it at your own data. `input_dim` must equal the
# number of columns in that file, because the environment returns the full row
# as the state.
ohlc_data = 'your_data.csv'
env = TradingEnvironment(ohlc_data)
agent = DQNAgent(
    input_dim,
    output_dim,
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_min=epsilon_min,
    epsilon_decay=epsilon_decay,
    batch_size=batch_size,
    replay_buffer_size=replay_buffer_size,
)

In [None]:
# Training loop: each episode is one pass through the environment.
num_episodes = 1000
for episode in range(num_episodes):
    state, done, total_reward = env.reset(), False, 0
    while not done:
        # Act, observe the outcome, store it, then learn from a sampled minibatch.
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.replay()
        total_reward += reward
        state = next_state

    # Refresh the target network once per episode.
    agent.update_target_network()
    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# After training, you can use the trained DQN agent for inference
# Use agent.select_action(state) to select actions based on the learned Q-values
# Remember to set the agent in evaluation mode (agent.policy_net.eval()) when using it for inference