In [1]:
import sys
sys.path.append("./src")

import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import deque

from Features import FeatureEngineer
from Environment import TradingEnvironment , random_policy

from tqdm import tqdm



In [2]:
gaz = pd.read_csv('data/Dutch TTF Natural Gas Futures - Données Historiques (1).csv')
fe_gaz = FeatureEngineer(gaz)
fe_gaz.apply_preprocessing()
gaz_df = fe_gaz.df

data = gaz_df["Dernier"]
len(data)

  self.df['Date']=pd.to_datetime(self.df['Date'])


1568

In [3]:
env = TradingEnvironment(data)

env.observation_space.shape[0]
env.action_space.n

env.index_loc
env.index_step
env.index_loc + env.index_step

300

In [4]:
backtest_history = env.run_backtest(random_policy)

history_df = pd.DataFrame(backtest_history)
history_df

Unnamed: 0,balance,price,position,action,new_balance,new_price,new_position,reward,done
0,10000.000,5.105,0,0,10000.000,5.840,0,10000.000,False
1,10000.000,5.840,0,0,10000.000,6.165,0,10000.000,False
2,10000.000,6.165,0,0,10000.000,5.865,0,10000.000,False
3,10000.000,5.865,0,0,10000.000,5.945,0,10000.000,False
4,10000.000,5.945,0,0,10000.000,6.095,0,10000.000,False
...,...,...,...,...,...,...,...,...,...
355,9646.523,84.015,13,0,9646.523,90.925,13,10828.548,False
356,9646.523,90.925,13,1,9555.598,93.985,14,10871.388,False
357,9555.598,93.985,14,1,9461.613,93.300,15,10861.113,False
358,9461.613,93.300,15,1,9368.313,87.772,16,10772.665,False


# Training

In [11]:
class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.memory, batch_size))
        return np.array(state), action, reward, np.array(next_state), done

    def __len__(self):
        return len(self.memory)
    

In [12]:
class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # x = F.softmax(self.fc3(x))
        x = self.fc3(x)
        return x

In [14]:
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
LR = 1e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_observations = env.observation_space.shape[0]
num_actions = env.action_space.n

state = env.reset()
n_observations = len(state)

policy_net = DQN(num_observations, num_actions).to(device)
target_net = DQN(num_observations, num_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
replay_memory = ReplayMemory(10000)

In [15]:
def epsilon_greedy_action(state, index_step):
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * index_step / EPS_DECAY)
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1).indices.view(1, 1)
            # torch.Size([1, 1])
    else:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
        # torch.Size([1, 1])

In [20]:
def train_dqn(num_episodes):

    policy_net.train()
    history = []

    for episode in tqdm(range(num_episodes)):

        state = torch.tensor(env.reset(), dtype=torch.float32, device=device).unsqueeze(0)
        # torch.Size([1, 3])
        rewards = []

        for t in range(env.max_steps):

            action = epsilon_greedy_action(state, t) # torch.Size([1, 1])
            next_state, reward, done, _ = env.step(action.item())
            next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0) # torch.Size([1, 3])
            reward = torch.tensor([reward], device=device)

            replay_memory.push(state, action, reward, next_state, done)

            state = next_state

            if len(replay_memory) > BATCH_SIZE:

                states, actions, rewards, next_states, dones = replay_memory.sample(BATCH_SIZE)

                states = torch.tensor(states, dtype=torch.float32)
                actions = torch.tensor(actions, dtype=torch.long)
                rewards = torch.tensor(rewards, dtype=torch.float32)
                next_states = torch.tensor(next_states, dtype=torch.float32)
                dones = torch.tensor(dones, dtype=torch.float32)

                states = states.squeeze(1) 
                # torch.Size([batch_size, num_actions])
                next_states = states.squeeze(1)
                # torch.Size([batch_size, num_actions])
                actions = actions.unsqueeze(1)
                # torch.Size([batch_size, 1])

                state_action_values = policy_net(states).gather(1, actions)
                # torch.Size([batch_size, 1])
                next_state_values = target_net(next_states).max(1)[0].detach()
                # torch.Size([batch_size, num_actions])
                expected_state_action_values = rewards + (1 - dones) * GAMMA * next_state_values

                criterion = nn.SmoothL1Loss()
                loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break
        
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        
        history.append(env.info)

        # print(f"Episode: {episode+1}, End Reward: {balance + price*pos}")

    return history
    
# Train the DQN
history = train_dqn(num_episodes=100)

100%|██████████| 100/100 [01:58<00:00,  1.19s/it]


In [25]:
df = pd.DataFrame(history)
df

Unnamed: 0,balance,price,position,action,new_balance,new_price,new_position,reward,done
0,8361.528,55.425,18,0,8361.528,55.156,18,9354.336,True
1,9500.435,11.420,26,-1,9511.855,11.860,25,9808.355,True
2,8937.367,99.425,18,0,8937.367,103.820,18,10806.127,True
3,9483.255,12.710,27,0,9483.255,12.245,27,9813.870,True
4,9810.737,79.075,13,0,9810.737,72.585,13,10754.342,True
...,...,...,...,...,...,...,...,...,...
95,9770.030,14.000,27,0,9770.030,14.240,27,10154.510,True
96,9787.320,19.285,22,0,9787.320,18.990,22,10205.100,True
97,9915.618,82.465,6,0,9915.618,83.400,6,10416.018,True
98,9600.580,50.835,13,0,9600.580,50.170,13,10252.790,True


In [24]:
df.reward.mean()

10125.073110000003

In [None]:
def eval_dqn(num_episodes):

    policy_net.eval()
    history = []

    for episode in tqdm(range(num_episodes)):

        state = torch.tensor(env.reset(), dtype=torch.float32, device=device).unsqueeze(0)
        rewards = []

        for t in range(env.max_steps):

            action = policy_net(state).max(1).indices.view(1, 1)
            next_state, reward, done, _ = env.step(action.item())
            next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0) # torch.Size([1, 3])
            reward = torch.tensor([reward], device=device)

            state = next_state

            if done:
                break
                
        history.append(env.info)

    return history
    
# Train the DQN
history = eval_dqn(num_episodes=100)