<pre>
░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
░░░░░░░░░░▄▄█▀▀▀▀▀█▄▄░░░░░░░░░░
░░░░░░░░▄█▀░░▄░▄░░░░▀█▄░░░░░░░░
░░░░░░░░█░░░▀█▀▀▀▀▄░░░█░░░░░░░░
░░░░░░░░█░░░░█▄▄▄▄▀░░░█░░░░░░░░
░░░░░░░░█░░░░█░░░░█░░░█░░░░░░░░
░░░░░░░░▀█▄░▀▀█▀█▀░░▄█▀░░░░░░░░
░░░░░░░░░░▀▀█▄▄▄▄▄█▀▀░░░░░░░░░░
░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
</pre>

<h1>
Value-Based Reinforcement Learning <br>
</h1>
By Alin Cijov

In [None]:
import numpy as np
import pandas as pd
import random
import copy
import time

import plotly
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import iplot

import torch
from torch import optim
from torch import nn

from collections import namedtuple

# Prepare Data

In [None]:
# Load BTC.csv into a DataFrame and preview its first rows.
df = pd.read_csv('../input/binance-top-cryptocurrencies/BTC.csv')
df.head()

In [None]:
# Parse the 'date' column into datetimes and promote it to the index in one chain.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [None]:
# Preview the frame again now that 'date' is the index.
df.head()

In [None]:
# Order rows chronologically (oldest first) so the positional split below is a time split.
df = df.sort_values(by='date', ascending=True)

In [None]:
# First 70% of the rows go to training, the remainder to testing.
split_limit = int(0.7 * len(df))

df_train, df_test = df.iloc[:split_limit], df.iloc[split_limit:]

In [None]:
def plot_train_test(train, test, date_split):
    """Draw train and test candlesticks in one figure, split by a vertical line.

    Args:
        train: frame with 'open', 'high', 'low' and 'close' columns; its index
            is used as the x axis.
        test: frame with the same columns, drawn as a second trace.
        date_split: x position of the vertical separator and of the two labels.
    """
    def candles(frame, label):
        # One candlestick trace per data set.
        return Candlestick(x=frame.index, open=frame['open'], high=frame['high'],
                           low=frame['low'], close=frame['close'], name=label)

    # Full-height black line at the split position ('paper' = relative y coords).
    split_line = {'x0': date_split, 'x1': date_split, 'y0': 0, 'y1': 1, 'xref': 'x', 'yref': 'paper',
                  'line': {'color': 'rgb(0,0,0)', 'width': 1}}

    # Both labels sit at the top of the split line, one on each side of it.
    label_base = {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False}
    labels = [dict(label_base, xanchor='left', text=' test data'),
              dict(label_base, xanchor='right', text='train data ')]

    traces = [candles(train, 'train'), candles(test, 'test')]
    figure = Figure(data=traces, layout={'shapes': [split_line], 'annotations': labels})
    iplot(figure)

In [None]:
# The split marker is the index label of the last training row.
plot_train_test(df_train, df_test, df_train.index[-1])

# Environment

In [None]:
class Environment:
    """Step-by-step trading environment over a frame with a 'close' column.

    The observation is a float32 tensor of length ``time + 1``: the unrealised
    value of all open positions followed by the last ``time`` close-to-close
    differences (zero-padded at the start of an episode).

    Args:
        data: frame indexed by row position through ``.iloc``; only the
            'close' column is read.
        time: number of past price differences kept in the observation.
    """

    def __init__(self, data, time=90):
        self.data = data
        self.time = time
        self.actions = {"stay": 0, "buy": 1, "sell": 2}
        self.reset()

    def _close(self, t):
        """Return the close value of row ``t`` as a Python float."""
        # Column-then-row lookup avoids materialising a whole row Series per call.
        return float(self.data['close'].iloc[t])

    def _observation(self):
        """Build the observation tensor from the current state."""
        return torch.tensor([self.position_value] + self.history).type(torch.float32)

    def reset(self):
        """Rewind to row 0, clear positions/profits and return the first observation."""
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.time)]
        return self._observation()

    def next(self):
        """Advance one row and refresh position value, history and ``done``."""
        self.t += 1
        price = self._close(self.t)
        # Unrealised profit of every open position at the new price.
        self.position_value = sum(price - p for p in self.positions)
        # Slide the fixed-length window of close-to-close differences.
        self.history.pop(0)
        self.history.append(price - self._close(self.t - 1))
        # The episode ends on the last row; a further step would index past the data.
        self.done = self.t >= len(self.data) - 1

    def step(self, act):
        """Apply ``act`` at the current row, then advance one row.

        Args:
            act: one of the values in ``self.actions`` (0 stay, 1 buy, 2 sell).

        Returns:
            Tuple ``(obs, reward, done)``. ``reward`` is 1 only when a sell
            realises a strictly positive profit and -1 in every other case
            (including stay and buy).
        """
        reward = 0

        if act == self.actions["buy"]:
            self.positions.append(self._close(self.t))
        elif act == self.actions["sell"]:
            if not self.positions:
                # Selling with nothing to sell is penalised.
                reward -= 1
            else:
                price = self._close(self.t)
                # Realise every open position once; the total is added a single
                # time to both the step reward and the running profit.
                profits = sum(price - p for p in self.positions)
                reward += profits
                self.profits += profits
                self.positions = []

        self.next()
        # Clip the reward to +/-1 (zero maps to -1).
        reward = 1 if reward > 0 else -1

        return self._observation(), reward, self.done

In [None]:
# Training environment over the training split, with the default history length.
env = Environment(df_train)

# Replay Memory

In [None]:
# One stored step: previous observation, action taken, reward, next observation, done flag.
Transition = namedtuple('Transition',
                        ('pobs', 'pact', 'reward', 'obs', 'done'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    Once ``capacity`` transitions are stored, each new push overwrites the
    oldest entry.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the five ``Transition`` fields, in order.

        Raises:
            TypeError: if the arguments do not match the ``Transition`` fields;
                the buffer is left untouched in that case.
        """
        # Build the record first so a bad call cannot leave a None placeholder behind.
        entry = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(entry)
        else:
            self.memory[self.position] = entry
        # Wrap around so the next write lands on the oldest slot.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# Deep Q-Networks (DQN)

In [None]:
class QNetwork(nn.Module):
    """Five-layer fully connected network with ReLU between layers.

    Layer widths are input_size -> hidden_size -> 2*hidden_size ->
    hidden_size -> hidden_size // 2 -> output_size.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        wide = hidden_size * 2
        narrow = hidden_size // 2
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, wide)
        self.l3 = nn.Linear(wide, hidden_size)
        self.l4 = nn.Linear(hidden_size, narrow)
        self.l5 = nn.Linear(narrow, output_size)
        self.r = nn.ReLU()

    def forward(self, x):
        """Map a batch of inputs to one output per row and output unit.

        The last layer is linear (no activation), so outputs are unbounded.
        """
        for hidden in (self.l1, self.l2, self.l3, self.l4):
            x = self.r(hidden(x))
        return self.l5(x)

In [None]:
# Online network: input is the position value plus env.time history slots; three outputs.
Q = QNetwork(env.time + 1, 256, 3)
# Target network starts as an independent copy of the online network.
Q_ast = copy.deepcopy(Q)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=Q.parameters(), lr=0.005)

In [None]:
# --- DQN training hyperparameters ---
epoch_num = 50                 # number of passes over the training data
step_max = len(env.data)-1     # an episode can take at most one step per row transition
memory_size = 200              # replay buffer capacity; training starts once it is full
batch_size = 32
epsilon = 1.0                  # initial exploration probability
epsilon_decrease = 0.01
epsilon_min = 0.1
start_reduce_epsilon = 10      # global step after which epsilon starts decaying
train_freq = 10                # optimise every train_freq global steps
update_q_freq = 10             # refresh the target network every update_q_freq global steps
gamma = 0.97                   # discount factor
show_log_freq = 10             # print a summary every show_log_freq epochs

memory = ReplayMemory(memory_size)
total_step = 0
total_rewards = []
total_losses = []

start = time.time()
for epoch in range(epoch_num):
    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0

    while not done and step < step_max:

        # select act (epsilon-greedy); no gradient is needed to pick an action
        if np.random.rand() > epsilon:
            with torch.no_grad():
                pact = torch.argmax(Q(pobs.reshape(1, -1)))
        else:
            pact = torch.randint(0, len(env.actions), (1,))

        # act
        obs, reward, done = env.step(pact)

        # add memory
        memory.push(pobs, pact, reward, obs, done)

        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                batch = memory.sample(batch_size)

                # Stack the sampled transitions; shapes follow the stored
                # observations instead of a hard-coded width.
                b_pobs = torch.stack([b.pobs for b in batch])
                b_pact = torch.tensor([int(b.pact) for b in batch], dtype=torch.long).unsqueeze(1)
                b_reward = torch.tensor([float(b.reward) for b in batch]).unsqueeze(1)
                b_obs = torch.stack([b.obs for b in batch])
                b_done = torch.tensor([bool(b.done) for b in batch]).unsqueeze(1)

                q = Q(b_pobs)
                with torch.no_grad():
                    # Bootstrapped value of the next state from the target network;
                    # terminal transitions contribute no future value.
                    next_max = Q_ast(b_obs).max(dim=1, keepdim=True).values
                    td_target = b_reward + gamma * next_max * (~b_done)
                    # Only the taken action's entry differs from the prediction,
                    # so the other actions contribute zero loss.
                    target = q.detach().clone()
                    target.scatter_(1, b_pact, td_target)

                optimizer.zero_grad()

                loss = loss_fn(q, target)
                # .item() keeps the running total a plain float
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)

        # epsilon: linear decay, clamped so it never drops below epsilon_min
        if epsilon > epsilon_min and total_step > start_reduce_epsilon:
            epsilon = max(epsilon_min, epsilon - epsilon_decrease)

        # next step
        total_reward += reward
        pobs = obs
        step += 1
        total_step += 1

    total_rewards.append(total_reward)
    total_losses.append(total_loss)

    if (epoch+1) % show_log_freq == 0:
        # Averages over the last show_log_freq epochs.
        log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
        log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
        elapsed_time = time.time()-start
        print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
        start = time.time()

In [None]:
def plot_loss_reward(total_losses, total_rewards):
    """Plot per-epoch loss (left panel) and reward (right panel) side by side.

    Args:
        total_losses: sequence of loss totals, one per epoch.
        total_rewards: sequence of reward totals, one per epoch.
    """
    fig = plotly.subplots.make_subplots(rows=1, cols=2, subplot_titles=('loss', 'reward'), print_grid=False)

    # (series, line colour, subplot column)
    panels = (
        (total_losses, 'skyblue', 1),
        (total_rewards, 'orange', 2),
    )
    for series, colour, column in panels:
        fig.append_trace(Scatter(y=series, mode='lines', line=dict(color=colour)), 1, column)

    layout = fig['layout']
    for axis in ('xaxis1', 'xaxis2'):
        layout[axis].update(title='epoch')
    layout.update(height=400, width=900, showlegend=False)
    iplot(fig)

In [None]:
# Training curves of the DQN run above.
plot_loss_reward(total_losses, total_rewards)

In [None]:
def _greedy_rollout(env, Q):
    """Run one greedy episode of ``Q`` on ``env``.

    Returns:
        Tuple ``(acts, rewards, profits)`` where ``acts`` and ``rewards`` have
        ``len(env.data) - 1`` entries and ``profits`` is ``env.profits`` after
        the last step. Actions are plain ints.
    """
    obs = env.reset()
    acts, rewards = [], []
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        for _ in range(len(env.data) - 1):
            act = int(torch.argmax(Q(obs.reshape(1, -1))))
            obs, reward, _done = env.step(act)
            acts.append(act)
            rewards.append(reward)
    return acts, rewards, env.profits


def plot_train_test_by_q(train_env, test_env, Q, algorithm_name, date_split):
    """Replay ``Q`` greedily on both environments and plot candles coloured by action.

    Args:
        train_env: environment over the training data (it is reset and stepped).
        test_env: environment over the test data (it is reset and stepped).
        Q: network mapping an observation batch to one score per action.
        algorithm_name: label placed at the start of the figure title.
        date_split: x position of the vertical train/test separator.
    """
    train_acts, train_rewards, train_profits = _greedy_rollout(train_env, Q)
    test_acts, test_rewards, test_profits = _greedy_rollout(test_env, Q)

    # plot
    # An episode has one step fewer than there are rows, so the last row gets NaN.
    train_copy = train_env.data.copy()
    test_copy = test_env.data.copy()
    train_copy['act'] = train_acts + [np.nan]
    train_copy['reward'] = train_rewards + [np.nan]
    test_copy['act'] = test_acts + [np.nan]
    test_copy['reward'] = test_rewards + [np.nan]

    # action value -> candle colour (0 stay, 1 buy, 2 sell per Environment.actions)
    act_colors = {0: 'gray', 1: 'cyan', 2: 'magenta'}

    # One trace per (data set, action), train traces first.
    data = []
    for frame in (train_copy, test_copy):
        for act, color in act_colors.items():
            rows = frame[frame['act'] == act]
            data.append(Candlestick(x=rows.index, open=rows['open'], high=rows['high'], low=rows['low'], close=rows['close'],
                                    increasing=dict(line=dict(color=color)), decreasing=dict(line=dict(color=color))))

    title = '{}: train s-reward {}, profits {}, test s-reward {}, profits {}'.format(
        algorithm_name,
        int(sum(train_rewards)),
        int(train_profits),
        int(sum(test_rewards)),
        int(test_profits)
    )
    layout = {
        'title': title,
        'showlegend': False,
        'shapes': [
            {'x0': date_split, 'x1': date_split, 'y0': 0, 'y1': 1, 'xref': 'x', 'yref': 'paper', 'line': {'color': 'rgb(0,0,0)', 'width': 1}}
        ],
        'annotations': [
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'left', 'text': ' test data'},
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'right', 'text': 'train data '}
        ]
    }
    figure = Figure(data=data, layout=layout)
    iplot(figure)

In [None]:
# Evaluate the DQN policy on fresh environments; the split marker is the last training index label.
plot_train_test_by_q(Environment(df_train), Environment(df_test), Q, 'DQN', df_train.index[-1])

# Double Deep Q-Networks (DDQN)

In [None]:
# Fresh online network for the DDQN run (hidden width 100).
Q = QNetwork(env.time + 1, 100, 3)
# Target network starts as an independent copy of the online network.
Q_ast = copy.deepcopy(Q)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=Q.parameters(), lr=0.005)

In [None]:
# --- DDQN training hyperparameters ---
epoch_num = 100                # number of passes over the training data
step_max = len(env.data)-1     # an episode can take at most one step per row transition
memory_size = 200              # replay buffer capacity; training starts once it is full
batch_size = 32
epsilon = 1.0                  # initial exploration probability
epsilon_decrease = 0.01
epsilon_min = 0.1
start_reduce_epsilon = 200     # global step after which epsilon starts decaying
train_freq = 10                # optimise every train_freq global steps
update_q_freq = 20             # refresh the target network every update_q_freq global steps
gamma = 0.97                   # discount factor
show_log_freq = 10             # print a summary every show_log_freq epochs

memory = ReplayMemory(memory_size)
total_step = 0
total_rewards = []
total_losses = []

start = time.time()
for epoch in range(epoch_num):
    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0

    while not done and step < step_max:

        # select act (epsilon-greedy); no gradient is needed to pick an action
        if np.random.rand() > epsilon:
            with torch.no_grad():
                pact = torch.argmax(Q(pobs.reshape(1, -1)))
        else:
            pact = torch.randint(0, len(env.actions), (1,))

        # act
        obs, reward, done = env.step(pact)

        # add memory
        memory.push(pobs, pact, reward, obs, done)

        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                batch = memory.sample(batch_size)

                # Stack the sampled transitions; shapes follow the stored
                # observations instead of a hard-coded width.
                b_pobs = torch.stack([b.pobs for b in batch])
                b_pact = torch.tensor([int(b.pact) for b in batch], dtype=torch.long).unsqueeze(1)
                b_reward = torch.tensor([float(b.reward) for b in batch]).unsqueeze(1)
                b_obs = torch.stack([b.obs for b in batch])
                b_done = torch.tensor([bool(b.done) for b in batch]).unsqueeze(1)

                q = Q(b_pobs)
                with torch.no_grad():
                    # Double DQN: the online network chooses the best action in
                    # the NEXT state and the target network evaluates it.
                    best_next = Q(b_obs).argmax(dim=1, keepdim=True)
                    next_q = Q_ast(b_obs).gather(1, best_next)
                    # Terminal transitions contribute no future value.
                    td_target = b_reward + gamma * next_q * (~b_done)
                    # Only the taken action's entry differs from the prediction,
                    # so the other actions contribute zero loss.
                    target = q.detach().clone()
                    target.scatter_(1, b_pact, td_target)

                optimizer.zero_grad()

                loss = loss_fn(q, target)
                # .item() keeps the running total a plain float
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)

        # epsilon: linear decay, clamped so it never drops below epsilon_min
        if epsilon > epsilon_min and total_step > start_reduce_epsilon:
            epsilon = max(epsilon_min, epsilon - epsilon_decrease)

        # next step
        total_reward += reward
        pobs = obs
        step += 1
        total_step += 1

    total_rewards.append(total_reward)
    total_losses.append(total_loss)

    if (epoch+1) % show_log_freq == 0:
        # Averages over the last show_log_freq epochs.
        log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
        log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
        elapsed_time = time.time()-start
        print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
        start = time.time()

In [None]:
# Training curves of the DDQN run above.
plot_loss_reward(total_losses, total_rewards)

In [None]:
# Evaluate the DDQN policy on fresh environments; the split marker is the last training index label.
plot_train_test_by_q(Environment(df_train), Environment(df_test), Q, 'DDQN', df_train.index[-1])

# Dueling DDQN

In [None]:
class QNetwork(nn.Module):
    """Dueling network: a shared layer feeding separate value and advantage heads.

    The output is ``value + advantage - mean(advantage)``, with the mean taken
    over the action dimension.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        half = hidden_size // 2

        # Shared trunk followed by one hidden layer per stream.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc_value = nn.Linear(hidden_size, half)
        self.fc_adv = nn.Linear(hidden_size, half)
        self.relu = nn.ReLU()

        # Stream heads: one scalar value, one advantage per output unit.
        self.value = nn.Linear(half, 1)
        self.adv = nn.Linear(half, output_size)

    def forward(self, state):
        """Return the combined score per row and output unit for a 2-D input batch."""
        shared = self.relu(self.fc1(state))

        state_value = self.value(self.relu(self.fc_value(shared)))
        advantage = self.adv(self.relu(self.fc_adv(shared)))

        # Centre the advantages per row before adding the state value.
        return state_value + advantage - advantage.mean(dim=1, keepdim=True)

In [None]:
# Fresh online network for the Dueling DDQN run (hidden width 64).
Q = QNetwork(env.time + 1, 64, 3)
# Target network starts as an independent copy of the online network.
Q_ast = copy.deepcopy(Q)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=Q.parameters(), lr=0.001)

In [None]:
# --- Dueling DDQN training hyperparameters ---
epoch_num = 100                # number of passes over the training data
step_max = len(env.data)-1     # an episode can take at most one step per row transition
memory_size = 200              # replay buffer capacity; training starts once it is full
batch_size = 8
epsilon = 1.0                  # initial exploration probability
epsilon_decrease = 0.01
epsilon_min = 0.1
start_reduce_epsilon = 10      # global step after which epsilon starts decaying
train_freq = 10                # optimise every train_freq global steps
update_q_freq = 20             # refresh the target network every update_q_freq global steps
gamma = 0.97                   # discount factor
show_log_freq = 10             # print a summary every show_log_freq epochs

memory = ReplayMemory(memory_size)
total_step = 0
total_rewards = []
total_losses = []

start = time.time()
for epoch in range(epoch_num):
    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0

    while not done and step < step_max:

        # select act (epsilon-greedy); no gradient is needed to pick an action
        if np.random.rand() > epsilon:
            with torch.no_grad():
                pact = torch.argmax(Q(pobs.reshape(1, -1)))
        else:
            pact = torch.randint(0, len(env.actions), (1,))

        # act
        obs, reward, done = env.step(pact)

        # add memory
        memory.push(pobs, pact, reward, obs, done)

        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                batch = memory.sample(batch_size)

                # Stack the sampled transitions; shapes follow the stored
                # observations instead of a hard-coded width.
                b_pobs = torch.stack([b.pobs for b in batch])
                b_pact = torch.tensor([int(b.pact) for b in batch], dtype=torch.long).unsqueeze(1)
                b_reward = torch.tensor([float(b.reward) for b in batch]).unsqueeze(1)
                b_obs = torch.stack([b.obs for b in batch])
                b_done = torch.tensor([bool(b.done) for b in batch]).unsqueeze(1)

                q = Q(b_pobs)
                with torch.no_grad():
                    # Double DQN: the online network chooses the best action in
                    # the NEXT state and the target network evaluates it.
                    best_next = Q(b_obs).argmax(dim=1, keepdim=True)
                    next_q = Q_ast(b_obs).gather(1, best_next)
                    # Terminal transitions contribute no future value.
                    td_target = b_reward + gamma * next_q * (~b_done)
                    # Only the taken action's entry differs from the prediction,
                    # so the other actions contribute zero loss.
                    target = q.detach().clone()
                    target.scatter_(1, b_pact, td_target)

                optimizer.zero_grad()

                loss = loss_fn(q, target)
                # .item() keeps the running total a plain float
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)

        # epsilon: linear decay, clamped so it never drops below epsilon_min
        if epsilon > epsilon_min and total_step > start_reduce_epsilon:
            epsilon = max(epsilon_min, epsilon - epsilon_decrease)

        # next step
        total_reward += reward
        pobs = obs
        step += 1
        total_step += 1

    total_rewards.append(total_reward)
    total_losses.append(total_loss)

    if (epoch+1) % show_log_freq == 0:
        # Averages over the last show_log_freq epochs.
        log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
        log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
        elapsed_time = time.time()-start
        print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
        start = time.time()

In [None]:
# Training curves of the Dueling DDQN run above.
plot_loss_reward(total_losses, total_rewards)

In [None]:
# Evaluate the Dueling DDQN policy on fresh environments; the split marker is the last training index label.
plot_train_test_by_q(Environment(df_train), Environment(df_test), Q, 'Dueling DDQN', df_train.index[-1])