<pre>
░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
░░░░░░░░░░▄▄█▀▀▀▀▀█▄▄░░░░░░░░░░
░░░░░░░░▄█▀░░▄░▄░░░░▀█▄░░░░░░░░
░░░░░░░░█░░░▀█▀▀▀▀▄░░░█░░░░░░░░
░░░░░░░░█░░░░█▄▄▄▄▀░░░█░░░░░░░░
░░░░░░░░█░░░░█░░░░█░░░█░░░░░░░░
░░░░░░░░▀█▄░▀▀█▀█▀░░▄█▀░░░░░░░░
░░░░░░░░░░▀▀█▄▄▄▄▄█▀▀░░░░░░░░░░
░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
</pre>

## Policy Reinforcement Learning Trading
By Alin Cijov

In [None]:
import numpy as np
import pandas as pd
from scipy import signal

import math, random
import gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from torch.distributions import Categorical

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import warnings
# Install a global "ignore" filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings — confirm that is wanted.
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Read the ADA and XRP tables from the dataset folder.
ada_stock = pd.read_csv("../input/binance-top-cryptocurrencies/ADA.csv")
xrp_stock = pd.read_csv("../input/binance-top-cryptocurrencies/XRP.csv")

# Extract the "open" and "close" columns of each table via `.values`.
ada_open, ada_close = (ada_stock[column].values for column in ("open", "close"))
xrp_open, xrp_close = (xrp_stock[column].values for column in ("open", "close"))

# Show the first rows of the ADA table as the cell output.
ada_stock.head()

# Examine Data

In [None]:
# Two panels side by side: opening prices of ADA (left) and XRP (right).
plt.figure(1, figsize=(15, 5))
ada, xrp = plt.subplot(121), plt.subplot(122)

ada.title.set_text("ADA")
xrp.title.set_text("XRP")

ada.plot(range(len(ada_open)), ada_open)
xrp.plot(range(len(xrp_open)), xrp_open)

# Prepare Data

## Signals

In [None]:
# Run every price series through scipy.signal.detrend (its default removes a linear fit).
# Each call returns a new array, so the loaded DataFrames are left as they were.
ada_open, ada_close = (signal.detrend(series) for series in (ada_open, ada_close))
xrp_open, xrp_close = (signal.detrend(series) for series in (xrp_open, xrp_close))

In [None]:
# Same two-panel layout as before, now showing the detrended opening prices.
plt.figure(1, figsize=(15, 5))
ada, xrp = plt.subplot(121), plt.subplot(122)

ada.title.set_text("ADA Open")
xrp.title.set_text("XRP Open")

ada.plot(range(len(ada_open)), ada_open)
xrp.plot(range(len(xrp_open)), xrp_open)

In [None]:
# Cut each ADA series down to at most the length of the matching XRP series.
# (If ADA is already the shorter one, the slices leave it unchanged.)
ada_open, ada_close = ada_open[:len(xrp_open)], ada_close[:len(xrp_close)]

## Negative values

In [None]:
# Report the lowest value of every series after detrending.
series_minimums = (ada_open.min(), ada_close.min(), xrp_open.min(), xrp_close.min())
print("Min values: \nADA_open:{:.4f}, ADA_close:{:.4f}, XRP_open:{:.4f}, XRP_close:{:.4f} \n"
         .format(*series_minimums))

# Shift all four series by one common offset so that none of them contains a
# negative price. The offset is taken from the lowest value across ALL series
# (not only XRP close) and rounded UP to three decimals, so neither a lower
# minimum in another series nor the rounding itself can leave a value below zero.
add_nr = math.ceil(-min(series_minimums) * 1000) / 1000
print("Add {0:1.3f} to the open/close".format(add_nr))

ada_open += add_nr
ada_close += add_nr
xrp_open += add_nr
xrp_close += add_nr

## Equalize data

In [None]:
# Cut each XRP series down to at most the length of the matching ADA series.
# The close series must be measured against ADA close; slicing it by its own
# length (as before) never shortened anything.
xrp_open = xrp_open[:len(ada_open)]
xrp_close = xrp_close[:len(ada_close)]

# Environment

In [None]:
class Environment():
    """Two-asset (ADA / XRP) trading environment driven by module-level price arrays.

    Prices are read from the module-level sequences ``ada_open``, ``ada_close``,
    ``xrp_open`` and ``xrp_close``; they must exist before an instance is built.

    Observation layout (8 entries):
        0  ADA units held
        1  XRP units held
        2  cash
        3  ADA opening price
        4  XRP opening price
        5  portfolio value
        6  mean of the five ADA opens preceding the current timestep
        7  mean of the five XRP opens preceding the current timestep

    ``reset`` returns the observation as a float32 tensor; once ``step`` has
    been called it is a plain list with the same layout.

    Every trade moves exactly one unit. Buying costs 1.1x the current opening
    price, selling yields 0.9x of it.
    """

    # Cost/proceeds multipliers applied to the opening price of a trade.
    BUY_COST_FACTOR = 1.1
    SELL_PROCEEDS_FACTOR = 0.9
    # Number of units moved by a single buy or sell.
    TRADE_SIZE = 1.

    def __init__(self, starting_cash_mean=200., max_stride=5, series_length=208, starting_point=1, randomize_cash_std=0, \
                 starting_shares_mean=0., randomize_shares_std=0., inaction_penalty=0.):
        """Store the configuration and build the first observation.

        Args:
            starting_cash_mean: mean of the normal draw for the starting cash.
            max_stride: number of rows the timestep advances per ``step``.
            series_length: used for the episode-end threshold
                ``starting_point + series_length * stride`` and for ``ts_left``.
            starting_point: row index at which every episode starts.
            randomize_cash_std: standard deviation of the starting-cash draw.
            starting_shares_mean: mean of the draw for each starting holding.
            randomize_shares_std: standard deviation of the holdings draw.
            inaction_penalty: subtracted from the reward of a wait and added
                to the reward of a successful trade.
        """
        self.starting_shares_mean = starting_shares_mean
        self.randomize_shares_std = randomize_shares_std
        self.starting_cash_mean = starting_cash_mean
        self.randomize_cash_std = randomize_cash_std

        self.actions = {"buy_ada": 0, "sell_ada": 1, "wait": 2, "buy_xrp": 3, "sell_xrp": 4}

        self.series_length = series_length
        self.starting_point = starting_point

        self.max_stride = max_stride
        self.stride = self.max_stride

        self.diversification_bonus = 1.
        self.inaction_penalty = inaction_penalty

        # Build the initial state through reset() so the starting portfolio
        # value is computed AFTER cash has been written into the state
        # (computing it earlier silently left the cash out).
        self.reset()

    @staticmethod
    def _draw_non_negative(mean, std):
        """Draw from N(mean, std), truncate to an int and floor the result at zero."""
        return max(int(np.random.normal(mean, std)), 0.)

    def _observe(self, ada_shares, xrp_shares, cash, cur_value):
        """Assemble the next observation, store it as the current state and return it."""
        new_state = [ada_shares, xrp_shares, cash, *self.next_opening_price(),
                     cur_value, *self.five_day_window()]
        self.state = new_state
        return new_state

    def portfolio_value(self):
        """Return holdings valued at the current closing prices plus cash."""
        return (self.state[0] * ada_close[self.cur_timestep]) + (self.state[1] * xrp_close[self.cur_timestep]) + self.state[2]

    def next_opening_price(self):
        """Return ``[ada_open, xrp_open]`` one stride ahead of the current timestep.

        The index is clamped to the last available row so the terminal step
        cannot run past the end of the price arrays.
        """
        last_row = min(len(ada_open), len(xrp_open)) - 1
        step = min(self.cur_timestep + self.stride, last_row)
        return [ada_open[step], xrp_open[step]]

    def five_day_window(self):
        """Return the mean of the five opens before the current timestep for both assets.

        While fewer than five earlier rows exist, the very first opening
        prices are returned instead.
        """
        step = self.cur_timestep
        if step < 5:
            return [ada_open[0], xrp_open[0]]
        ada5 = ada_open[step-5:step].mean()
        xrp5 = xrp_open[step-5:step].mean()
        return [ada5, xrp5]

    def is_done(self, cur_timestep, cur_value, gain):
        """Check whether the episode has reached its final timestep.

        Returns a ``(state, reward, done, info)`` tuple. When the episode is
        over, the state is refreshed and the reward is
        ``cur_value + gain`` plus ``diversification_bonus`` if both assets are
        held. Otherwise ``state`` is ``None`` and ``done`` is ``False``.
        """
        if cur_timestep >= self.starting_point + (self.series_length * self.stride):
            new_state = self._observe(self.state[0], self.state[1], self.state[2], cur_value)
            bonus = 0.
            if self.state[0] > 0 and self.state[1] > 0:
                bonus = self.diversification_bonus
            return new_state, cur_value + bonus + gain, True, { "msg": "done"}
        return None, cur_value + 0 + gain, False, { "msg": "not done"}

    def wait(self, cur_value, ts_left, gain):
        """Keep holdings unchanged; reward is ``-inaction_penalty - ts_left + gain``."""
        new_state = self._observe(self.state[0], self.state[1], self.state[2], cur_value)
        return new_state, -self.inaction_penalty - ts_left + gain, False, { "msg": "wait" }

    def bankrupt(self, cur_value, ts_left, gain):
        """End the episode after an unaffordable buy; reward is ``-ts_left + gain/2``."""
        new_state = self._observe(self.state[0], self.state[1], self.state[2], cur_value)
        return new_state, -ts_left+gain/2, True, { "msg": "bankrupted self"}

    def sold_too_much(self, cur_value, ts_left, gain):
        """End the episode after selling more than is held; reward is ``-ts_left + gain/2``."""
        new_state = self._observe(self.state[0], self.state[1], self.state[2], cur_value)
        return new_state, -ts_left+gain/2, True, { "msg": "sold more than have"}

    def _buy(self, slot, price, label, cur_value, ts_left, gain):
        """Buy one unit of the asset stored at state index ``slot`` (0 = ADA, 1 = XRP)."""
        amount = self.TRADE_SIZE
        # NOTE: affordability is checked against the bare price, while the
        # amount actually deducted includes the 10% surcharge.
        if self.state[2] < amount * price:
            return self.bankrupt(cur_value, ts_left, gain)
        holdings = [self.state[0], self.state[1]]
        holdings[slot] = holdings[slot] + amount
        cash = self.state[2] - amount * price * self.BUY_COST_FACTOR
        new_state = self._observe(holdings[0], holdings[1], cash, cur_value)
        return new_state, self.inaction_penalty-ts_left+gain, False, { "msg": label}

    def _sell(self, slot, price, label, cur_value, ts_left, gain):
        """Sell one unit of the asset stored at state index ``slot`` (0 = ADA, 1 = XRP)."""
        amount = self.TRADE_SIZE
        if self.state[slot] < amount:
            return self.sold_too_much(cur_value, ts_left, gain)
        holdings = [self.state[0], self.state[1]]
        holdings[slot] = holdings[slot] - amount
        cash = self.state[2] + amount * price * self.SELL_PROCEEDS_FACTOR
        new_state = self._observe(holdings[0], holdings[1], cash, cur_value)
        return new_state, self.inaction_penalty-ts_left+gain, False, { "msg": label}

    def step(self, action):
        """Apply ``action`` (a value from ``self.actions``) and advance one stride.

        Returns:
            ``(state, reward, done, info)`` where ``info["msg"]`` names the
            outcome ("wait", "bought ADA", "sold XRP", "bankrupted self",
            "sold more than have", "done", ...).
        """
        cur_timestep = self.cur_timestep
        ts_left = self.series_length - (cur_timestep - self.starting_point)
        cur_value = self.portfolio_value()
        gain = cur_value - self.starting_portfolio_value

        retval = self.is_done(cur_timestep, cur_value, gain)
        if retval[2]:
            return retval

        outcome = (cur_value, ts_left, gain)
        if action == self.actions["wait"]:
            # Waiting consumes a timestep like every other action; returning
            # early here used to freeze time, so an all-wait episode never ended.
            retval = self.wait(*outcome)
        elif action == self.actions["buy_ada"]:
            retval = self._buy(0, ada_open[cur_timestep], "bought ADA", *outcome)
        elif action == self.actions["buy_xrp"]:
            retval = self._buy(1, xrp_open[cur_timestep], "bought XRP", *outcome)
        elif action == self.actions["sell_ada"]:
            retval = self._sell(0, ada_open[cur_timestep], "sold ADA", *outcome)
        elif action == self.actions["sell_xrp"]:
            retval = self._sell(1, xrp_open[cur_timestep], "sold XRP", *outcome)
        # Any other action value leaves the "not done" tuple from is_done() in place.

        self.cur_timestep += self.stride
        return retval

    def reset(self):
        """Start a new episode with freshly drawn cash and holdings; return the state tensor."""
        self.state = torch.zeros(8, dtype=torch.float32)
        self.starting_cash = self._draw_non_negative(self.starting_cash_mean, self.randomize_cash_std)
        self.cur_timestep = self.starting_point
        self.state[0] = self._draw_non_negative(self.starting_shares_mean, self.randomize_shares_std)
        self.state[1] = self._draw_non_negative(self.starting_shares_mean, self.randomize_shares_std)
        self.state[2] = self.starting_cash
        self.state[3] = ada_open[self.cur_timestep]
        self.state[4] = xrp_open[self.cur_timestep]
        self.starting_portfolio_value = self.portfolio_value()
        self.state[5] = self.starting_portfolio_value
        ada5, xrp5 = self.five_day_window()
        self.state[6] = ada5
        self.state[7] = xrp5
        self.done = False
        return self.state

# Reinforcement Learning Policy Model

In [None]:
class Policy(nn.Module):
    """Recurrent actor-critic network over the 8-entry environment state.

    Pipeline: Linear(8->128) + sigmoid, Linear(128->128) + tanh, a 2-layer GRU
    (128->32), Linear(32->31) + ReLU, then two heads: 5 action probabilities
    and 1 state value. The GRU hidden state is kept on the instance between
    calls. ``saved_actions`` and ``rewards`` are per-episode buffers.
    """

    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(8, 128)
        self.hidden_1 = nn.Linear(128, 128)
        self.hidden_2 = nn.Linear(32, 31)
        self.reset_hidden()
        self.rnn = nn.GRU(128, 32, 2)
        self.action_head = nn.Linear(31, 5)
        self.value_head = nn.Linear(31, 1)
        self.saved_actions = []
        self.rewards = []

    def reset_hidden(self):
        """Replace the stored GRU hidden state with zeros of shape (2, 1, 32)."""
        self.hidden_state = torch.zeros(2, 1, 32)

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for state ``x``.

        The stored hidden state is passed in through ``.data`` and replaced by
        the GRU's new hidden state.
        """
        features = torch.as_tensor(x).type(torch.float32)
        features = torch.sigmoid(self.input_layer(features))
        features = torch.tanh(self.hidden_1(features))
        gru_out, self.hidden_state = self.rnn(features.view(1, -1, 128), self.hidden_state.data)
        latent = F.relu(self.hidden_2(gru_out.squeeze()))
        probabilities = F.softmax(self.action_head(latent), dim=-1)
        return probabilities, self.value_head(latent)

    def act(self, state):
        """Sample an action, record its log-probability and value, return it as an int.

        A sampled sell (action 1 or 4) is replaced by wait (action 2) when the
        module-level ``env`` holds less than one unit of that asset; the
        log-probability recorded is that of the replacement action.
        """
        probs, state_value = self.forward(state)
        distribution = Categorical(probs)
        action = distribution.sample()
        if (action == 1 and env.state[0] < 1) or (action == 4 and env.state[1] < 1):
            action = torch.LongTensor([2]).squeeze()
        self.saved_actions.append((distribution.log_prob(action), state_value))
        return action.item()

In [None]:
# Episode length: a quarter of the available rows minus one (the stride used below is 4).
series_length = len(ada_open) // 4 - 1
env = Environment(
    max_stride=4,
    series_length=series_length,
    starting_cash_mean=1000,
    randomize_cash_std=100,
    starting_shares_mean=100,
    randomize_shares_std=10,
)
model = Policy()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training

In [None]:
def end_episode():
    """Run one actor-critic update from the finished episode and clear its buffers.

    Reads the module-level ``model`` (its ``rewards`` and ``saved_actions``
    lists), ``gamma`` (discount factor) and ``optimizer``.

    Steps:
      1. Turn the per-step rewards into discounted returns.
      2. Add a tiny uniform offset in [-5e-5, 5e-5) to the returns.
      3. Policy loss per step: ``-log_prob * (return - value)``;
         value loss per step: smooth L1 between value and return.
      4. Back-propagate the summed loss and take one optimizer step.

    Does nothing except clearing the buffers when the episode is empty.
    """
    saved_actions = model.saved_actions
    if not saved_actions or not model.rewards:
        # Nothing to learn from; torch.stack([]) below would raise.
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # Accumulate returns back-to-front, then flip once (avoids repeated insert(0, ...)).
    discounted = []
    running_return = 0
    for r in reversed(model.rewards):
        running_return = r + (gamma * running_return)
        discounted.append(running_return)
    discounted.reverse()
    rewards = torch.tensor(discounted, dtype=torch.float32)

    epsilon = (torch.rand(1) / 1e4) - 5e-5
    rewards = rewards + epsilon

    policy_losses = []
    value_losses = []
    for (log_prob, value), r in zip(saved_actions, rewards):
        # The value estimate enters the advantage as a plain number, so the
        # policy term sends no gradient into the value head.
        advantage = r - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    # Symmetric safety clamp. The previous lower bound of -1e-5 pinned every
    # negative loss to a constant, which zeroes its gradient and blocks learning.
    loss = torch.clamp(loss, -1e5, 1e5)
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

In [None]:
# Begin with a fresh environment and empty episode buffers.
env.reset()
model.rewards.clear()
model.saved_actions.clear()

gamma = 0.9          # discount factor read by end_episode()
log_interval = 100   # episodes between progress lines; also sets the running-reward smoothing

# NOTE(review): model.reset_hidden() is never called, so the GRU hidden state
# carries over from one episode to the next — confirm this is intended.
running_reward = 0
for episode in range(len(ada_open)):
    state = env.reset()
    reward, done, msg = 0, False, None
    i = 0
    # Play one episode, storing every reward for the update.
    while not done:
        action = model.act(state)
        state, reward, done, msg = env.step(action)
        model.rewards.append(reward)
        i += 1

    # Exponential moving average of the final reward of each episode.
    weight = 1/log_interval
    running_reward = running_reward * (1 - weight) + reward * weight
    end_episode()

    # Stop once an episode ran to completion with a >10% gain and the average is high.
    if (
        msg["msg"] == "done"
        and env.portfolio_value() > env.starting_portfolio_value * 1.1
        and running_reward > 500
    ):
        print("Early Stopping: " + str(int(reward)))
        break

    if episode % log_interval == 0:
        print("Episode: {:4d},  [Start: {:8.2f}],  [Finish: {:8.1f}],  [Status: {} at t={:4d}],  [Last_Reward: {:8.1f}],  [Running_Reward: {:8.1f}]"
              .format(episode, env.starting_portfolio_value, env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))

# Buy, Sell, Wait

In [None]:
# Roll the trained policy through one fresh episode and print every action
# together with the resulting holdings (ADA units, XRP units, cash).
env = Environment(max_stride=4, series_length=series_length, starting_cash_mean=1000, randomize_cash_std=100, starting_shares_mean=100, randomize_shares_std=10)
env.reset()
# The starting line uses the same columns as the per-step lines below:
# numeric holdings and cash (state[2]) rather than raw tensors and total value.
print("Action:{:12s}, Contains:[{:6.2f} ADA][{:6.2f} XRP][{:8.2f}$]".format(" -Starting-  ", env.state[0], env.state[1], env.state[2]))
for i in range(0, env.series_length + 1):
    action = model.act(env.state)
    next_state, reward, done, msg = env.step(action)
    status = msg["msg"]
    # Both failure outcomes end the episode inside the environment.
    if status == 'bankrupted self':
        print('bankrupted self by 1')
        break
    if status == 'sold more than have':
        print('sold more than have by 1')
        break
    print("Action: {:12s}, Contains:[{:6.2f} ADA][{:6.2f} XRP][{:8.2f}$]".format(status, next_state[0], next_state[1], next_state[2]))
    if status == "done":
        print("Total portfolio value {}".format(env.portfolio_value()))
        break

# Test on the original data

In [None]:
# Take the open/close columns from the loaded tables again, under separate
# names, next to the processed arrays used by the environment.
ada_open_orig, ada_close_orig = (ada_stock[column].values for column in ("open", "close"))
xrp_open_orig, xrp_close_orig = (xrp_stock[column].values for column in ("open", "close"))

In [None]:
# Prices recorded per action type, taken from the processed series.
bought_ada_at, bought_xrp_at = [], []
sold_ada_at, sold_xrp_at = [], []

# The same events priced from the *_orig series.
bought_ada_at_orig, bought_xrp_at_orig = [], []
sold_ada_at_orig, sold_xrp_at_orig = [], []

# Placeholder values recorded for "wait" actions.
nothing_at = []

# Timesteps of each action type: ba/sa = buy/sell ADA, bm/sm = buy/sell XRP, n = wait.
ba_action_times, bm_action_times = [], []
sa_action_times, sm_action_times = [], []
n_action_times = []

In [None]:
# Roll the trained policy through one fresh episode and record, for every
# action, the timestep and the price on both the processed and *_orig series.
env = Environment(max_stride=4, series_length=series_length, starting_cash_mean=1000, randomize_cash_std=100, starting_shares_mean=100, randomize_shares_std=10)
env.reset()

starting_val = env.starting_portfolio_value
print("Starting portfolio value: {}".format(starting_val))
for i in range(0, env.series_length + 1):
    action = model.act(env.state)
    now = env.cur_timestep
    # Action codes follow Environment.actions:
    # 0 buy ADA, 1 sell ADA, 2 wait, 3 buy XRP, 4 sell XRP.
    # NOTE(review): sells are recorded at the close price, while the
    # environment executes them at the open price — confirm which is wanted.
    if action == 0:
        bought_ada_at.append(ada_open[now])
        bought_ada_at_orig.append(ada_open_orig[now])
        ba_action_times.append(now)
    elif action == 1:
        sold_ada_at.append(ada_close[now])
        sold_ada_at_orig.append(ada_close_orig[now])
        sa_action_times.append(now)
    elif action == 2:
        nothing_at.append(35)
        n_action_times.append(now)
    elif action == 3:
        bought_xrp_at.append(xrp_open[now])
        bought_xrp_at_orig.append(xrp_open_orig[now])
        bm_action_times.append(now)
    elif action == 4:
        sold_xrp_at.append(xrp_close[now])
        sold_xrp_at_orig.append(xrp_close_orig[now])
        sm_action_times.append(now)
    next_state, reward, done, msg = env.step(action)
    status = msg["msg"]
    if status in ('bankrupted self', 'sold more than have'):
        env.reset()
        break
    if status == "done":
        # The assets traded here are ADA and XRP (the old text named unrelated stock tickers).
        print("{}, have {} ADA and {} XRP and {} cash".format(status, next_state[0], next_state[1], next_state[2]))
        val = env.portfolio_value()
        print("Finished portfolio value {}".format(val))
        env.reset()
        break

# Plot the analysis

In [None]:
# Processed opening prices with the recorded trades on top:
# red dots mark buys, green dots mark sells.
plt.figure(1, figsize=(15, 5))
ada, xrp = plt.subplot(121), plt.subplot(122)

ada.plot(range(len(ada_open)), ada_open)
xrp.plot(range(len(xrp_open)), xrp_open)

ada.plot(ba_action_times, bought_ada_at, "ro")
ada.plot(sa_action_times, sold_ada_at, "go")
xrp.plot(bm_action_times, bought_xrp_at, "ro")
xrp.plot(sm_action_times, sold_xrp_at, "go")