In [None]:
from numpy.random import randn
import numpy as np
from itertools import count
from collections import namedtuple
from torch.utils.tensorboard import SummaryWriter 
import os
import seaborn as sns
import args
from scipy import stats
import pandas as pd

from replay_memory import Memory, Transition
from ounoise import OrnsteinUhlenbeckActionNoise as noise
from env import trade_env
import env

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline


# --- Experiment configuration ---------------------------------------------
seed = 543  # value intended for torch.manual_seed (see the network setup cell)

# Exploration-noise schedule: the training loop interpolates `noise.scale`
# from `noise_scale` down to `final_noise_scale`.
noise_scale, final_noise_scale = 1.5, 0.5

# Arguments forwarded to trade_env(...) and to env.reset(...).
worth = 20000
cycle = 7
rho = 3  # coefficient of risk aversion

# Replay buffer shared by the training loop and update_para()
# (100000 is presumably its capacity -- confirm in replay_memory.Memory).
memory = Memory(100000)

# --- Output locations, all rooted at args.address --------------------------
address = args.address
tensors = address + '/tensor'
weights = address + '/weights'
outputs = address + '/outputs'
writer = SummaryWriter(tensors)

# for file in os.listdir(tensors):
#     file = os.path.join(tensors,file)
#     os.remove(file)

In [None]:
class Actor(nn.Module):
    """Policy network mapping a flattened state to a softmax action vector.

    Two linear layers, each followed by LayerNorm, feed a linear head whose
    output is passed through a softmax over the last dimension, so the
    returned vector is non-negative and sums to 1 along that dimension.

    Args:
        hidden_size: width of both hidden layers.
        num_inputs: size of the flattened state vector.
        action_space: number of entries in the action vector.
    """

    def __init__(self, hidden_size, num_inputs, action_space):
        super().__init__()
        self.affine1 = nn.Linear(num_inputs, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)

        self.affine2 = nn.Linear(hidden_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

        # Final projection; its bias ('value.bias') is the tensor perturbed
        # by the exploration helpers further down the notebook.
        self.value = nn.Linear(hidden_size, action_space)

    def forward(self, x):
        """Return the softmax action vector for state tensor `x`."""
        hidden = F.relu(self.ln1(self.affine1(x)))
        # NOTE(review): Critic.forward applies a ReLU after its second
        # LayerNorm; here the head consumes ln2's output directly -- confirm
        # this asymmetry is intended.
        hidden = self.ln2(self.affine2(hidden))
        logits = self.value(hidden)
        return F.softmax(logits, dim=-1)
    
class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with one scalar per row.

    The state goes through linear -> LayerNorm -> ReLU; the action vector is
    then concatenated to that embedding along dim 1 before a second
    linear -> LayerNorm -> ReLU stage and a scalar linear head.

    Args:
        hidden_size: width of both hidden layers.
        num_inputs: size of the flattened state vector.
        action_space: number of entries in the action vector.
    """

    def __init__(self, hidden_size, num_inputs, action_space):
        super().__init__()
        self.affine1 = nn.Linear(num_inputs, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)

        # The second stage consumes the state embedding plus the action vector.
        self.affine2 = nn.Linear(action_space + hidden_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

        self.value = nn.Linear(hidden_size, 1)

    def forward(self, x, actions):
        """Return Q(x, actions); both inputs must be batched (2-D) tensors."""
        state_features = F.relu(self.ln1(self.affine1(x)))
        joint = torch.cat([state_features, actions], dim=1)
        joint = F.relu(self.ln2(self.affine2(joint)))
        return self.value(joint)

In [None]:
# Seed torch (network initialisation) and numpy's global RNG (used by `randn`
# in the parameter-noise helper) so a "Restart & Run All" is repeatable.
# NOTE(review): replay sampling inside `Memory` and the OU noise process may
# draw from other RNGs that are not visible here -- seed those too if needed.
torch.manual_seed(seed)
np.random.seed(seed)

train_env = trade_env(env.train, worth, cycle, rho)
test_env = trade_env(env.test, worth, cycle, rho)

hidden_size = 128
action_space = train_env.action_space
state_space = train_env.state_space
# flatten info matrix; minus one since cash does not hold info
num_inputs = (action_space - 1) * state_space

# Online networks, their target copies, and a scratch actor used only to
# apply parameter noise during exploration.
critic = Critic(hidden_size, num_inputs, action_space)
critic_target = Critic(hidden_size, num_inputs, action_space)
actor = Actor(hidden_size, num_inputs, action_space)
actor_target = Actor(hidden_size, num_inputs, action_space)
actor_perturbed = Actor(hidden_size, num_inputs, action_space)

critic_optim = optim.Adam(critic.parameters(), lr=1e-3)
actor_optim = optim.Adam(actor.parameters(), lr=1e-3)

eps = np.finfo(np.float32).eps.item()

# `noise` is imported as the OU-noise *class* and then rebound to an instance
# under the same name. Re-running this cell would therefore call the instance
# (its __call__ draws a sample) instead of the constructor. Only construct
# while the name still refers to the class, so the cell is safe to re-run.
if isinstance(noise, type):
    noise = noise(action_space)
# noise.reset()

In [None]:
tau = 0.001  # interpolation factor that update_para() passes to soft_update()


def soft_update(target, source, tau):
    """Blend `source`'s parameters into `target`, in place.

    Every target parameter becomes ``target * (1 - tau) + source * tau``.
    Parameters are paired by the iteration order of ``.parameters()``, so
    both modules must have the same architecture.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)
def hard_update(target, source):
    """Copy every parameter of `source` into the matching one of `target`.

    Parameters are paired by the iteration order of ``.parameters()``; the
    copy is in place, so `target` keeps its own tensors.
    """
    pairs = zip(target.parameters(), source.parameters())
    for dst, src in pairs:
        dst.data.copy_(src.data)

In [None]:
def select_action_without_noise(state):
    """Return the unperturbed output of the module-level `actor` as a numpy array."""
    # No gradient is needed for action selection, so skip graph construction.
    with torch.no_grad():
        return actor(state).numpy()

In [None]:
# Exploration via parameter noise on the last layer of the actor.
def select_action(state):
    """Return an exploratory action as a numpy array.

    `actor_perturbed` is first reset to a copy of `actor`'s parameters; a
    sample drawn from the module-level `noise` object is then added to the
    bias of its final linear layer (``value.bias``) before the forward pass.
    `actor` itself is not modified.
    """
    hard_update(actor_perturbed, actor)

    bias_noise = torch.tensor(noise()).float()
    with torch.no_grad():
        actor_perturbed.value.bias.add_(bias_noise)
        perturbed_output = actor_perturbed(state)
    return perturbed_output.numpy()

def select_action_with_para_noise(state, scale=1.8):
    """Return an action from a copy of the actor whose last-layer bias is
    perturbed with i.i.d. Gaussian noise.

    Args:
        state: input tensor accepted by the actor network.
        scale: standard deviation of the noise added to each bias entry.
            Defaults to 1.8, the value that used to be hard-coded.

    Returns:
        numpy array holding the perturbed actor's output for `state`.
    """
    hard_update(actor_perturbed, actor)
    actor_params = actor_perturbed.state_dict()

    # state_dict() tensors share storage with the module's parameters, so the
    # in-place add below really perturbs actor_perturbed's bias.
    param = actor_params['value.bias']
    param += torch.tensor(randn(action_space) * scale).float()

    # Action selection needs no gradients; avoid building the autograd graph.
    with torch.no_grad():
        action = actor_perturbed(state)
    return action.numpy()

In [None]:
# Start BOTH target networks as exact copies of their online counterparts.
# Previously only the critic target was synced: actor_target kept its own
# random initialisation and, with tau = 0.001, drifted toward `actor` only
# very slowly, so early TD targets were computed with an unrelated policy.
hard_update(critic_target, critic)
hard_update(actor_target, actor)
gamma = 0.99  # discount applied to the bootstrapped target Q-value in update_para()

In [None]:
def update_para(batch_size=128):
    """Run one DDPG update on a minibatch sampled from the replay `memory`.

    Steps:
      1. Critic: regress Q(s, a) onto ``utility + gamma * mask * Q'(s', mu'(s'))``
         using the target networks.
      2. Actor: maximise Q(s, mu(s)) by minimising its negative mean.
      3. Soft-update both target networks with factor `tau`.

    Args:
        batch_size: number of transitions to sample. Defaults to 128, the
            value that used to be hard-coded.

    Returns:
        (value_loss, policy_loss): scalar tensors detached from the graph,
        suitable for logging.
    """
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    state_batch = torch.stack(batch.state)
    action_batch = torch.stack(batch.action)
    # Utilities may have been stored as float64 (e.g. built from numpy
    # scalars); cast so the target has the same dtype as the critic output.
    utility_batch = torch.stack(batch.utility).float()
    mask_batch = torch.stack(batch.mask)
    next_state_batch = torch.stack(batch.next_state)

    # The TD target is a constant for the critic loss. Computing it under
    # no_grad stops backward() from accumulating never-cleared gradients on
    # the target networks.
    with torch.no_grad():
        next_action_batch = actor_target(next_state_batch)
        next_q_batch = utility_batch + gamma * mask_batch * critic_target(next_state_batch, next_action_batch)

    q_batch = critic(state_batch, action_batch)
    value_loss = F.mse_loss(q_batch, next_q_batch)
    critic_optim.zero_grad()
    value_loss.backward()
    critic_optim.step()

    # Gradients also reach the critic here, but critic_optim.zero_grad()
    # clears them before the next critic step.
    policy_loss = -critic(state_batch, actor(state_batch)).mean()
    actor_optim.zero_grad()
    policy_loss.backward()
    actor_optim.step()

    soft_update(actor_target, actor, tau)
    soft_update(critic_target, critic, tau)
    return value_loss.detach(), policy_loss.detach()

In [None]:
def save_model(path):
    """Save the actor and critic state dicts under directory `path`.

    The directory is created if missing. Two files are written:
    ``ddpg_actor_weights`` and ``ddpg_critic_weights``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)

    torch.save(actor.state_dict(), os.path.join(path, 'ddpg_actor_weights'))
    torch.save(critic.state_dict(), os.path.join(path, 'ddpg_critic_weights'))

In [None]:
def get_state_tensor(state):
    """Flatten an environment state into a 1-D float32 tensor of length `num_inputs`.

    `state` is reshaped to rows of `num_inputs` values and only the first row
    is kept.
    """
    rows = state.reshape(-1, num_inputs)
    first_row = rows[0]
    return torch.tensor(first_row).float()

In [None]:
updates = 0          # number of update_para() calls so far; TensorBoard step for the losses
running_reward = []  # relative test reward of every evaluated episode

for i_episode in range(5000):

    abs_training_reward = 0
    relative_training_reward = 0
    state = get_state_tensor(train_env.reset(worth))

    # dwindling noise: linear decay from noise_scale to final_noise_scale
    # over the first 3000 episodes, constant afterwards
    noise.scale = (noise_scale - final_noise_scale) * max(0, 3000 - i_episode) / 3000 + final_noise_scale
    if train_env.date != train_env.end_date:
        for t in range(1, 100):

            action = select_action(state)

            # env.step() takes numpy array as inputs
            next_state, reward, SPX_reward, utility, done = train_env.step(action)

            abs_training_reward += reward
            relative_training_reward += (reward - SPX_reward)
            action = torch.tensor(action).float()
            mask = torch.tensor([not done]).float()
            # Cast to float32 like every other stored tensor; a float64
            # utility would otherwise promote the TD target's dtype.
            utility = torch.tensor([utility]).float()
            next_state = get_state_tensor(next_state)

            # save to memory (the learning signal stored is `utility`, not `reward`)
            memory.push(state, action, mask, utility, next_state)

            # learn only once a full minibatch is available; 4 updates per env step
            if len(memory) >= 128:
                for _ in range(4):
                    value_loss, policy_loss = update_para()
                    writer.add_scalar('loss/value', value_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            state = next_state

            if done:
                break
        writer.add_scalar('training/abs reward', abs_training_reward, i_episode)
        writer.add_scalar('training/relative reward', relative_training_reward, i_episode)

    # test sample, evaluate model performance
    abs_test_reward = 0
    SPX_test_reward = 0
    if i_episode % 1 == 0:  # evaluation interval in episodes (currently every episode)
        state = get_state_tensor(test_env.reset(worth))
        if test_env.date != test_env.end_date:
            for t in range(1, 100):

                action = select_action_without_noise(state)
                # env.step() takes numpy array as inputs
                next_state, reward, SPX_reward, utility, done = test_env.step(action)
                abs_test_reward += reward
                SPX_test_reward += SPX_reward

                next_state = get_state_tensor(next_state)
                state = next_state

                if done:
                    break

            relative_test_reward = abs_test_reward - SPX_test_reward
            writer.add_scalar('test/abs reward', abs_test_reward, i_episode)
            writer.add_scalar('test/relative reward', relative_test_reward, i_episode)

            running_reward += [relative_test_reward]
        # Early stop once the median of the last 100 relative test rewards
        # exceeds 70.
        # NOTE(review): this threshold was annotated with a
        # MountainCarContinuous reference -- confirm it suits this env.
        if len(running_reward) > 150 and np.median(running_reward[-100:]) > 70:

            save_model(weights)
            break
else:
    # The loop ended without hitting the early-stop criterion. Save the final
    # weights anyway, otherwise the evaluation cells below have nothing to load.
    save_model(weights)

# Test Models

In [None]:
# Restore the saved weights, then roll out the noise-free policy on the
# test environment and record per-episode totals.
actor_weights = os.path.join(weights, 'ddpg_actor_weights')
critic_weights = os.path.join(weights, 'ddpg_critic_weights')
actor.load_state_dict(torch.load(actor_weights))
critic.load_state_dict(torch.load(critic_weights))

reward_list, relative_reward_list = [], []
for i_episode in range(5000):
    state = get_state_tensor(test_env.reset(worth))
    episode_reward = 0
    relative_episode_reward = 0

    # When the reset already sits on the final date no step is taken and the
    # episode is recorded with reward 0.
    if test_env.date != test_env.end_date:
        for t in range(1, 500):
            action = select_action_without_noise(state)

            # env.step() takes numpy array as inputs
            next_state, reward, SPX_reward, utility, done = test_env.step(action)
            episode_reward += reward
            relative_episode_reward += (reward - SPX_reward)

            next_state = get_state_tensor(next_state)
            state = next_state

            if done:
                break

    reward_list.append(episode_reward)
    relative_reward_list.append(relative_episode_reward)

In [None]:
# Persist the per-episode test results. No visible cell creates the output
# folder, so create it here to keep to_csv from failing on a fresh run.
os.makedirs(outputs, exist_ok=True)
df = pd.DataFrame({'Relative Reward': relative_reward_list, 'Absolute Reward': reward_list})
df.to_csv(outputs + '/test_sample_return.csv', index=False)

In [None]:
# Distribution of per-episode relative reward (reward - SPX_reward) on the test env.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(relative_reward_list, kde=True, stat="density")
ax.set(xlabel="relative reward per episode", title="Test env: relative reward");

In [None]:
# Per-episode relative reward on the test env, in evaluation order.
fig, ax = plt.subplots()
ax.plot(relative_reward_list)
ax.set(xlabel="test episode", ylabel="relative reward", title="Test env: relative reward per episode");

In [None]:
# Median per-episode relative reward (reward - SPX_reward) on the test env.
np.median(relative_reward_list)

In [None]:
# Mean per-episode relative reward (reward - SPX_reward) on the test env.
np.mean(relative_reward_list)

In [None]:
# Distribution of per-episode absolute reward on the test env.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(reward_list, kde=True, stat="density")
ax.set(xlabel="absolute reward per episode", title="Test env: absolute reward");

In [None]:
# Per-episode absolute reward on the test env, in evaluation order.
fig, ax = plt.subplots()
ax.plot(reward_list)
ax.set(xlabel="test episode", ylabel="absolute reward", title="Test env: absolute reward per episode");

In [None]:
# Median per-episode absolute reward on the test env.
np.median(reward_list)

In [None]:
# Mean per-episode absolute reward on the test env.
np.mean(reward_list)

In [None]:
# Standard deviation of per-episode absolute reward on the test env
# (np.std default, i.e. ddof=0).
np.std(reward_list)

In [None]:
# Standard deviation of per-episode relative reward on the test env
# (np.std default, i.e. ddof=0).
np.std(relative_reward_list)

In [None]:
# One-sample t-test of the per-episode relative test rewards against a
# population mean of 0.
stats.ttest_1samp(np.array(relative_reward_list), 0)

In [None]:
# Recover the benchmark's per-episode reward: relative = reward - SPX_reward,
# hence SPX_reward = reward - relative.
SPX_reward_list = np.asarray(reward_list) - np.asarray(relative_reward_list)

In [None]:
# Distribution of the benchmark (SPX) per-episode reward on the test env.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(SPX_reward_list, kde=True, stat="density")
ax.set(xlabel="SPX reward per episode", title="Test env: SPX reward");

In [None]:
# Median per-episode SPX (benchmark) reward on the test env.
np.median(SPX_reward_list)

In [None]:
# Mean per-episode SPX (benchmark) reward on the test env.
np.mean(SPX_reward_list)

In [None]:
# Standard deviation of per-episode SPX (benchmark) reward on the test env
# (np.std default, i.e. ddof=0).
np.std(SPX_reward_list)

# Train Env

In [None]:
# Restore the saved weights, then roll out the policy on the TRAINING
# environment and record per-episode totals.
actor_weights = os.path.join(weights, 'ddpg_actor_weights')
critic_weights = os.path.join(weights, 'ddpg_critic_weights')

actor.load_state_dict(torch.load(actor_weights))
critic.load_state_dict(torch.load(critic_weights))

train_reward_list = []
train_relative_reward_list = []
for i_episode in range(5000):

    state = get_state_tensor(train_env.reset(worth))
    episode_reward = 0
    relative_episode_reward = 0
    if train_env.date != train_env.end_date:
        for t in range(1, 100):

            # Evaluate the noise-free policy, as the test-env evaluation does;
            # exploration noise would blur the train/test comparison.
            action = select_action_without_noise(state)

            # env.step() takes numpy array as inputs and returns five values
            # (as in the training loop); `utility` is not needed here.
            next_state, reward, SPX_reward, utility, done = train_env.step(action)
            episode_reward += reward
            relative_episode_reward += (reward - SPX_reward)

            next_state = get_state_tensor(next_state)
            state = next_state

            if done:
                break

    train_reward_list += [episode_reward]
    train_relative_reward_list += [relative_episode_reward]

In [None]:
# Distribution of per-episode relative reward (reward - SPX_reward) on the train env.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(train_relative_reward_list, kde=True, stat="density")
ax.set(xlabel="relative reward per episode", title="Train env: relative reward");

In [None]:
# Median per-episode relative reward (reward - SPX_reward) on the train env.
np.median(train_relative_reward_list)

In [None]:
# Mean per-episode relative reward (reward - SPX_reward) on the train env.
np.mean(train_relative_reward_list)

In [None]:
# One-sample t-test of the per-episode relative TRAIN rewards against 0,
# mirroring the test-env analysis. This cell previously tested `reward_list`,
# which holds the test-env results, not the train-env ones.
# NOTE(review): if the absolute train reward was intended, use
# `train_reward_list` instead.
stats.ttest_1samp(np.array(train_relative_reward_list), 0)

In [None]:
# Distribution of per-episode absolute reward on the train env.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(train_reward_list, kde=True, stat="density")
ax.set(xlabel="absolute reward per episode", title="Train env: absolute reward");

In [None]:
# Per-episode absolute reward on the train env, in evaluation order.
fig, ax = plt.subplots()
ax.plot(train_reward_list)
ax.set(xlabel="train episode", ylabel="absolute reward", title="Train env: absolute reward per episode");

In [None]:
# Median per-episode absolute reward on the train env.
np.median(train_reward_list)

In [None]:
# Mean per-episode absolute reward on the train env.
np.mean(train_reward_list)

In [None]:
# Standard deviation of per-episode absolute reward on the train env
# (np.std default, i.e. ddof=0).
np.std(train_reward_list)