In [2]:
import gym
import torch
import random
import numpy as np
from torch import optim
from gym.utils import play
from dqn import ReplayMemory, DQN_agent

from collections import namedtuple

#### Before rewrite

In [None]:
policy_net = DQN() # used for calculating Q(s, a)
target_net = DQN() # used for calculating gamma * Q(s', a')
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.SGD(policy_net.parameters(), lr=0.001)

In [None]:
def epsilon_greedy(state, epsilon):
    # returns optimal action under current Q values epsilon-greedily
    trial = random.random()
#     print("epsilon: ", epsilon)
    if trial < epsilon:
#         print("random action")
        return random.randint(0, 1)
    else:
        with torch.no_grad():
            x = policy_net(torch.FloatTensor(state))
#             print(x, x.argmax().item())
            return x.argmax().item()

In [None]:
def learn(batch):
    # update model parameters
    # batch_i = (s, a, r, s') -- (0, 1, 2, 3)
    
    states = torch.cat(batch.state).view(BATCH_SIZE, 4)
    actions = torch.tensor(batch.action)
    rewards = torch.tensor(batch.reward)
    
    expected_next_state_rewards = torch.zeros(BATCH_SIZE)
    non_terminal_next_state_mask = torch.tensor(list(map(lambda ns: ns is not None, batch.next_state)))
    
    non_terminal_next_states = torch.tensor([ns for ns in batch.next_state if ns is not None])
    next_state_q_values_best = target_network(non_terminal_next_states).max(1)[0].detach()
    
    # computing r_s + GAMMA * max(q(s', a'))
    expected_next_state_rewards[non_terminal_next_state_mask] = GAMMA * next_state_q_values_best
    expected_next_state_rewards = expected_next_state_rewards + rewards
    
    predicted_reward_to_go = policy_net(states)
    q_s_a = torch.gather(predicted_reward_to_go, 1, actions.view(BATCH_SIZE, 1)).flatten() 
    
    # gradient descent
    optimizer.zero_grad()
    criterion = nn.MSELoss()
    loss = criterion(q_s_a,expected_next_state_rewards)
    loss.backward()
    # clipping gradient
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [None]:
## Training
env = gym.make("CartPole-v0")
EPS_INIT = 1
EPS_DECAY = 0.9
EPS_MIN = 0.05
GAMMA = 0.999
BATCH_SIZE = 128

epsilon = EPS_INIT
memory_length = 100000

i = 0
durations = []
memory = ReplayMemory(memory_length=memory_length)
for episode in range(10):
    obs = env.reset()
    s = torch.FloatTensor(obs)
    done = False
    env.render()
    
    duration = 0
    print([x[1] for x in memory.memory])
    while not done:
        duration += 1
        
        action = epsilon_greedy(s, epsilon)
        obs, reward, done, info = env.step(action)
        
        if done:
            obs = None # ended in a terminal state
        else:
            obs = torch.FloatTensor(obs)
        
        transition = [s, action, reward, obs] # s-a-r-s'
        memory.add_info(transition)
        
        # learn
        if len(memory) == memory_length:
            batch = memory.sample(BATCH_SIZE)
            batch = Transition(*zip(*batch))
            learn(batch)
        
        if i % 10 == 0:
            # update target net
            target_net.load_state_dict(policy_net.state_dict())
        
        if epsilon * EPS_DECAY > EPS_MIN:
            epsilon *= EPS_DECAY
        else:
            epsilon = EPS_MIN
        
        s = obs
        i += 1
    durations.append(duration)
env.close()

In [None]:
print(np.mean(durations))

In [None]:
try:
    durations = []
    duration = 0
    for t in range(1000):
        duration += 1
        env.render()
        action = env.action_space.sample() # random action
        obs, reward, done, info = env.step(action) # info is a dictionary

        if done: # ended in terminal state
            obs = env.reset()
            durations.append(duration)
            duration = 0
finally:
    env.close()


#### After Rewrite

In [None]:
import gym
import torch
import random
import numpy as np
from torch import optim
from gym.utils import play
from agent import ReplayMemory, DQN_One, DQN, train

from collections import namedtuple

In [None]:
env = gym.make("CartPole-v0")
agent = DQN(memory_length=1000)

In [None]:
train(agent, env, 32, 0.99, 1, 0.99, 0.05, 2)

In [None]:
random.randrange(2)

In [None]:
torch.tensor([1, 1]).argmax()

In [None]:
torch.tensor(2)

#### Trading Env

In [27]:
from crypto_env import CryptoEnv
import pandas as pd

In [32]:
from gym_anytrading.envs import StocksEnv

In [36]:
df = pd.read_csv("data/raw/STOCKS_GOOGL.csv")
# df.columns=['Time', 'Open', 'High', 'Low', 'Close', 'Volume']

In [37]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-05-22,198.528534,199.524521,196.196198,196.946945,196.946945,3433700
1,2009-05-26,196.171173,202.702698,195.19519,202.382385,202.382385,6202700
2,2009-05-27,203.023026,206.136139,202.607605,202.982986,202.982986,6062500
3,2009-05-28,204.54454,206.016022,202.507507,205.405411,205.405411,5332200
4,2009-05-29,206.261261,208.823822,205.555557,208.823822,208.823822,5291100


In [38]:
env = StocksEnv(df, window_size=32, frame_bound=(3, 10))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 0 and the array at index 1 has size 1