In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#Import packages
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from itertools import count
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import tqdm
from tqdm import tnrange, tqdm_notebook
from time import sleep
import os

#Import Custom Classes

from Source.nn_model_dqn import QNetwork
from Source.dqn_rcv_agent import Agent, ReplayBuffer, EpsilonGreedyStrategy
from Source.env_manager import EnvManager
from Source.misc_fun.utils import plot, get_moving_average

In [4]:
#Hyper-parameters
BUFFER_SIZE = int(1e5)      #replay buffer size
BATCH_SIZE = 60             #minibatch size
GAMMA = 0.999                #discount factor
TAU = 1e-3                  #for soft update of target parameters
LR = 5e-4                   #learning rate
UPDATE_EVERY = 50            #how often to update the network
eps_start = 1
eps_end = 0.01
eps_decay = 0.9983 #125e-6
train_episodes = 25
test_episodes = 1
seed = 0                    #random seed number
episode_step_limit = 50
#%%
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
#Choose the environment
em = EnvManager(device, 'combrf-v2', seed)
available_actions = em.num_actions_available()
random.seed(seed)
state_size = em.state_size()

#Select the strategy
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)

#Initialize the agent
agent = Agent(strategy, state_size, available_actions, seed, device)

#Instantiate MemoryBuffer
memory = ReplayBuffer(available_actions, BUFFER_SIZE, BATCH_SIZE, seed, device)


policy_net = QNetwork(state_size, available_actions, seed).to(device)
target_net = QNetwork(state_size, available_actions, seed).to(device)
print(policy_net)

#Initialize target_net weights to policy_net weights
target_net.load_state_dict(policy_net.state_dict())
target_net.eval() #Set the target_net in eval mode

#Select the optimizer
optimizer = optim.Adam(params=policy_net.parameters(), lr=LR)

QNetwork(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=130, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=64, bias=True)
  )
  (output): Linear(in_features=64, out_features=8, bias=True)
)


## Test with random untrained actions

In [6]:
obs = em.env.reset()
print(obs)
ep_rwd=[]
while True:
    action = random.randrange(em.env.action_space.n)
    
    obs, rwd, done, _ = em.env.step(action)
    print(action)
    ep_rwd.append(rwd)
    if done:
        break
        
print("Episode score: {}".format(np.sum(ep_rwd)))

[[ 4.72280004e-08  1.36425981e-07 -1.77550709e-07  3.31816633e-08
   1.45853489e-07 -1.72510123e-07  1.89390689e-08  1.54418329e-07
   1.36425981e-07 -1.77550709e-07  3.31816633e-08  1.45853489e-07
  -1.72510123e-07  1.89390689e-08  1.54418329e-07 -1.66449203e-07
  -1.77550709e-07  3.31816633e-08  1.45853489e-07 -1.72510123e-07
   1.89390689e-08  1.54418329e-07 -1.66449203e-07  4.58445694e-09
   3.31816633e-08  1.45853489e-07 -1.72510123e-07  1.89390689e-08
   1.54418329e-07 -1.66449203e-07  4.58445694e-09  1.62069841e-07
   1.45853489e-07 -1.72510123e-07  1.89390689e-08  1.54418329e-07
  -1.66449203e-07  4.58445694e-09  1.62069841e-07 -1.59403799e-07
  -1.72510123e-07  1.89390689e-08  1.54418329e-07 -1.66449203e-07
   4.58445694e-09  1.62069841e-07 -1.59403799e-07 -9.79727037e-09
   1.89390689e-08  1.54418329e-07 -1.66449203e-07  4.58445694e-09
   1.62069841e-07 -1.59403799e-07 -9.79727037e-09  1.68762770e-07
   1.54418329e-07 -1.66449203e-07  4.58445694e-09  1.62069841e-07
  -1.59403

## Train the DQN agent

In [11]:
ep_rewards = []
policy_net.train()

outer = tqdm.tqdm_notebook(total=train_episodes, desc='training loop: ', position=0)

for episode in range(train_episodes):
    obs = em.reset()

    ep_loss = 0.0
    ep_rwd = 0.0
    timestep = 0
    agent.current_step +=1
    while True:
        action = agent.act(obs, policy_net)
        next_obs, reward, done, _ = em.step(action)
        #agent.step(obs, action, reward, next_obs, done)
        ep_rwd += reward.item()
        memory.add(obs, action, reward, next_state, done)
        
        obs = next_obs
        #state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
        
        #if memory.can_provide_sample():
        #    experiences = memory.sample()
        #    states, actions, rewards, next_states, dones = experiences
        #    
        #    #print(states.shape, states.dtype)
        #    #print(actions.unsqueeze(-1).shape)
        #    current_q_values = policy_net(states).gather(1,index=actions.unsqueeze(-1))
        #    next_q_values = target_net(next_states).detach().max(1)[0]
        #    target_q_values = (next_q_values*GAMMA) + rewards
            
        #    loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            #print("loss: ", loss.item())
        #    ep_loss += loss.item()
            
        #    optimizer.zero_grad()
        #    loss.backward()
        #    optimizer.step()
        
        if done:
            ep_rewards.append(ep_rwd)
            moving_avg_rwd = get_moving_average(100, ep_rewards)
            print('\rEpisode {}\t,\tScore: {:.2f}, eps: {}, moving avg_rwd: {}'.format(episode+1, ep_rwd, agent.strategy.get_exploration_rate(agent.current_step), moving_avg_rwd[-1]), end="\r")
            #plot(episode_rewards, 100)
            break
            
        
    if episode % UPDATE_EVERY == 0:
        target_net.load_state_dict(policy_net.state_dict())
    
    if (np.mean(ep_rewards[-100:]) >= 20000):
        print("Goal is reached in {} episodes!\n".format(episode))
        break
        
    
    
    # update tqdm bar
    outer.update(1)
    
torch.save(policy_net.state_dict(), 'checkpoint.pth')    
#timer.finish()
plot(episode_rewards, 100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, description='training loop: ', max=25.0, style=ProgressStyle(descripti…

Episode 1	,	Score: 6.95, eps: 0.9932173203563519, moving avg_rwd: 0.0Episode 2	,	Score: 6.64, eps: 0.9915288509117461, moving avg_rwd: 0.0Episode 3	,	Score: 6.80, eps: 0.9898432518651962, moving avg_rwd: 0.0Episode 4	,	Score: 6.87, eps: 0.9881605183370252, moving avg_rwd: 0.0Episode 5	,	Score: 6.68, eps: 0.9864806454558522, moving avg_rwd: 0.0Episode 6	,	Score: 6.93, eps: 0.9848036283585773, moving avg_rwd: 0.0

RuntimeError: Subtraction, the `-` operator, with a bool tensor is not supported. If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.