In [1]:
import sys
import pdb
import gym
import math
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from tensorboardX import SummaryWriter
from BayesianNetwork import BayesianNetwork
from BayesianQNetwork import BQN_learn

In [2]:
# Name of the environment to train on. The configuration cell below branches on
# this value and handles "CartPole", "Pendulum" and "MountainCar".
envt = "CartPole"

In [3]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def test_agent(agent):
    
    count = 0    
    test_episode_rew = 0
    test_return = []
    
    done = False
    agent.dbqn.eval()
    
    obs = env.reset()
    act = agent.act(obs, use_sample=False, num_sample=0)
    
    while count <= 99:
        if done:
            test_return.append(test_episode_rew)
            test_episode_rew = 0
            count = count + 1
            
            obs = env.reset()
            act = agent.act(obs, use_sample=False, num_sample=0)
        
        obs1, rew, done, _ = env.step(act)       
        act = agent.act(obs1, use_sample=False, num_sample=0)        
        test_episode_rew = test_episode_rew + rew        
    
    agent.dbqn.train()
    return np.mean(np.array(test_return))

In [5]:
# Hyperparameters shared by every environment.
batch_size = 64

# Per-environment settings, keyed by the value of `envt`.
#   env_id        : id passed to gym.make
#   lr, gamma     : passed to BQN_learn in the training cell
#   steps         : training-step budget (compared against agent.t in the training loop)
#   buffer_size   : passed to BQN_learn in the training cell
#   features_list : passed to BayesianNetwork; presumably
#                   [input size, hidden size, number of actions] -- TODO confirm
# NOTE(review): Gym documents Pendulum-v0 as having a 3-dimensional observation
# and a continuous action space, so [2, 32, 5] may not match the raw
# environment -- confirm how observations/actions are mapped before using it.
ENV_SETTINGS = {
    "CartPole": dict(env_id='CartPole-v0', lr=1e-3, gamma=0.9,
                     steps=20000, buffer_size=20000, features_list=[4, 32, 2]),
    "Pendulum": dict(env_id='Pendulum-v0', lr=1e-2, gamma=0.9,
                     steps=200000, buffer_size=50000, features_list=[2, 32, 5]),
    "MountainCar": dict(env_id='MountainCar-v0', lr=1e-2, gamma=0.9,
                        steps=200000, buffer_size=50000, features_list=[2, 32, 3]),
}

# Fail early with a clear message instead of leaving steps/env undefined and
# hitting a NameError later in the training cell.
if envt not in ENV_SETTINGS:
    raise ValueError(
        "Unknown environment " + repr(envt)
        + "; expected one of " + str(sorted(ENV_SETTINGS))
    )

_settings = ENV_SETTINGS[envt]
lr = _settings["lr"]
gamma = _settings["gamma"]
steps = _settings["steps"]
buffer_size = _settings["buffer_size"]
features_list = _settings["features_list"]
env = gym.make(_settings["env_id"])

In [6]:
# Train `runs` independent agents and collect the per-episode training rewards
# of each run in `run_result` (one list per completed run).
runs = 5
max_episodes = 300    # per-run cap on the number of training episodes
test_interval = 25    # evaluate with test_agent every this many episodes
run_result = []

for run in range(runs):

    writer = SummaryWriter()
    try:
        # Online and target networks are built with identical arguments.
        # The second positional argument (4) is defined by BayesianNetwork,
        # which is not visible from this notebook.
        dbqn = BayesianNetwork(features_list, 4, batch_size, steps).to(DEVICE)
        target_dbqn = BayesianNetwork(features_list, 4, batch_size, steps).to(DEVICE)
        agent = BQN_learn(dbqn, target_dbqn, gamma, lr, batch_size, buffer_size, writer)

        done = False

        episode_rew = 0
        episode_count = 0
        res = []

        obs = env.reset()
        act = agent.reset(obs)

        # agent.t is presumably the agent's own step counter -- confirm in BQN_learn.
        while agent.t <= steps and episode_count < max_episodes:

            if done:
                # Episode finished: log it, then start a new one.
                print("Episode " + str(episode_count) + " with reward = " + str(episode_rew))
                writer.add_scalar('data/reward', episode_rew, episode_count)
                res.append(episode_rew)
                episode_rew = 0
                episode_count = episode_count + 1

                if episode_count % test_interval == 0:
                    # test_agent steps the shared global env; that is safe
                    # here because env is reset right after this branch.
                    test_result = test_agent(agent)
                    print("Test Result = " + str(test_result))
                    writer.add_scalar('data/test_reward', test_result, episode_count)

                # Disabled learning-rate decay experiment, kept for reference:
#             for param_group in agent.optimizer.param_groups:
#                 if param_group['lr'] > 1e-3:
#                     param_group['lr'] = 1e-2 - 1e-3*(episode_count//100)

                obs = env.reset()
                act = agent.reset(obs)

            obs1, rew, done, _ = env.step(act)
            # agent.step receives the transition and returns the next action.
            act = agent.step(obs, act, rew, obs1, done)
            obs = obs1
            episode_rew = episode_rew + rew

        # NOTE(review): every run exports to the same path, so the file on disk
        # only reflects the most recent export -- confirm whether per-run files
        # are wanted.
        writer.export_scalars_to_json("./all_scalars.json")
    finally:
        # Always release the writer, including on KeyboardInterrupt or errors.
        writer.close()

    run_result.append(res)

Episode 0 with reward = 8.0
Episode 1 with reward = 41.0
Episode 2 with reward = 13.0
Episode 3 with reward = 37.0
Episode 4 with reward = 31.0
Episode 5 with reward = 17.0
Episode 6 with reward = 13.0
Episode 7 with reward = 12.0
Episode 8 with reward = 20.0
Episode 9 with reward = 14.0
Episode 10 with reward = 44.0
Episode 11 with reward = 13.0
Episode 12 with reward = 24.0
Episode 13 with reward = 16.0
Episode 14 with reward = 11.0
Episode 15 with reward = 25.0
Episode 16 with reward = 29.0
Episode 17 with reward = 14.0
Episode 18 with reward = 26.0
Episode 19 with reward = 32.0
Episode 20 with reward = 95.0
Episode 21 with reward = 33.0
Episode 22 with reward = 13.0
Episode 23 with reward = 11.0
Episode 24 with reward = 29.0
Test Result = 9.36
Episode 25 with reward = 10.0
Episode 26 with reward = 24.0
Episode 27 with reward = 16.0
Episode 28 with reward = 23.0
Episode 29 with reward = 10.0
Episode 30 with reward = 17.0
Episode 31 with reward = 14.0
Episode 32 with reward = 18.0
Ep

Episode 262 with reward = 62.0
Episode 263 with reward = 50.0
Episode 264 with reward = 58.0
Episode 265 with reward = 39.0
Episode 266 with reward = 44.0
Episode 267 with reward = 16.0
Episode 268 with reward = 20.0
Episode 269 with reward = 61.0
Episode 270 with reward = 71.0
Episode 271 with reward = 52.0
Episode 272 with reward = 34.0
Episode 273 with reward = 58.0
Episode 274 with reward = 16.0
Test Result = 87.14
Episode 275 with reward = 27.0
Episode 276 with reward = 64.0
Episode 277 with reward = 51.0
Episode 278 with reward = 19.0
Episode 279 with reward = 121.0
Episode 280 with reward = 111.0
Episode 281 with reward = 26.0
Episode 282 with reward = 30.0
Episode 283 with reward = 23.0
Episode 284 with reward = 33.0
Episode 285 with reward = 128.0
Episode 286 with reward = 67.0
Episode 287 with reward = 82.0
Episode 288 with reward = 21.0
Episode 289 with reward = 96.0
Episode 290 with reward = 119.0
Episode 291 with reward = 73.0
Episode 292 with reward = 101.0
Episode 293 wi

KeyboardInterrupt: 