In [2]:
import gym
from AgentRL.agents import DQN
from AgentRL.common.buffers import prioritised_replay_buffer, standard_replay_buffer

import torch

def train(env, *, training=True, render=False, display_freq=10,
          episodes=1000, timestep_limit=100, seed=3):
    """Run a DQN agent on ``env`` and print the mean episode reward periodically.

    The environment is seeded, a DQN agent backed by a prioritised replay
    buffer is built from the environment's observation/action spaces, and the
    agent is then run for ``episodes`` episodes. The environment is always
    closed before this function returns, including when an exception (for
    example ``KeyboardInterrupt``) escapes the loop.

    Args:
        env: Environment exposing ``seed``, ``reset``, ``step``, ``render``,
            ``close``, ``observation_space.shape`` and ``action_space.n``.
            ``step`` must return a 4-tuple ``(next_state, reward, done, info)``.
        training: When true, every transition is pushed to the replay buffer
            and ``agent.update()`` is called once per environment step.
        render: When true, ``env.render`` is called after every step.
        display_freq: Print the mean episode reward every ``display_freq``
            episodes. Must be a positive integer.
        episodes: Number of episodes to run.
        timestep_limit: Maximum number of steps per episode before the
            episode is cut off.
        seed: Seed passed to the environment, the replay buffer and the agent.

    Returns:
        None. Progress is reported through ``print``.
    """
    try:
        # initialise the environment
        env.seed(seed)

        # initialise the agent; standard_replay_buffer is the non-prioritised
        # alternative imported alongside prioritised_replay_buffer
        buffer = prioritised_replay_buffer(max_size=10_000, seed=seed)
        agent = DQN(
            state_dim=env.observation_space.shape[0],
            action_num=env.action_space.n,
            replay_buffer=buffer,

            algorithm_type='default',
            hidden_dim=16,
            learning_rate=5e-4,
            batch_size=32,
            gamma=0.95,

            target_update_method='hard',
            tau=0.01,  # for soft
            target_update_freq=20,  # for hard

            exploration_method="greedy",

            categorical=False,
            v_range=(0, 200),
            atom_size=51,

            multi_step=1,

            seed=seed,
        )

        # episode rewards collected since the last progress print
        running_reward = []

        for ep in range(1, episodes + 1):

            # reset the state
            state, done = env.reset(), False
            counter = 0
            episode_reward = 0

            # run one episode
            while not done:

                action = agent.get_action(state=state.flatten())
                next_state, reward, done, _ = env.step(action=action[0])

                # render the environment
                # NOTE(review): 'close' is passed as the render mode exactly as
                # in the original script - confirm it is a mode the installed
                # gym version accepts before enabling `render`.
                if render:
                    env.render(mode='close')

                # update the reward total (unscaled, used for reporting only)
                episode_reward += reward

                if training:
                    # store the transition with the reward scaled down by 100,
                    # then take one learning step
                    agent.push(state=state, action=action,
                               next_state=next_state, reward=reward / 100,
                               done=done)
                    agent.update()

                # update the state
                state = next_state
                counter += 1

                # cut the episode off at the step limit; this happens after
                # the push above, so the stored `done` flag is the
                # environment's own and not this artificial cut-off
                if counter >= timestep_limit:
                    done = True

            # episode finished: record its reward and report periodically
            running_reward.append(episode_reward)

            if ep % display_freq == 0:
                # running_reward holds exactly the episodes since the last
                # print, so its length equals display_freq here
                print('Ep {} - Mean Reward {}'.format(
                    ep, sum(running_reward) / len(running_reward)))
                running_reward = []

    finally:
        # close the display even if training was interrupted or failed
        env.close()
    
if __name__ == "__main__":

    # get the environment
    env = gym.make("CartPole-v0")

    # run the program
    try:
        train(env)

    # a manual interrupt is an expected way to stop training early,
    # so exit quietly instead of printing a traceback
    except KeyboardInterrupt:
        pass

    # shut the render window / release the environment however training
    # ended, not only when it was interrupted from the keyboard
    finally:
        env.close()

--------------------
DQN SETTINGS:
--------------------
State dim: 4
Action dim: 1
Input type: array
Seed: 3
Device: cpu

Hyperparameters:
--------------------
Algorithm type: default
Hidden dimensions: 16
Batch size: 32
Discount factor: 0.95
Learning rate: 0.0005
Steps per network update: 1
Target Update Method: hard
Steps per target update: 20
Replay type: prioritised
Exploration Method: greedy
Starting exploration: 1.0
Exploration decay factor: 0.999
Minimum exploration: 0.01
Categorical Learning: False
--------------------

Ep 10 - Mean Reward 15.8
Ep 20 - Mean Reward 23.7
Ep 30 - Mean Reward 17.9
Ep 40 - Mean Reward 17.3
Ep 50 - Mean Reward 19.9
Ep 60 - Mean Reward 20.0
Ep 70 - Mean Reward 22.2
Ep 80 - Mean Reward 26.9
Ep 90 - Mean Reward 35.4
Ep 100 - Mean Reward 31.9
Ep 110 - Mean Reward 28.7
Ep 120 - Mean Reward 30.6
Ep 130 - Mean Reward 29.3
Ep 140 - Mean Reward 34.2
Ep 150 - Mean Reward 51.0
Ep 160 - Mean Reward 64.4
Ep 170 - Mean Reward 81.6
Ep 180 - Mean Reward 98.9
