# 1. Import Dependencies

In [1]:
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

  from .autonotebook import tqdm as notebook_tqdm


# 2. Test Environment

In [2]:
environment_name = "Breakout-v0"

In [3]:
env = gym.make(environment_name)

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [4]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

  logger.warn(


Episode:1 Score:2.0
Episode:2 Score:2.0
Episode:3 Score:2.0
Episode:4 Score:3.0
Episode:5 Score:3.0


In [5]:
env.action_space.sample()

0

In [6]:
env.observation_space.sample()

array([[[109, 140, 131],
        [244,  65, 117],
        [182, 226,  50],
        ...,
        [ 98,  27, 141],
        [  5,  85, 111],
        [ 46, 216, 169]],

       [[221, 178, 154],
        [ 87,  93, 230],
        [230,  23, 151],
        ...,
        [ 89,  94, 248],
        [ 36,  93,  87],
        [165, 242, 243]],

       [[134,  79,  72],
        [200,  37,  76],
        [ 26, 159,  67],
        ...,
        [176, 131, 172],
        [205, 185, 140],
        [113, 105, 110]],

       ...,

       [[242,  67, 111],
        [ 37,  37,  47],
        [226, 242, 127],
        ...,
        [ 91, 207, 113],
        [203, 201,  94],
        [233,  11, 207]],

       [[ 19,  73, 199],
        [ 52, 187, 129],
        [157,  76, 117],
        ...,
        [ 81, 233,  63],
        [ 77, 141, 115],
        [ 53,  47, 141]],

       [[ 71, 130,  31],
        [102, 157, 221],
        [102,  37,  24],
        ...,
        [228, 135, 198],
        [114, 148, 190],
        [193,  57,  37]]

# 3. Vectorise Environment and Train Model

In [7]:
env = make_atari_env('Breakout-v0', n_envs=4, seed=0)

In [8]:
env = VecFrameStack(env, n_stack=4)

In [9]:
log_path = os.path.join('Training', 'Logs')

In [10]:
model = A2C("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [31]:
model.learn(total_timesteps=100000)

Logging to Training/Logs/A2C_7
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 609      |
|    ep_rew_mean        | 8.67     |
| time/                 |          |
|    fps                | 196      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.232   |
|    explained_variance | 0.955    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11317    |
|    policy_loss        | 0.0168   |
|    value_loss         | 0.0103   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 650       |
|    ep_rew_mean        | 9.5       |
| time/                 |           |
|    fps                | 195       |
|    iterations         | 200       |
|    time_elapsed       | 5         |
|    total_timesteps    | 1000      

<stable_baselines3.a2c.a2c.A2C at 0x7fca6160df40>

# 4. Save and Reload Model

In [32]:
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_model')

In [33]:
model.save(a2c_path)

In [34]:
del model

In [35]:
env = make_atari_env('Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

In [36]:
model = A2C.load(a2c_path, env)

Wrapping the env in a VecTransposeImage.


# 5. Evaluate and Test

In [39]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(12.0, 3.9496835316262997)

In [38]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(9.7, 2.5317977802344322)

In [22]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

KeyboardInterrupt: 

In [41]:
env.close()