# Breakout

In [None]:
!pip install stable-baselines3[extra]
!pip3 install torch torchvision torchaudio
!pip install pyglet
!pip3 install atari_py

In [1]:
import gym
import os                                                        # System (to save models and logs)
from stable_baselines3 import A2C                                # Import the algorithm we'll use
from stable_baselines3.common.env_util import make_atari_env     # Create multiple env
from stable_baselines3.common.vec_env import VecFrameStack       # Train on them at the same time
from stable_baselines3.common.evaluation import evaluate_policy  # Test the model

  from .autonotebook import tqdm as notebook_tqdm


## Import the game ROMs (downloaded from http://www.atarimania.com/roms/Roms.rar)

In [9]:
# Run atari_py's `import_roms` module on the `.ROMS` path so the downloaded ROMs are registered.
# NOTE(review): `.ROMS` is a dot-prefixed relative path — confirm this is the intended
# folder name (and not, e.g., `./ROMS`).
!python3 -m atari_py.import_roms .ROMS

## Execute 5 rounds of the game with random actions

In [21]:
# Play a few episodes with uniformly random actions and print each episode's total reward.
env = gym.make('Breakout-v0')
rounds = 5
try:
    # The loop variable is `round_idx`, not `round`: a notebook global named `round`
    # would shadow the builtin for every later cell in the same kernel session.
    for round_idx in range(1, rounds + 1):
        env.reset()          # start a fresh episode (the initial observation is not needed)
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()
            # step() is unpacked as a 4-tuple; only the reward and the done flag are used.
            _obs, reward, done, _info = env.step(action)
            score += reward
        print('Round: {}, Score: {}'.format(round_idx, score))
finally:
    # Always release the environment, even if the cell is interrupted or a step raises.
    env.close()

Round: 1, Score: 1.0
Round: 2, Score: 2.0
Round: 3, Score: 2.0
Round: 4, Score: 1.0
Round: 5, Score: 2.0


## Use a vectorized environment to train the model on multiple environments at the same time

In [18]:
# Build 4 copies of the Breakout environment (seeded with 0) and wrap them so that
# 4 frames are stacked per observation.
# NOTE(review): Stable-Baselines3's Atari examples pair make_atari_env with
# `...NoFrameskip-v4` ids — confirm that 'Breakout-v0' is the intended id here.
atari_envs = make_atari_env('Breakout-v0', n_envs=4, seed=0)
env = VecFrameStack(atari_envs, n_stack=4)

## Declare the model and the directory to save the logs

In [19]:
# TensorBoard logs are written under Training/Logs.
log_path = os.path.join('Training', 'Logs')

# A2C agent using the 'CnnPolicy' policy on the environment built above.
model = A2C(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


## Train the model

In [20]:
# Train for 4 million timesteps; progress is logged to the TensorBoard directory set on the model.
model.learn(total_timesteps=4_000_000)

Logging to Training/Logs/A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 287      |
|    ep_rew_mean        | 1.68     |
| time/                 |          |
|    fps                | 311      |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.214    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.145   |
|    value_loss         | 0.0327   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 293      |
|    ep_rew_mean        | 1.82     |
| time/                 |          |
|    fps                | 313      |
|    iterations         | 200      |
|    time_elapsed       | 12       |
|    total_timesteps    | 4000     |
| train

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 319       |
|    ep_rew_mean        | 2.18      |
| time/                 |           |
|    fps                | 315       |
|    iterations         | 1400      |
|    time_elapsed       | 88        |
|    total_timesteps    | 28000     |
| train/                |           |
|    entropy_loss       | -0.0716   |
|    explained_variance | 0.991     |
|    learning_rate      | 0.0007    |
|    n_updates          | 1399      |
|    policy_loss        | -0.000263 |
|    value_loss         | 0.00355   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 320       |
|    ep_rew_mean        | 2.2       |
| time/                 |           |
|    fps                | 315       |
|    iterations         | 1500      |
|    time_elapsed       | 95        |
|    total_timesteps    | 30000     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 305      |
|    ep_rew_mean        | 1.93     |
| time/                 |          |
|    fps                | 315      |
|    iterations         | 2700     |
|    time_elapsed       | 170      |
|    total_timesteps    | 54000    |
| train/                |          |
|    entropy_loss       | -0.15    |
|    explained_variance | 0.993    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | 0.00117  |
|    value_loss         | 0.00292  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 308      |
|    ep_rew_mean        | 2        |
| time/                 |          |
|    fps                | 315      |
|    iterations         | 2800     |
|    time_elapsed       | 177      |
|    total_timesteps    | 56000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 321      |
|    ep_rew_mean        | 2.21     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 4000     |
|    time_elapsed       | 252      |
|    total_timesteps    | 80000    |
| train/                |          |
|    entropy_loss       | -0.187   |
|    explained_variance | 0.965    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | 0.000184 |
|    value_loss         | 0.00415  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 319      |
|    ep_rew_mean        | 2.18     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 4100     |
|    time_elapsed       | 259      |
|    total_timesteps    | 82000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 308      |
|    ep_rew_mean        | 2.01     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 5300     |
|    time_elapsed       | 334      |
|    total_timesteps    | 106000   |
| train/                |          |
|    entropy_loss       | -0.479   |
|    explained_variance | 0.994    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5299     |
|    policy_loss        | 0.00119  |
|    value_loss         | 0.00157  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 314      |
|    ep_rew_mean        | 2.12     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 5400     |
|    time_elapsed       | 340      |
|    total_timesteps    | 108000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 303      |
|    ep_rew_mean        | 1.88     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 6700     |
|    time_elapsed       | 423      |
|    total_timesteps    | 134000   |
| train/                |          |
|    entropy_loss       | -0.195   |
|    explained_variance | 0.971    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6699     |
|    policy_loss        | -0.00146 |
|    value_loss         | 0.00631  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 299      |
|    ep_rew_mean        | 1.82     |
| time/                 |          |
|    fps                | 316      |
|    iterations         | 6800     |
|    time_elapsed       | 429      |
|    total_timesteps    | 136000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 344      |
|    ep_rew_mean        | 2.8      |
| time/                 |          |
|    fps                | 315      |
|    iterations         | 8000     |
|    time_elapsed       | 507      |
|    total_timesteps    | 160000   |
| train/                |          |
|    entropy_loss       | -0.96    |
|    explained_variance | 0.705    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7999     |
|    policy_loss        | 0.118    |
|    value_loss         | 0.0637   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 342      |
|    ep_rew_mean        | 2.84     |
| time/                 |          |
|    fps                | 315      |
|    iterations         | 8100     |
|    time_elapsed       | 513      |
|    total_timesteps    | 162000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 437       |
|    ep_rew_mean        | 4.95      |
| time/                 |           |
|    fps                | 315       |
|    iterations         | 9300      |
|    time_elapsed       | 588       |
|    total_timesteps    | 186000    |
| train/                |           |
|    entropy_loss       | -0.0715   |
|    explained_variance | 0.972     |
|    learning_rate      | 0.0007    |
|    n_updates          | 9299      |
|    policy_loss        | -8.77e-05 |
|    value_loss         | 0.0402    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 442      |
|    ep_rew_mean        | 5.09     |
| time/                 |          |
|    fps                | 315      |
|    iterations         | 9400     |
|    time_elapsed       | 595      |
|    total_timesteps    | 188000   |
| train/             

<stable_baselines3.a2c.a2c.A2C at 0x177514820>

## Saving the model

In [21]:
# Persist the trained agent as Training/Saved Models/A2C_Breakout-4M.
saved_models_dir = os.path.join('Training', 'Saved Models')
a2c_path = os.path.join(saved_models_dir, 'A2C_Breakout-4M')
model.save(a2c_path)
# Optional: uncomment to drop the in-memory model and rely on the saved file instead.
# del model

## Loading a model

In [38]:
# Restore the saved 4M-timestep agent and attach it to the current environment.
saved_model_path = os.path.join('Training', 'Saved Models', 'A2C_Breakout-4M')
model = A2C.load(saved_model_path, env=env)

Wrapping the env in a VecTransposeImage.


## Test the model

### Evaluate on 10 games

In [22]:
# EVALUATE THE RESULTS (10 games)
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# Output with 100 000 timesteps : (7.5, 2.247220505424423)
# Output with 500 000 timesteps : (10.9, 3.2388269481403293)
# Output with 800 000 timesteps : (14.2, 4.833218389437829)
# Output with 1 000 000 timesteps : (16.7, 5.883026432033091)
# Output with 2 000 000 timesteps : (20.3, 9.17660067781093)
# Output with 3 000 000 timesteps : (17.9, 7.542545989253232)
# Output with 4 000 000 timesteps : (21.2, 6.615134163416491)

(5.5, 1.6881943016134133)

### Evaluate on 50 games

In [40]:
# EVALUATE THE RESULTS (50 games)
evaluate_policy(model, env, n_eval_episodes=50, render=True)
# Output with 100 000 timesteps : (7.64, 2.5981531902487967)
# Output with 500 000 timesteps : (10.82, 2.8683793333518497)
# Output with 800 000 timesteps : (16.44, 7.797845856388801)
# Output with 1 000 000 timesteps : (15.74, 5.712477571071942)
# Output with 2 000 000 timesteps : (19.56, 8.549058427686642)
# Output with 3 000 000 timesteps : (20.46, 8.66304796246679)
# Output with 4 000 000 timesteps : (17.88, 6.819501448053223)

(17.62, 7.959623106655239)

### Evaluate on 100 games

In [39]:
# EVALUATE THE RESULTS (100 games)
evaluate_policy(model, env, n_eval_episodes=100, render=True)
# Output with 100 000 timesteps : (7.07, 2.0555048041782826)
# Output with 500 000 timesteps : (10.66, 3.6829879174387745)
# Output with 800 000 timesteps : (16.13, 7.400885082204695)
# Output with 1 000 000 timesteps : (16.44, 6.2214467770768564)
# Output with 2 000 000 timesteps : (17.88, 7.303807226371737)
# Output with 3 000 000 timesteps : (19.99, 8.09875916421769)
# Output with 4 000 000 timesteps : (20.59, 8.037530715337889)

(19.99, 8.09875916421769)