## YouTube and Code Reference - Nicholas Renotte
https://www.youtube.com/watch?v=Mut_u40Sqz4&list=PLgNJO2hghbmjlE6cuKMws2ejC54BTAaWV&index=8

https://github.com/nicknochnack/ReinforcementLearningCourse/blob/main/Project%201-Breakout.ipynb

### 1. Import dependencies

In [3]:
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

In [None]:
# Download the Atarimania ROMs archive from http://www.atarimania.com/rom_collection_archive_atari_2600_roms.html
# Unzip it and place the ROMS and HC-ROMS folders in the same directory as this Jupyter notebook.

### 2. Test Environment

In [None]:
# Help with installing atari-py:
# https://blog.csdn.net/ScienceVip/article/details/105097833
# https://giters.com/Kojoley/atari-py

In [4]:
# Sanity check: show every game id that the installed atari_py reports.
import atari_py

available_games = atari_py.list_games()
print(available_games)

['adventure', 'air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis', 'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival', 'centipede', 'chopper_command', 'crazy_climber', 'defender', 'demon_attack', 'double_dunk', 'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar', 'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kaboom', 'kangaroo', 'krull', 'kung_fu_master', 'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan', 'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing', 'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down', 'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']


In [5]:
# Run atari_py's `import_roms` module as a script on the unpacked .\ROMS\ROMS folder
# (Windows-style relative path, resolved against the notebook's working directory).
# NOTE(review): the recorded output for this cell reports
# "No module named atari_py.import_roms", so the import did not run in the
# environment that produced this notebook — confirm that the installed atari_py
# build actually ships `import_roms` before relying on this step.
!python -m atari_py.import_roms .\ROMS\ROMS

C:\Users\shilp\anaconda3\python.exe: No module named atari_py.import_roms


In [6]:
# Gym id of the game used in this notebook.
environment_name = "Breakout-v0"
# Plain (non-vectorised) gym environment, used below for the random-action test.
env = gym.make(environment_name)

In [7]:
# Baseline: play a few episodes with randomly sampled actions and print the
# total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    observation = env.reset()
    score = 0
    done = False

    # Step until the environment signals the end of the episode.
    while not done:
        env.render()
        random_action = env.action_space.sample()
        next_observation, reward, done, step_info = env.step(random_action)
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))

# Release the environment (and its render window) once all episodes are played.
env.close()



Episode:1 Score:0.0
Episode:2 Score:0.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:2.0


In [8]:
# Draw one random action from the environment's action space to see what an action looks like.
env.action_space.sample()

1

In [9]:
# Draw one random sample from the observation space to inspect its array structure.
env.observation_space.sample()

array([[[ 44,  39,  61],
        [236,  13,  70],
        [144,   0,  85],
        ...,
        [228, 210, 109],
        [161, 151, 208],
        [ 97, 163, 150]],

       [[124,  15,  76],
        [160, 222,  10],
        [ 23, 222, 110],
        ...,
        [ 16,  73, 150],
        [ 75,  84,  36],
        [198, 234, 108]],

       [[213,  89, 144],
        [ 81,  28,   1],
        [ 17, 167,  88],
        ...,
        [ 25,  17,  21],
        [129, 174,  42],
        [230,  28,  46]],

       ...,

       [[163, 113, 124],
        [108,  64, 236],
        [184, 204, 173],
        ...,
        [ 65, 192, 189],
        [ 30, 109, 227],
        [241,  43, 251]],

       [[ 88,  76,  97],
        [  6, 104, 159],
        [  6, 244,  75],
        ...,
        [221,  89, 174],
        [ 67, 203,  69],
        [  6, 122,   3]],

       [[139,   0, 246],
        [ 51,  10,  98],
        [188, 163, 221],
        ...,
        [126, 249,  26],
        [ 65,  96,  71],
        [230, 159, 204]]

### 3. Vectorise Environment and Train Model

In [10]:
# Build the training environment: four copies of the game created through
# make_atari_env, then wrapped so that four consecutive frames are stacked
# into each observation.
# The game id comes from `environment_name` (defined in the test section) so it
# is set in exactly one place instead of being repeated as a string literal.
env = make_atari_env(environment_name, n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)

In [11]:
# Train an A2C agent with a CNN policy on the vectorised environment.
# TensorBoard logs are written under Training/Logs.
log_path = os.path.join('Training', 'Logs')
model = A2C(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
model.learn(total_timesteps=30000)

Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 272      |
|    ep_rew_mean        | 1.32     |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 100      |
|    time_elapsed       | 37       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.33    |
|    explained_variance | -0.384   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0288  |
|    value_loss         | 0.203    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 270      |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 64       |
|    iterations         | 200      |
|    time_elapsed      

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 327      |
|    ep_rew_mean        | 2.53     |
| time/                 |          |
|    fps                | 80       |
|    iterations         | 1400     |
|    time_elapsed       | 348      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -1.19    |
|    explained_variance | 0.745    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.0405  |
|    value_loss         | 0.0505   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 343      |
|    ep_rew_mean        | 2.91     |
| time/                 |          |
|    fps                | 79       |
|    iterations         | 1500     |
|    time_elapsed       | 377      |
|    total_timesteps    | 30000    |
| train/                |          |
|

KeyboardInterrupt: 

### 4. Save and Reload Model

In [12]:
# Location of the saved model, relative to the notebook's working directory.
a2c_path = os.path.join('Training', 'A2C_model')
# Persist the trained agent so it can be reloaded without retraining.
model.save(a2c_path)

In [13]:
# Drop the in-memory model so the reload below is demonstrably loading from disk.
del model

In [14]:
# Rebuild the environment with a single copy of the game (still stacking four
# frames per observation) and load the saved agent against it.
# The game id comes from `environment_name` so that evaluation always uses the
# same game as the rest of the notebook instead of a repeated string literal.
env = make_atari_env(environment_name, n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)
model = A2C.load(a2c_path, env)

Wrapping the env in a VecTransposeImage.


### 5. Evaluate and Test

In [15]:
# Evaluate the loaded agent over 10 episodes while rendering the game.
# The displayed result is a 2-tuple; presumably (mean reward, std of reward) — confirm against the SB3 docs.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(4.3, 2.238302928559939)

In [None]:
# Watch the trained agent play for a bounded number of steps.
# A stable-baselines3 vectorised env resets itself whenever an episode ends, so
# an unconditional `while True` loop never terminates on its own: it blocks
# "Restart & Run All" and the closing cell is only reached after a manual
# interrupt. A fixed step budget keeps this cell finite.
MAX_RENDER_STEPS = 2000  # raise this to watch the agent for longer
obs = env.reset()
for step in range(MAX_RENDER_STEPS):
    # Ask the policy for an action, apply it, and draw the resulting frame.
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [16]:
# Release the evaluation environment (and its render window) when finished.
env.close()