# Dependências

In [1]:
import os
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.evaluation import evaluate_policy

# Ambiente

In [15]:
# Gym environment id that the following cell passes to gym.make.
environment_name = 'Breakout-v0'

In [16]:
# Instantiate one environment from the id stored in `environment_name`.
# The random-play cell below steps this object directly with sampled actions.
env = gym.make(environment_name)

In [9]:
# Baseline sanity check: play a few episodes with uniformly sampled actions,
# render every frame, and print the cumulative reward of each episode.
#
# NOTE(review): the TypeError recorded for this cell ("'int' object is not
# subscriptable") looks like the cell was executed while `env` was bound to a
# vectorized environment (whose step() expects one action per sub-env) rather
# than the plain gym.make environment -- confirm, and run this cell right
# after the gym.make cell.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # Random policy: draw an action from the env's action space.
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the environment (and its render window), even when an
    # episode raises; previously a failure inside the loop skipped close().
    env.close()

TypeError: 'int' object is not subscriptable

# Vetorizando ambiente e treinando modelo


In [17]:
# Training environment: four Atari environments created with seed 0, then
# wrapped so each observation stacks the last four frames.
# NOTE(review): the Stable-Baselines3 Atari examples use "...NoFrameskip-v4"
# ids with make_atari_env; consider whether 'Breakout-v0' is intended here and
# in the evaluation cell -- confirm before changing, both must stay in sync.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=4, seed=0),
    n_stack=4,
)

In [18]:
# TensorBoard event files are written under Training/Logs.
log_path = os.path.join('Training', 'Logs')

# A2C agent with the CNN policy on the current `env`;
# verbose=1 prints the periodic training tables.
model = A2C(
    policy='CnnPolicy',
    env=env,
    tensorboard_log=log_path,
    verbose=1,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [19]:
# Train for 100k environment steps in total; progress is logged under `log_path`.
model.learn(total_timesteps=100_000)

Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 263      |
|    ep_rew_mean        | 1.15     |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 100      |
|    time_elapsed       | 36       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.359    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.00866  |
|    value_loss         | 0.0105   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 266      |
|    ep_rew_mean        | 1.25     |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 200      |
|    time_elapsed       | 72       |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 306      |
|    ep_rew_mean        | 2.02     |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 1400     |
|    time_elapsed       | 515      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | 0.834    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.0238  |
|    value_loss         | 0.0343   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 313      |
|    ep_rew_mean        | 2.14     |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 1500     |
|    time_elapsed       | 554      |
|    total_timesteps    | 30000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 400      |
|    ep_rew_mean        | 4.33     |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 2800     |
|    time_elapsed       | 1041     |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -0.519   |
|    explained_variance | 0.442    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | -0.0693  |
|    value_loss         | 0.16     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 380      |
|    ep_rew_mean        | 3.88     |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 2900     |
|    time_elapsed       | 1070     |
|    total_timesteps    | 58000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 398      |
|    ep_rew_mean        | 4.01     |
| time/                 |          |
|    fps                | 57       |
|    iterations         | 4200     |
|    time_elapsed       | 1462     |
|    total_timesteps    | 84000    |
| train/                |          |
|    entropy_loss       | -0.231   |
|    explained_variance | -0.0506  |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 0.133    |
|    value_loss         | 0.0902   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 410      |
|    ep_rew_mean        | 4.27     |
| time/                 |          |
|    fps                | 57       |
|    iterations         | 4300     |
|    time_elapsed       | 1493     |
|    total_timesteps    | 86000    |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1884c1d41c0>

# Salvando e recarregando

In [2]:
# File path (relative to the working directory) used by the save and load cells.
a2c_path = os.path.join(
    "Training",
    "Saved Models",
    "A2C_Breakout_Model",
)

In [21]:
# Persist the trained agent to `a2c_path`.
model.save(path=a2c_path)

In [22]:
# Drop the in-memory agent so the next cell has to restore it from the saved file.
del model

In [4]:
# Restore the agent saved at `a2c_path` and attach it to the current `env`.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


# Testando

In [3]:
# Evaluation environment: a single Atari environment (seed 0) with the same
# four-frame stacking that was used for training.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=1, seed=0),
    n_stack=4,
)

In [5]:
# Run the agent for 10 rendered episodes; the returned tuple is displayed as
# the cell output.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)



(6.5, 3.4713109915419564)

In [None]:
# Close whatever environment `env` currently refers to (the evaluation
# environment when the cells are run top to bottom).
env.close()