# 1. Import Dependencies

In [1]:
import gymnasium as gym
import ale_py
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack # multi-env wrapper
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env # helper to create Atari env
import os

# 2. Test Environment

In [None]:
environment_name = "ALE/Breakout-v5"

In [None]:
env = gym.make(environment_name)

In [None]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        action = env.action_space.sample()
        n_state, reward, truncated, terminated, info = env.step(action)
        done = truncated or terminated
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.action_space.sample()

In [None]:
env.observation_space.sample()

# 3. Vectorise Environment and Train Model

In [2]:
env = make_atari_env('ALE/Breakout-v5', n_envs=4, seed=0) # 4 parallel envs

A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


In [3]:
env = VecFrameStack(env, n_stack=4) # stack 4 frames

In [4]:
log_path = os.path.join('Training', 'Logs')

In [5]:
model = A2C("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [6]:
model.learn(total_timesteps=500000)

Logging to Training/Logs/A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 231      |
|    ep_rew_mean        | 1.75     |
| time/                 |          |
|    fps                | 288      |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.91     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.189   |
|    value_loss         | 0.0301   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 232      |
|    ep_rew_mean        | 1.77     |
| time/                 |          |
|    fps                | 294      |
|    iterations         | 200      |
|    time_elapsed       | 13       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x141b9e550>

# 4. Save and Reload Model

In [7]:
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_500k_model')

In [8]:
model.save(a2c_path)

In [None]:
del model

In [None]:
env = make_atari_env('ALE/Breakout-v5', n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)

In [None]:
model = A2C.load(a2c_path, env)

# 5. Evaluate and Test

In [22]:
import os
import numpy as np # Import numpy to calculate mean/std
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# --- 1. Define Paths and Load Model ---
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_model.zip')
model = A2C.load(a2c_path)


# --- 2. Create the Environment for Evaluation ---
env = make_atari_env('ALE/Breakout-v5', n_envs=1, env_kwargs={'render_mode': 'human'})
env = VecFrameStack(env, n_stack=4)


# --- 3. Evaluate the Agent ---
episode_rewards, episode_lengths = evaluate_policy(
    model,
    env,
    n_eval_episodes=5,
    deterministic=True,
    render=True,
    return_episode_rewards=True # ==> This is the key change
)


# --- 4. Close Environment and Print Results ---
env.close()

# Loop through the results to print each episode's reward
print("\n--- Individual Episode Rewards ---")
for i, reward in enumerate(episode_rewards):
    print(f"Episode {i + 1}: {reward:.2f}")

# You can still calculate and print the mean and std yourself
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)

print("\n--- Summary ---")
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


--- Individual Episode Rewards ---
Episode 1: 14.00
Episode 2: 17.00
Episode 3: 16.00
Episode 4: 22.00
Episode 5: 13.00

--- Summary ---
Mean reward: 16.40 +/- 3.14
