In [1]:
import retro
from gym import Env # to wrap the environment
from gym.spaces import MultiBinary, Box # 
import numpy as np # to calculate the delta between the frames
import cv2 # for grayscaling

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


In [2]:
# Creating custom environment that will carry out all the steps
# we pass our pass environment

class StreetFighter(Env): 
    def __init__(self):
        super().__init__()
        # Specify action space and observation space 
        # resizing and making gray scale
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 
        self.action_space = MultiBinary(12)
        # Startup and instance of the game 
        # additional parameter to filter only valid actions
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
    
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs # want to also keep track of the previous frame to calculate a delta between the frames
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def preprocess(self, observation): 
        # Grayscaling 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize 
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84,84,1))
        return channels 
    
    def step(self, action): 
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 
        
        # Frame delta - use this to train our agent
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs 
        
        # Reshape the reward function
        reward = info['score'] - self.score 
        self.score = info['score'] 
        
        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()

In [3]:
# evaluate the model
# Bring in the eval policy method for metric calculation

timesteps = 1000
model = PPO.load(f"training/models/PPO_{timesteps}_SF")


In [4]:
# create environment
env = StreetFighter()
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=5)




In [5]:
mean_reward, _ 

(2000.0, 0.0)

In [12]:
env.close()

In [6]:
# check logs
log_path = 'training/logs/PPO_10'
!tensorboard --logdir={log_path}


TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.12.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
# hyperparameter tuning

# install libraries for hyperparameter tuning
!pip3 install torch torchvision optuna

# resource: https://optuna.org/

In [13]:
# Importing the optimzation frame - HPO
import optuna

In [14]:
# Hyperparameter tuning - very important when it comes to RL
# Function to return test hyperparameters - define the object function

def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 500, 1000),
        #'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
        #'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        #'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

# https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
# video: https://www.youtube.com/watch?v=6sNIDqgICLY

In [29]:
# Run a training loop and return mean reward 
def optimize_agent(trial):
    # create the experiment
    timesteps = 1000 # increase this + n_eval_episodes for smoother performance and number trials
    # do not hyperparameter tune on large timesteos though
    save_path = f"training/models/opt/PPO_{timesteps}_SF_{trial.number}"

    model_params = optimize_ppo(trial) 

    # Create environment 
    env = StreetFighter()

    # Create algo 
    model = PPO('CnnPolicy', env, tensorboard_log="training/Logs", verbose=1, **model_params)
    model.learn(total_timesteps=timesteps)

    # Evaluate model 
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    env.close()

    model.save(save_path)

    return mean_reward


In [30]:
# Creating the experiment 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1) # training for n x number of steps
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

[I 2024-06-07 21:48:31,330] A new study created in memory with name: no-name-87c3b552-db05-4dbb-a1b2-7cc45406076d
  'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=814 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_13
----------------------------
| time/              |     |
|    fps             | 385 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 814 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 1628        |
| train/                  |             |
|    approx_kl            | 0.008379249 |
|    clip_fraction        | 0.0532      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.31       |
|    explained_variance   | -0.000622   |
|    learning_rate        | 2.3e-05     |
|    loss                 | 2.38e+03    |
|    n_updates            | 10          |


[I 2024-06-07 21:49:48,996] Trial 0 finished with value: 1000.0 and parameters: {'n_steps': 814, 'learning_rate': 2.295270625385499e-05}. Best is trial 0 with value: 1000.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=915 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_14
----------------------------
| time/              |     |
|    fps             | 314 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 915 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 100          |
|    iterations           | 2            |
|    time_elapsed         | 18           |
|    total_timesteps      | 1830         |
| train/                  |              |
|    approx_kl            | 0.0059193387 |
|    clip_fraction        | 0.0255       |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.32        |
|    explained_variance   | -0.000259    |
|    learning_rate        | 1.43e-05     |
|    loss                 | 23.3         |
|    n_updates            | 

[I 2024-06-07 21:52:08,262] Trial 1 finished with value: 2000.0 and parameters: {'n_steps': 915, 'learning_rate': 1.4257329420365463e-05}. Best is trial 1 with value: 2000.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=599 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_15
----------------------------
| time/              |     |
|    fps             | 293 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 599 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 1198        |
| train/                  |             |
|    approx_kl            | 0.006880082 |
|    clip_fraction        | 0.0501      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.32       |
|    explained_variance   | -13.4       |
|    learning_rate        | 9.53e-05    |
|    loss                 | -0.0795     |
|    n_updates            | 10          |


[I 2024-06-07 21:53:40,641] Trial 2 finished with value: 3300.0 and parameters: {'n_steps': 599, 'learning_rate': 9.533441326698291e-05}. Best is trial 2 with value: 3300.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=974 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_16
----------------------------
| time/              |     |
|    fps             | 308 |
|    iterations      | 1   |
|    time_elapsed    | 3   |
|    total_timesteps | 974 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 102          |
|    iterations           | 2            |
|    time_elapsed         | 19           |
|    total_timesteps      | 1948         |
| train/                  |              |
|    approx_kl            | 0.0099665485 |
|    clip_fraction        | 0.0529       |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.31        |
|    explained_variance   | 0.000351     |
|    learning_rate        | 2.94e-05     |
|    loss                 | 2.83e+03     |
|    n_updates            | 

[I 2024-06-07 21:57:55,056] Trial 3 finished with value: 56700.0 and parameters: {'n_steps': 974, 'learning_rate': 2.9448191982907483e-05}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=630 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_17
----------------------------
| time/              |     |
|    fps             | 332 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 630 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 1260        |
| train/                  |             |
|    approx_kl            | 0.012875857 |
|    clip_fraction        | 0.0579      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.31       |
|    explained_variance   | 0.000959    |
|    learning_rate        | 4.04e-05    |
|    loss                 | 1.8e+03     |
|    n_updates            | 10          |


[I 2024-06-07 22:00:13,746] Trial 4 finished with value: 2000.0 and parameters: {'n_steps': 630, 'learning_rate': 4.040120879188815e-05}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=770 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_18
----------------------------
| time/              |     |
|    fps             | 375 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 770 |
----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 115       |
|    iterations           | 2         |
|    time_elapsed         | 13        |
|    total_timesteps      | 1540      |
| train/                  |           |
|    approx_kl            | 0.5318988 |
|    clip_fraction        | 0.373     |
|    clip_range           | 0.2       |
|    entropy_loss         | -8.18     |
|    explained_variance   | 0.000611  |
|    learning_rate        | 0.000114  |
|    loss                 | 1.67e+04  |
|    n_updates            | 10        |
|    policy_gradient_loss | -0

[I 2024-06-07 22:01:39,046] Trial 5 finished with value: 4900.0 and parameters: {'n_steps': 770, 'learning_rate': 0.00011372450859659157}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=833 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_19
----------------------------
| time/              |     |
|    fps             | 394 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 833 |
----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 112        |
|    iterations           | 2          |
|    time_elapsed         | 14         |
|    total_timesteps      | 1666       |
| train/                  |            |
|    approx_kl            | 0.18491146 |
|    clip_fraction        | 0.376      |
|    clip_range           | 0.2        |
|    entropy_loss         | -8.18      |
|    explained_variance   | -0.000593  |
|    learning_rate        | 0.000202   |
|    loss                 | 42.9       |
|    n_updates            | 10         |
|    policy_gra

[I 2024-06-07 22:03:08,481] Trial 6 finished with value: 1000.0 and parameters: {'n_steps': 833, 'learning_rate': 0.00020184850051436205}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=706 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_20
----------------------------
| time/              |     |
|    fps             | 319 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 706 |
----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 102       |
|    iterations           | 2         |
|    time_elapsed         | 13        |
|    total_timesteps      | 1412      |
| train/                  |           |
|    approx_kl            | 26.691061 |
|    clip_fraction        | 0.826     |
|    clip_range           | 0.2       |
|    entropy_loss         | -5.22     |
|    explained_variance   | -0.0018   |
|    learning_rate        | 0.000984  |
|    loss                 | 103       |
|    n_updates            | 10        |
|    policy_gradient_loss | 0.

[I 2024-06-07 22:05:02,148] Trial 7 finished with value: 9200.0 and parameters: {'n_steps': 706, 'learning_rate': 0.000984269778949596}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=613 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_21
----------------------------
| time/              |     |
|    fps             | 395 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 613 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 110          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 1226         |
| train/                  |              |
|    approx_kl            | 0.0019855374 |
|    clip_fraction        | 0.00487      |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.32        |
|    explained_variance   | -5.92        |
|    learning_rate        | 1.45e-05     |
|    loss                 | -0.0441      |
|    n_updates            | 

[I 2024-06-07 22:07:00,412] Trial 8 finished with value: 2700.0 and parameters: {'n_steps': 613, 'learning_rate': 1.4492614573932641e-05}. Best is trial 3 with value: 56700.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=517 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to training/Logs/PPO_22
----------------------------
| time/              |     |
|    fps             | 352 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 517 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 110          |
|    iterations           | 2            |
|    time_elapsed         | 9            |
|    total_timesteps      | 1034         |
| train/                  |              |
|    approx_kl            | 0.0050890213 |
|    clip_fraction        | 0.0119       |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.32        |
|    explained_variance   | 0.000427     |
|    learning_rate        | 2e-05        |
|    loss                 | 34.9         |
|    n_updates            | 

[I 2024-06-07 22:08:49,135] Trial 9 finished with value: 2000.0 and parameters: {'n_steps': 517, 'learning_rate': 2.0040580824160887e-05}. Best is trial 3 with value: 56700.0.


In [33]:
study.best_params

# pass these values in your model

{'n_steps': 974, 'learning_rate': 2.9448191982907483e-05}

In [34]:
# load best model

model = PPO.load("training/models/opt/PPO_1000_SF_3")

In [37]:
# Create environment 
env = StreetFighter()

mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=5)



In [38]:
mean_reward, _

(56700.0, 0.0)

In [39]:
env.close()

In [None]:
# for fine tuning on further steps, watch the tutorial here:
# https://www.youtube.com/watch?v=rzbFhu6So5U (1 hour 59 minutes)

**Exam Project: You have 24 hours - Can you improve the score of the AI agent?**

Some suggestions, aside from increasing the training steps (which is not enough to get an excellent grade)..
- Could you try a different algorithm for e.g. Deep Q-Network? 
- Hyperparameter Tuning
- Experiment with a different reward function?

You are expected to increase the score of the current AI agent and showcase the experiments you carried out to increase the score. The best score will be the overall winner but you will be graded on how you applied the concepts from the course and your own research to increase the score.

Resource: https://www.youtube.com/watch?v=rzbFhu6So5U