### Final Project : Deep Reinforcement Learning Extended

### Stanley Chou RUID:193005065 , Anis Chihoub, Sunny Chen
### Due: 12/16/2022

#1 Imports and Making sure the Environment Works

In [1]:
import gym

# Import retro to play Street Fighter using a ROM
import retro
# Import time to slow down game
import time
import torch
#import google drive from colab in order to avoid reuploading


# See the different retro games
#retro.data.list_games()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
print(device)

cuda


In [3]:
#General Imports

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from collections import namedtuple, deque
from itertools import count
from PIL import Image
from gym.wrappers import Monitor
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import math
import glob
import io
import base64
from IPython.display import HTML

In [4]:
#load our environment
env = retro.make(game= 'StreetFighterIISpecialChampionEdition-Genesis')



# Need to add preprocessing - grey scale the screen, get the frame differences currentscreen -last screen, possibly downsize the data
#adjust the reward function to make it less sparse - and make it equal to the score of the game + the health you take away from the opponent, and minus the health you lose

In [5]:


# Import the space shapes for the environment
from gym import spaces
from gym.spaces import MultiBinary, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import opencv for grayscaling
import cv2




     


In [6]:
#The custom environment is necessarry for us to implement our own loss functions

# Create custom environment 
class StreetFighter(gym.Env): 
    def __init__(self):
        super().__init__()
        # Specify action space and observation space 
        #The dimensions for the game environment itself is this in gray scale)
        self.observation_space = Box(low=0, high=255, shape=(64, 64, 1), dtype=np.uint8)
        #12 possible button presses on the fighting controller hence multibinary 12
        self.action_space = MultiBinary(12)
        # Startup and instance of the game 
        #the filter parameter forces the agent to only use valid button combinations e.g no passing [1,1,1,1,1,1,1,1,1,1,1,1] as the action vector and getting something 
        # (this example vector is equivalent to a person holding all the buttons on the arcade box controller and inputting all 4 directions at the same time which is impossible and does nothing so don't do it)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions = retro.Actions.FILTERED)
    
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        #reset state so the previous frame in this case is the initial frame
        self.previous_frame = obs 
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        # Create a attribute to hold the Player Health delta
        self.health = 176 
        # Create a attribute to hold the Enemy Health delta The default health for characters at each round is 176
        self.enemy_health = 176
        self.matches_won = 0
        self.enemy_matches_won = 0
        return obs
    
    #Grayscaling and resizing the image to improve training time since we are poor and need computations to be easy
    #Original image is 200x256x3 so we are turning it into a grayscale image of half the y dimension 128x128x1 (originally we did 100x128x1 but for some reason square images looked and did better)
    def preprocess(self, observation): 
        # Grayscaling the frame
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize (128x128)
        resize = cv2.resize(gray, (64,64), interpolation=cv2.INTER_CUBIC)
        # Add the channels value (128x128z1)
        channels = np.reshape(resize, (64,64,1))
        #print(channels.shape)
        return channels
    
    def step(self, action): 
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        #preprocess the new game frame
        obs = self.preprocess(obs) 
        
        # Frame delta newframe - old frame
        frame_delta = obs - self.previous_frame
        #print(frame_delta.shape)
        self.previous_frame = obs 
        
        # Reshape the reward function ################################################################## This is where we adjust the game reward
        score_delta = info['score'] - self.score 
        health_delta = info['health'] - self.health
        enemy_health_delta = info['enemy_health'] - self.enemy_health
        reward = score_delta + 50*health_delta - 50*enemy_health_delta
        # add 500 for win - 500 for loss 
        if (self.matches_won != info['matches_won'] and info['matches_won'] != 0):
            reward = reward + 500
        if (self.enemy_matches_won != info['enemy_matches_won'] and info['enemy_matches_won'] != 0):
            reward = reward - 500
        
        #reward function losing health is bad hurting the enemy is good, getting points is good
        self.score = info['score'] 
        self.health = info['health']
        self.enemy_health = info['enemy_health']
        self.matches_won = info['matches_won']
        self.enemy_matches_won = info['enemy_matches_won']
        return frame_delta, reward, done, info
    
    #functions for rendering the game as an mp4 and for closing the game
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()



     


#Testing Custom Environment space

In [7]:
#code to display results

from base64 import b64encode
def render_mp4(videopath: str) -> str:
  mp4 = open(videopath, 'rb').read()
  base64_encoded_mp4 = b64encode(mp4).decode()

In [8]:

# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL. Proximal Policy Optimization shown to work better 
from stable_baselines3 import PPO

# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

In [9]:
#directories to store models so that we can save each increment model
LOG_DIR = './logs2/'
OPT_DIR = './opt2/'

In [10]:
# Function to return test hyperparameters - define the object function
#Optuna as a library allows us to suggest hyper parameters we would like to train our model with
# these 5 hyper parameters are the parameters needed for the PPO (Proximal Policy Optimization Algorithm)
# Through some research we read the DQN provided unstable results, and this was shown in what we got from it
# PPO is also less memory intensive and should train afaster
def optimize_ppo(trial): 
    return {
        #ppo wants steps to be a multiple of 64 to work well
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [11]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_modelv2.1'.format(1))

## 3 Training and Testing

In [12]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [13]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [14]:
#where we save the model, we save a model every 10000 steps
CHECKPOINT_DIR = './train2/'

In [15]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [16]:
# Create environment 
env.close()

env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [17]:
#set model
model = PPO.load('best_model_5180000')

In [18]:
env.close()
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [23]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()

        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        #print(info)
env.reset()    


array([[[[  0,   0,   0,  36],
         [  0,   0,   0,  36],
         [  0,   0,   0, 155],
         ...,
         [  0,   0,   0,  62],
         [  0,   0,   0,  34],
         [  0,   0,   0,  38]],

        [[  0,   0,   0,  36],
         [  0,   0,   0,  40],
         [  0,   0,   0,  61],
         ...,
         [  0,   0,   0,  30],
         [  0,   0,   0,  36],
         [  0,   0,   0,  36]],

        [[  0,   0,   0,  36],
         [  0,   0,   0,  36],
         [  0,   0,   0,  14],
         ...,
         [  0,   0,   0,  32],
         [  0,   0,   0,  39],
         [  0,   0,   0,  36]],

        ...,

        [[  0,   0,   0, 156],
         [  0,   0,   0, 204],
         [  0,   0,   0, 170],
         ...,
         [  0,   0,   0, 170],
         [  0,   0,   0, 204],
         [  0,   0,   0, 190]],

        [[  0,   0,   0, 162],
         [  0,   0,   0, 162],
         [  0,   0,   0, 162],
         ...,
         [  0,   0,   0, 162],
         [  0,   0,   0, 162],
         

In [135]:
info

[{'enemy_matches_won': 2,
  'score': 38800,
  'matches_won': 0,
  'continuetimer': 10,
  'enemy_health': 0,
  'health': 0,
  'episode': {'r': 38800, 'l': 13743, 't': 689.282573},
  'terminal_observation': array([[[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         ...,
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 