In [1]:
import retro
from gym import Env
from gym.spaces import MultiBinary, Box, Discrete
import numpy as np
import cv2
from matplotlib import pyplot as plt    

In [2]:
class StreetFighter(Env):
    """Gym environment wrapping the retro ``StreetFighterII-Champion`` game.

    Observations are declared as 84x84 single-channel ``uint8`` images.
    ``step`` returns the difference between the current and the previous
    preprocessed frame, and its reward is the change in ``info['score']``
    since the previous step. ``reset`` must be called before the first
    ``step`` because it initialises the previous frame and the score.
    """

    def __init__(self,game_state='Champion.Level1.RyuVsGuile.state'):
        """Create the underlying retro game.

        Args:
            game_state: Name of the retro save state to start from.
        """
        super().__init__()
        self.observation_space = Box(low=0,high=255,shape=(84,84,1), dtype=np.uint8)
        # NOTE(review): the game below is created with retro.Actions.DISCRETE,
        # which may expose a smaller action space than 2**12 — confirm whether
        # this should mirror self.game.action_space instead.
        self.action_space = Discrete(2**12)
        self.game = retro.make(game='StreetFighterII-Champion', state=game_state,use_restricted_actions=retro.Actions.DISCRETE)

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: Action forwarded unchanged to the retro game.

        Returns:
            Tuple ``(frame_delta, reward, done, info)`` where ``frame_delta``
            is the current preprocessed frame minus the previous one and
            ``reward`` is the score gained during this step.
        """
        obs, _, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        # NOTE(review): if the frames are uint8 this subtraction wraps modulo
        # 256 rather than going negative — confirm that is intended.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        # The emulator's own reward is discarded; the score delta is used instead.
        reward = info['score'] - self.score
        self.score = info['score']
        return frame_delta, reward, done, info

    def render(self,*args,**kwargs):
        """Render the underlying game; extra arguments are accepted but ignored."""
        self.game.render()

    def reset(self):
        """Reset the game and return the first preprocessed frame (not a delta)."""
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score = 0
        return obs

    def preprocess(self, observation):
        """Convert a colour frame into an 84x84x1 grayscale array.

        Args:
            observation: Colour image as returned by the retro game.

        Returns:
            Array of shape ``(84, 84, 1)``.
        """
        # NOTE(review): presumably retro frames are RGB rather than BGR; the
        # conversion flag is kept as it was — confirm the channel order.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # The third positional parameter of cv2.resize is `dst`, not the
        # interpolation flag, so the flag has to be passed by keyword.
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels

    def close(self):
        """Close the underlying retro game."""
        self.game.close()

In [147]:
# Close an environment left over from an earlier run, if there is one.
# On a fresh kernel `env` is not defined yet, so an unconditional
# env.close() here would raise NameError and break "Restart & Run All".
if 'env' in globals():
    env.close()

In [119]:
# Instantiate the environment with its default game state.
env = StreetFighter()

In [121]:
# Smoke test: play one episode with uniformly random actions, rendering
# after every step.
try:
    obs = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        env.render()
finally:
    # Close the environment even when the loop is interrupted or a step
    # raises, so the game is not left open.
    env.close()

In [4]:
import optuna
import os
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [5]:
# Directory passed to Monitor and used as DQN's tensorboard_log in optimize_agent.
LOG_DIR = './logs/'
# Directory under which optimize_agent saves each trial's model.
OPT_DIR = './opt/'

In [6]:
def optimize(trial):
    """Sample DQN hyperparameters for one Optuna trial.

    All three values are drawn on a log scale. ``suggest_float(..., log=True)``
    replaces the deprecated ``suggest_loguniform`` and samples from the same
    ranges.

    Args:
        trial: Optuna trial used to draw the values.

    Returns:
        Dict with the keys ``learning_rate`` (1e-5 to 1e-4), ``gamma``
        (0.8 to 0.9999) and ``tau`` (0.001 to 0.01).
    """
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'tau': trial.suggest_float('tau', 0.001, 0.01, log=True),
    }

In [145]:
# Close the module-level environment; optimize_agent below creates its own.
# NOTE(review): presumably needed so no game is still open when a trial starts — confirm.
env.close()

In [7]:
def optimize_agent(trial):
    """Optuna objective: train a DQN agent briefly and score it.

    Builds a fresh StreetFighter environment (monitored, vectorised and
    stacked over 4 frames), trains a CnnPolicy DQN for 10000 timesteps with
    the hyperparameters sampled by ``optimize``, evaluates it over 10
    episodes, and saves the model under ``OPT_DIR``.

    Args:
        trial: Optuna trial supplying the hyperparameters and trial number.

    Returns:
        Mean evaluation reward, or -1000 if anything in the trial raised
        (the exception is printed rather than propagated).
    """
    try:
        model_params = optimize(trial)
        env = StreetFighter()
        try:
            monitored_env = Monitor(env,LOG_DIR)
            env = monitored_env
            # Bind a dedicated name in the closure: `env` is rebound just
            # below, so `lambda: env` would depend on when it is called.
            env = DummyVecEnv([lambda: monitored_env])
            env = VecFrameStack(env,4,channels_order='last')

            model = DQN("CnnPolicy",env,tensorboard_log=LOG_DIR,verbose=0,batch_size=216,buffer_size=80000, **model_params)
            model.learn(total_timesteps=10000)

            mean_reward, _ = evaluate_policy(model,env,n_eval_episodes=10)
        finally:
            # Always close the environment, including when training or
            # evaluation fails.
            # NOTE(review): presumably an unclosed game would make later
            # trials fail as well — confirm.
            env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        print(mean_reward)

        return mean_reward
    except Exception as e:
        # Best effort: report the failure and give the trial a very low score
        # so the study can continue.
        print(e)
        return -1000

In [8]:
# Create a study that maximises optimize_agent's return value (the mean
# evaluation reward) and run 10 trials with a single job.
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent,n_trials=10,n_jobs=1)

[I 2024-04-17 01:43:36,766] A new study created in memory with name: no-name-e9ad6619-610f-4640-896c-42625f94869e
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
[I 2024-04-17 01:53:32,165] Trial 0 finished with value: 43000.0 and parameters: {'learning_rate': 1.0441637166392424e-05, 'gamma': 0.8747694471766431, 'tau': 0.004366731241391021}. Best is trial 0 with value: 43000.0.


43000.0


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
[I 2024-04-17 01:57:55,204] Trial 1 finished with value: 3600.0 and parameters: {'learning_rate': 7.968728754174326e-05, 'gamma': 0.8133541902290516, 'tau': 0.002005755104267706}. Best is trial 0 with value: 43000.0.


3600.0


[I 2024-04-17 02:02:12,382] Trial 2 finished with value: 2700.0 and parameters: {'learning_rate': 6.744269577862206e-05, 'gamma': 0.817254794707618, 'tau': 0.0030184665847828303}. Best is trial 0 with value: 43000.0.


2700.0


[I 2024-04-17 02:05:55,095] Trial 3 finished with value: 2500.0 and parameters: {'learning_rate': 1.1245882391457875e-05, 'gamma': 0.8870886071389116, 'tau': 0.001591208035679002}. Best is trial 0 with value: 43000.0.


2500.0


[I 2024-04-17 02:15:18,534] Trial 4 finished with value: 8800.0 and parameters: {'learning_rate': 1.3297085592279778e-05, 'gamma': 0.8158765500452622, 'tau': 0.0017349746032888499}. Best is trial 0 with value: 43000.0.


8800.0


[I 2024-04-17 02:18:31,436] Trial 5 finished with value: 2500.0 and parameters: {'learning_rate': 2.348534473024836e-05, 'gamma': 0.9228256618551754, 'tau': 0.006977873816395121}. Best is trial 0 with value: 43000.0.


2500.0


[I 2024-04-17 02:24:42,872] Trial 6 finished with value: 15500.0 and parameters: {'learning_rate': 1.0803167515577025e-05, 'gamma': 0.9749308964328317, 'tau': 0.0018762102897521594}. Best is trial 0 with value: 43000.0.


15500.0


[I 2024-04-17 02:28:57,689] Trial 7 finished with value: 4900.0 and parameters: {'learning_rate': 4.4051584256391133e-05, 'gamma': 0.9888577470821505, 'tau': 0.001892289276111243}. Best is trial 0 with value: 43000.0.


4900.0


[I 2024-04-17 02:34:53,490] Trial 8 finished with value: 23200.0 and parameters: {'learning_rate': 3.7421915682778476e-05, 'gamma': 0.8550033281050156, 'tau': 0.003183241726434877}. Best is trial 0 with value: 43000.0.


23200.0


[I 2024-04-17 02:38:44,004] Trial 9 finished with value: 5200.0 and parameters: {'learning_rate': 5.628977335298013e-05, 'gamma': 0.9092951046959884, 'tau': 0.0030305924726291736}. Best is trial 0 with value: 43000.0.


5200.0


In [10]:
# Display the best trial of the study (highest objective value, since the study maximises).
study.best_trial

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[43000.0], datetime_start=datetime.datetime(2024, 4, 17, 1, 43, 36, 768013), datetime_complete=datetime.datetime(2024, 4, 17, 1, 53, 32, 165284), params={'learning_rate': 1.0441637166392424e-05, 'gamma': 0.8747694471766431, 'tau': 0.004366731241391021}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.0001, log=True, low=1e-05, step=None), 'gamma': FloatDistribution(high=0.9999, log=True, low=0.8, step=None), 'tau': FloatDistribution(high=0.01, log=True, low=0.001, step=None)}, trial_id=0, value=None)

In [11]:
# Display the hyperparameters sampled by the best trial.
study.best_params

{'learning_rate': 1.0441637166392424e-05,
 'gamma': 0.8747694471766431,
 'tau': 0.004366731241391021}

In [39]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [41]:
# Close the module-level environment.
# NOTE(review): in top-to-bottom order it has already been closed above — confirm this cell is still needed.
env.close()

In [None]:
def Q_Learning(
        env, # openai gym environment
        alpha:float, # step size
        gamma:float,
        num_episode:int
) -> np.array:
    def epsilon_greedy_policy(s,done,w,epsilon=.0):
        nA = env.action_space.n
        Q = [np.dot(w, X(s,done,a)) for a in range(nA)]

        if np.random.rand() < epsilon:
            return np.random.randint(nA)
        else:
            return np.argmax(Q)
        

    # Loop for each episode
    for ep in range(num_episode):
        state = env.reset()
        
        while True:
            
        # Loop until terminal

            # Choose action from S using policy derived from Q

            # Take action A, observe R, S'

            # update Q

            # update S