In [2]:
RUNNING_LOCALLY = True

In [3]:
import os

# PLEASE SET YOUR OWN WORKING_DIRECTORY WHEN RUNNING LOCALLY
WORKING_DIRECTORY = "../"

if not RUNNING_LOCALLY:
    os.chdir("/home/yash/")
    print("Current Directory ->", os.getcwd())

    WORKING_DIRECTORY = "/home/yash/working_dir/"

    # Ensure that you are working in the right environment
    !echo $CONDA_PREFIX

LOG_FILE = WORKING_DIRECTORY + "log_file.txt"

def write_to_log(statement, include_blank_line=False):
    try:
        with open(LOG_FILE, "a") as myfile:
            if include_blank_line:
                myfile.write("\n\n" + statement)
            else:
                myfile.write("\n" + statement)
    except:
        # Running this locally may cause errors, and isn't required
        pass

In [4]:
import gym
import time
import numpy as np
from collections import deque
from gym import spaces
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [5]:
class ConcatObs(gym.Wrapper):
    def __init__(self, env, k):
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = \
            spaces.Box(low=0, high=255, shape=((k,) + shp), dtype=env.observation_space.dtype)

    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        return np.array(self.frames)

In [6]:
# A bunch of wrappers to get us started, please use these
class ObservationWrapper(gym.ObservationWrapper):
    def __init__(self, env, GRAYSCALE=False, NORMALIZE=False):
        self.GRAYSCALE = GRAYSCALE
        self.NORMALIZE = NORMALIZE
        super().__init__(env)
    
    def observation(self, obs):
        # Normalise observation by 255

        
        if self.NORMALIZE:
            obs = obs / 255.0
        # Convert to grayscale -> This isn't quite working right now, but we can update the function quite easily later
        if self.GRAYSCALE:
#             obs = obs
            obs = tf.image.rgb_to_grayscale(obs)
                    
        image = obs[:,2:-9,8:,:]
        image = tf.image.resize(image,[84,84])
        image = tf.transpose(tf.reshape(image, image.shape[:-1]),perm = [1,2,0])
        return image

class RewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def reward(self, reward):
        # Clip reward between 0 to 1
        return np.clip(reward, 0, 1)
    
class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def action(self, action):
        return action

In [7]:
env = gym.make("ALE/Riverraid-v5")

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


In [8]:
# Using the wrappers for the environment
env = ObservationWrapper(RewardWrapper(ActionWrapper(ConcatObs(env, 4))), GRAYSCALE=True, NORMALIZE=True)
obs = env.reset()

2022-05-04 23:01:14.568795: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [116]:
#visualization
#image = tf.keras.preprocessing.image.array_to_img(tf.reshape(image[:,:,3],[84,84,1]))

In [16]:
print(env.observation_space)
print(env.action_space.n)

Box(0, 255, (4, 210, 160, 3), uint8)
18


## SARSA($\lambda$)

In [17]:
hidden_size = 256
num_actions = env.action_space.n
obs_space_shape = [84, 84, 4]
learning_rate = 0.1

def create_q_model(input_shape, hidden_size, num_actions):
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=input_shape)
    # Convolutions on the frames on the screen
    
    layer1 = layers.Conv2D(16, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(32, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Flatten()(layer2)
    layer4 = tf.keras.layers.Dense(hidden_size, activation='relu')(layer3) 
    action = tf.keras.layers.Dense(num_actions, activation='softmax')(layer4)

    return keras.Model(inputs=inputs, outputs=action)

q_net = create_q_model(input_shape=obs.shape, hidden_size=hidden_size, num_actions=num_actions)

In [18]:
# elig_trace = np.zeros((obs.shape, num_actions))   # eligibility trace
trainer = tf.keras.optimizers.SGD(learning_rate = learning_rate)

**One episode**

In [44]:
def choose_action(env, q_net, state, epsilon):
    if np.random.rand(1) < epsilon:
        action = env.action_space.sample()
    else:
        q_values = q_net(np.array([obs]))
        action = tf.math.argmax(q_values, axis=1).numpy()[0] # greedy
    return action


def update_policy(q_net, obs, action, reward, nst_obs, next_action, done):
    
    with tf.GradientTape() as tape:

        q_values = q_net(np.array([obs]))
        one_hot_action = tf.one_hot([action], num_actions)
        predict = tf.reduce_sum(one_hot_action * q_values, axis=1)

        q_values_next = q_net(np.array([nst_obs]))
        target = reward + (1 - done) * gamma * q_values_next

        loss = tf.reduce_mean(tf.square(target - predict))

    grad = tape.gradient(loss, q_net.trainable_variables)
    trainer.apply_gradients(zip(grad, q_net.trainable_variables))
    
def train_one_episode(q_net, e, gamma, num_steps):
    rTot = 0
    obs = env.reset()
    for step in range(num_steps):
        action = choose_action(env, q_net, obs, e)
        nst_obs, reward, done, _ = env.step(action)
        delta = 1.0 if done else 0.0

        next_action = choose_action(env, q_net, nst_obs, e)
        update_policy(q_net, obs, action, reward, nst_obs, next_action, done)

        rTot += reward
        obs = nst_obs

        if done:
            break
            
    return rTot

In [45]:
#hyper parameters
num_steps = 500
gamma = 0.99
e = 0.2

start_time = time.time()
rTot = train_one_episode(q_net, e, gamma, num_steps)
end_time = time.time()

inter = end_time - start_time

# Close the env
env.close()

In [46]:
print(rTot,inter)

14.0 13.43814992904663


**Episodes with n_game**

In [47]:
n_games = 10 # when running in the GCP, we can set a much higher value
E = 1000      # To adjust the epsilon value

In [48]:
# Write to log
model_name = "SARSA_" + str(n_games)
write_to_log("Starting " + model_name, include_blank_line=True)

In [49]:
rot_list = []
time_list = []
for i in range(n_games):
    if (i + 1) % 5 == 0:
        print(f'Running game {i+1}/{n_games}...')
        write_to_log(f'Running game {i+1}/{n_games}...', include_blank_line=False)
    
    e = E / (i + E)
    if e < 0.1:
        e = 0.1
        
    start_time = time.time()
    rTot = train_one_episode(q_net, e, gamma, num_steps)
    end_time = time.time()
    
    inter = end_time-start_time
    rot_list.append(rTot)
    time_list.append(inter)

env.close()

In [50]:
print("average_reward:{}".format(sum(rot_list)/n_games))
print("average_time:{}".format(sum(time_list)/n_games))

average_reward:14.5
average_time:12.447486996650696


In [51]:
print(rot_list)

[15.0, 14.0, 16.0, 15.0, 12.0, 14.0, 15.0, 15.0, 13.0, 16.0]


In [None]:
write_to_log("Completed " + model_name, include_blank_line=False)

In [None]:
write_to_log("average_reward:{}".format(sum(rot_list)/n_games), include_blank_line=False)
write_to_log("average_time:{}".format(sum(time_list)/n_games), include_blank_line=False)

1. After 100 games:<br>
   average_reward:773.95<br>
   average_time:21.668797335624696<br>

In [162]:
sum(rot_list[-5:])/5

1534.6

In [163]:
sum(time_list[-5:])/5

15.960728788375855

In [172]:
model_path = WORKING_DIRECTORY + "model/cDQN_" + str(n_games)
q_net.save(model_path)

INFO:tensorflow:Assets written to: model/cDQN_100\assets


In [173]:
q_net_copy = keras.models.load_model(model_path)

