# Deep-Q Reinforcement Learning for Retro Video Games
- Built off of the [OpenAI Gym framework](https://github.com/openai/mlsh/tree/master/gym)

In [None]:
# Mount Google Drive so that files under /content/drive/ can be read and
# written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install atari-py
!apt-get install python-opengl -y
!pip install piglet
%pip install -U gym>=0.21.0
%pip install -U gym[atari,accept-rom-license]

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 496 kB of archives.
After this operation, 5,416 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Fetched 496 kB in 0s (3,242 kB/s)
Selecting previously unselected package python-opengl.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Setting up python-opengl (3.1.0+dfsg-1) ...
Collecting piglet
  Downloading piglet-1.0.0-py2.py3-none-any.whl (2.2 kB)
Collecting piglet-templates
  Downloading piglet_templates-1.2.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.0 MB/s 
Installin

In [None]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, time, copy

# Import OpenAI Gym
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

  for external in metadata.entry_points().get(self.group, []):


In [None]:
# Run on the GPU when CUDA is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [None]:
# Create the Atari Frogger environment and read the raw frame dimensions
# (height, width, channels) from its observation space
env = gym.make('ALE/Frogger-v5')
height, width, channels = env.observation_space.shape

# NOTE(review): not referenced by the cells visible here (the agent is built
# from env.action_space.n) - confirm whether it is still needed
actions = 5

# Reset, then take a single step with action 0 to inspect what comes back
env.reset()
next_state, _, _, info = env.step(action=0)
print(
  f"Inspect environment:\n"
  f"  - Shape of state: {next_state.shape}\n"
  f"  - Game info: {info}"
)

Inspect environment:
  - Shape of state: (210, 160, 3)
  - Game info: {'lives': 4}


In [None]:
"""
Classes for configuring environment and frame pre-processing for neural network.

Note: "observation" refers to our agent's observation of their environment,
which is given by the frames of the game in our case.
"""

class SkipFrame(gym.Wrapper):
  """
  Environment wrapper whose step() repeats one action over several frames.

  Arguments
    env: game environment
    skip (int): number of times the action is repeated per step
  """
  def __init__(self, env, skip):
    super().__init__(env)
    self.skip = skip

  def step(self, action):
    """
    Apply ACTION up to SKIP times, summing the rewards along the way.

    Stops early as soon as the wrapped environment reports done. Returns the
    last observation and info together with the summed reward.
    """
    done = False
    reward_sum = 0.0
    for _ in range(self.skip):
      obs, frame_reward, done, info = self.env.step(action)
      reward_sum = reward_sum + frame_reward
      if done:
        # Episode ended mid-skip; do not step a finished environment
        break
    return obs, reward_sum, done, info


class GrayScaleObservation(gym.ObservationWrapper):
  """
  Observation wrapper that turns each colour frame into a grayscale tensor.

  Arguments
    env: game environment
  """
  def __init__(self, env):
    """
    Declare the observation space as the frame's first two dimensions only.
    """
    super().__init__(env)
    self.observation_space = Box(
        low=0,
        high=255,
        shape=self.observation_space.shape[:2],
        dtype=np.uint8,
    )

  def permute_observation(self, observation):
    """
    Reorder a gym frame from (H,W,C) to the (C,H,W) layout PyTorch expects
    and return it as a float tensor.
    """
    channels_first = np.transpose(observation, (2, 0, 1)).copy()
    return torch.tensor(channels_first, dtype=torch.float)

  def observation(self, observation):
    """
    Permute the given observation, then convert it to grayscale.
    """
    to_gray = T.Grayscale()
    return to_gray(self.permute_observation(observation))


class ResizeObservation(gym.ObservationWrapper):
  """
  Observation wrapper that resizes every observation to a fixed shape.

  Arguments
    env: game environment
    shape: (height, width) tuple specifying desired shape of observations
  """
  def __init__(self, env, shape):
    super().__init__(env)
    self.shape = tuple(shape)

    # New observation shape: requested (H, W) followed by any dimensions the
    # wrapped space has beyond its first two
    trailing_dims = self.observation_space.shape[2:]
    self.observation_space = Box(
        low=0, high=255, shape=self.shape + trailing_dims, dtype=np.uint8
    )

  def observation(self, observation):
    """
    Resize the observation, normalize it with mean 0 and std 255, and drop
    the leading dimension when it has size one.
    """
    resized = T.Resize(self.shape)(observation)
    normalized = T.Normalize(0, 255)(resized)
    return normalized.squeeze(0)

In [None]:
# Apply wrappers for observation processing. Each wrapper wraps the result of
# the previous one, so the order of these lines matters.
# Repeat each chosen action up to 4 times per step, summing the rewards.
env = SkipFrame(env, skip=4)
# Convert frames to channel-first grayscale float tensors.
env = GrayScaleObservation(env)
# Resize frames to 84x84 and normalize them with T.Normalize(0, 255).
env = ResizeObservation(env, shape=(84,84))
# Stack processed frames (num_stack=4) into a single observation.
env = FrameStack(env, num_stack=4)

In [None]:
class Agent:
  """
  Epsilon-greedy agent that picks actions with a Deep Q-Network.

  Arguments
    state_dim: shape of game states, passed through to the DQN
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    self.state_dim = state_dim
    self.n_actions = n_actions

    # Deep Q-network to learn actions
    self.dqn = DQN(self.state_dim, self.n_actions).float().to(device)

    # Exploration rate: starts fully random and is multiplied by
    # epsilon_decay on every call to act(), never dropping below epsilon_min
    self.epsilon = 1
    self.epsilon_decay = 0.99999975
    self.epsilon_min = 0.1

    # Number of calls to act() so far
    self.step_num = 0

  def act(self, state):
    """
    Given a state, choose an epsilon-greedy action.

    Also decays epsilon and increments the step counter as a side effect.

    Arguments:
      state: single observation of current game state (must provide
             __array__(); only read on the greedy branch)

    Returns:
      action_idx: index of chosen action
    """
    # Choose a random new action to gain experience
    if np.random.rand() < self.epsilon:
      action_idx = np.random.randint(self.n_actions)

    # Otherwise, choose action to maximize Q function
    else:
      state = torch.tensor(state.__array__()).to(device)
      state = state.unsqueeze(0)  # add a batch dimension of size one
      # Only the argmax is needed here, so skip autograd bookkeeping: no
      # gradient ever flows through action selection
      with torch.no_grad():
        q_values = self.dqn(state, model="main")
      action_idx = torch.argmax(q_values, dim=1).item()

    # Epsilon decay
    self.epsilon *= self.epsilon_decay
    self.epsilon = max(self.epsilon_min, self.epsilon)

    # Increment step
    self.step_num += 1
    return action_idx

In [None]:
class Agent(Agent):
  """
  Add a bounded replay memory to the agent, plus random minibatch sampling
  from it for experience replay.

  Arguments
    state_dim: shape of game states
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    super().__init__(state_dim, n_actions)
    self.batch_size = 8
    # Once full, the deque drops its oldest experience on every append
    self.memory = deque(maxlen=10000)

  def cache(self, state, next_state, action, reward, done):
    """
    Store one experience in the agent's memory buffer.

    Every field is converted to a tensor on the active device before being
    stored; the scalar fields are wrapped in one-element lists first.

    Arguments
      state: game state when action was taken
      next_state: game state after action was taken
      action: index of action taken
      reward: reward accrued by taking action
      done: done flag associated with this transition
    """
    raw_fields = (
        state.__array__(),
        next_state.__array__(),
        [action],
        [reward],
        [done],
    )
    experience = tuple(torch.tensor(field).to(device) for field in raw_fields)
    self.memory.append(experience)

  def recall(self):
    """
    Draw a random batch of experiences from memory.

    Returns stacked (state, next_state, action, reward, done) tensors, with
    the last three squeezed.
    """
    sampled = random.sample(self.memory, self.batch_size)
    stacked = [torch.stack(field) for field in zip(*sampled)]
    state, next_state, action, reward, done = stacked
    return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

In [None]:
class DQN(nn.Module):
  """
  Define convolutional Deep Q-Network architecture to choose and learn actions.

  Holds two copies of the same network: `net`, which is trained, and
  `target`, a frozen copy created at construction time.

  Arguments
    state_dim: shape of input tensor given as (C, H, W)
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    super().__init__()

    # Convolutional feature extractor, ending in a flatten
    feature_layers = [
        nn.Conv2d(in_channels=state_dim[0], out_channels=32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
        nn.ReLU(),
        nn.Flatten(),
    ]

    # Size of the flattened features, measured with a dummy input instead of
    # being hard-coded, so other frame sizes work (3136 for a 4x84x84 state)
    with torch.no_grad():
      probe = torch.zeros(1, *state_dim)
      n_features = nn.Sequential(*feature_layers)(probe).shape[1]

    # Main DQN. Kept as one flat Sequential so parameter names (and therefore
    # saved state dicts) stay the same: net.0, net.2, net.4, net.7, net.9
    self.net = nn.Sequential(
        *feature_layers,
        nn.Linear(n_features, 512),
        nn.ReLU(),
        nn.Linear(512, n_actions),
    )

    # Target DQN
    self.target = copy.deepcopy(self.net)

    # Prevent target network updates
    for p in self.target.parameters():
      p.requires_grad = False

  def forward(self, input, model):
    """
    Perform forward pass for given model.

    Arguments
      input: batch of states
      model: "main" for the trained network or "target" for the frozen copy

    Raises
      ValueError: if model is neither "main" nor "target"
    """
    if model == "main":
      return self.net(input)
    if model == "target":
      return self.target(input)
    # Fail loudly rather than returning None, which would only surface later
    # as a confusing error at the call site
    raise ValueError(f"model must be 'main' or 'target', got {model!r}")

In [None]:
class Agent(Agent):
  """
  Add Q-value estimation and target computation to the agent.

  Arguments
    state_dim: shape of input tensor given as (C, H, W)
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    super().__init__(state_dim, n_actions)

    # Discount factor applied to the next state's value
    self.gamma = 0.9

  def estimate(self, state, action):
    """
    Return the main network's Q value of the given action for each row of
    the batch.
    """
    batch_rows = np.arange(0, self.batch_size)
    q_all_actions = self.dqn(state, model="main")
    return q_all_actions[batch_rows, action]

  @torch.no_grad()
  def target(self, reward, next_state, done):
    """
    Compute target value labels for Q network.

    The main network picks the best next action and the target network
    supplies that action's value. The next-state term is zeroed where done
    is set.
    """
    batch_rows = np.arange(0, self.batch_size)
    best_action = torch.argmax(self.dqn(next_state, model="main"), axis=1)
    q_next = self.dqn(next_state, model="target")[batch_rows, best_action]
    keep_future = 1 - done.float()
    return (reward + keep_future * self.gamma * q_next).float()

In [None]:
class Agent(Agent):
  """
  Add the loss function, optimizer and network-update steps to the agent.

  Arguments
    state_dim: shape of input tensor given as (C, H, W)
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    super().__init__(state_dim, n_actions)

    # Smooth L1 loss between Q-value estimates and their targets
    self.loss_fn = torch.nn.SmoothL1Loss()

    # Adam optimizer over the DQN module's parameters
    self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=0.00025)

  def update_main(self, estimate, target):
    """
    Take one optimizer step on the loss between estimate and target.

    Returns the loss as a Python float.
    """
    self.optimizer.zero_grad()
    loss = self.loss_fn(estimate, target)
    loss.backward()
    self.optimizer.step()
    return loss.item()

  def sync_target(self):
    """
    Copy the main network's weights into the target network.
    """
    main_weights = self.dqn.net.state_dict()
    self.dqn.target.load_state_dict(main_weights)

In [None]:
class Agent(Agent):
  """
  Configure and perform learning.

  Arguments
    state_dim: shape of input tensor given as (C, H, W)
    n_actions: number of actions in action space
  """
  def __init__(self, state_dim, n_actions):
    super().__init__(state_dim, n_actions)

    # Experience required before training
    self.min_experiences = 1e4

    # Number of experiences between main network updates
    self.learn_interval = 3

    # Number of experiences between syncing networks
    self.sync_interval = 1e3

  def learn(self):
    """
    Run at most one training update, depending on the current step count.

    Returns
      (mean Q estimate, loss) when an update was performed, otherwise
      (None, None).
    """
    # Sync target network
    if self.step_num % self.sync_interval == 0:
      self.sync_target()

    # Wait for experience threshold
    if self.step_num < self.min_experiences:
      return None, None

    # Check update interval
    if self.step_num % self.learn_interval != 0:
      return None, None

    # recall() samples exactly batch_size experiences and raises ValueError
    # when fewer are stored. That can happen when step_num is set by hand
    # (e.g. when resuming a run) while the memory buffer starts out empty.
    if len(self.memory) < self.batch_size:
      return None, None

    # Randomly sample experiences from memory
    state, next_state, action, reward, done = self.recall()

    # Get Q-value estimates
    est = self.estimate(state, action)

    # Get target values
    tgt = self.target(reward, next_state, done)

    # Backpropagate loss through main network
    loss = self.update_main(est, tgt)

    return (est.mean().item(), loss)

In [None]:
class RecordEvaluation:
  """
  Record and report training progress and metrics.

  Arguments
    log_interval: number of episodes covered by each printed summary; only
                  used to label the episode range in record()
  """
  def __init__(self, log_interval):
    # History metrics, one entry per finished episode
    self.ep_rewards = []
    self.ep_lengths = []
    self.ep_avg_losses = []
    self.ep_avg_qs = []

    # Moving averages, added for every call to record()
    self.moving_avg_ep_rewards = []
    self.moving_avg_ep_lengths = []
    self.moving_avg_ep_avg_losses = []
    self.moving_avg_ep_avg_qs = []

    # Current episode metric
    self.init_episode()

    # Timing
    self.prev_time = time.time()

    self.log_interval = log_interval

  def log_step(self, reward, loss, q):
    """
    Accumulate one step's metrics into the current episode.

    Arguments
      reward: reward received on this step
      loss: training loss for this step, or None when no update happened
      q: mean Q estimate for this step, or None when no update happened
    """
    self.curr_ep_reward += reward
    self.curr_ep_length += 1
    # Compare against None explicitly: a loss of exactly 0.0 is a real update
    # and must still count towards the episode averages
    if loss is not None:
      self.curr_ep_loss += loss
      self.curr_ep_q += q
      self.curr_ep_loss_length += 1

  def log_episode(self):
    """
    Close the current episode: append its totals and averages to the history
    lists, then reset the per-episode counters.
    """
    self.ep_rewards.append(self.curr_ep_reward)
    self.ep_lengths.append(self.curr_ep_length)
    if self.curr_ep_loss_length == 0:
      # No training update happened during this episode
      ep_avg_loss = 0
      ep_avg_q = 0
    else:
      ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
      ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
    self.ep_avg_losses.append(ep_avg_loss)
    self.ep_avg_qs.append(ep_avg_q)

    self.init_episode()

  def init_episode(self):
    """
    Reset the running counters for a new episode.
    """
    self.curr_ep_reward = 0.0
    self.curr_ep_length = 0
    self.curr_ep_loss = 0.0
    self.curr_ep_q = 0.0
    self.curr_ep_loss_length = 0

  def record(self, episode, epsilon, step):
    """
    Store and print averages over the most recent (up to 100) episodes.

    Arguments
      episode: zero-based index of the episode that just finished
      epsilon: current exploration rate, printed as given
      step: current step count, printed as given

    The time printed is the time elapsed since the previous call to record()
    (or since construction, for the first call).
    """
    mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
    mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
    mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
    mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
    self.moving_avg_ep_rewards.append(mean_ep_reward)
    self.moving_avg_ep_lengths.append(mean_ep_length)
    self.moving_avg_ep_avg_losses.append(mean_ep_loss)
    self.moving_avg_ep_avg_qs.append(mean_ep_q)

    elapsed_time = np.round(time.time() - self.prev_time, 3)
    self.prev_time = time.time()

    print(
      f"Summary for episodes {episode+2-self.log_interval} through {episode+1}:\n"
      f"  - Step {step}\n"
      f"  - Epsilon {epsilon}\n"
      f"  - Mean Reward {mean_ep_reward}\n"
      f"  - Mean Length {mean_ep_length}\n"
      f"  - Mean Loss {mean_ep_loss}\n"
      f"  - Mean Q Value {mean_ep_q}\n"
      f"  - Total time (s): {elapsed_time}\n"
    )

In [None]:
import os
import pandas as pd
# Checkpoint file on the mounted Google Drive; the training cell loads agent
# weights from it and saves to it.
path = '/content/drive/MyDrive/weights/agent.pt'

In [None]:
agent = Agent(state_dim=(4, 84, 84), n_actions=env.action_space.n)

# Resume from the checkpoint only when it exists and is non-empty; on a first
# run there is no file yet and os.path.getsize alone would raise.
if os.path.exists(path) and os.path.getsize(path) != 0:
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime
    checkpoint = torch.load(path, map_location=device)
    agent.dqn.net.load_state_dict(checkpoint['model_state_dict'])
    agent.dqn.target.load_state_dict(checkpoint['target_state_dict'])
    agent.epsilon = checkpoint['epsilon']
    # NOTE: the checkpoint's 'steps' and 'episode' entries are not restored,
    # so agent.step_num restarts at 0 and the episode offset below is set by
    # hand.
    print("Loaded agent")

print(agent.epsilon)

# Episode number this run starts counting from (used for logging/checkpoints)
episodes = 23000
record = RecordEvaluation(log_interval=200)
save_interval = 1000
for e in range(episodes, episodes+20000):

    state = env.reset()

    # Play the game!
    while True:
        # Run agent on the state
        action = agent.act(state)

        # Agent performs action
        next_state, reward, done, info = env.step(action)

        # Remember
        agent.cache(state, next_state, action, reward, done)

        # Learn
        q, loss = agent.learn()

        # Logging
        record.log_step(reward, loss, q)

        # Update state
        state = next_state

        # Check if end of game
        if done or (info['lives'] == 0):
            break

    record.log_episode()

    if (e+1) % save_interval == 0:
        print("Saving Model at Episode", e)
        # Make sure the checkpoint directory exists before the first save
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(
            {
                'epsilon': agent.epsilon,
                'steps': agent.step_num,
                'episode': e,
                'model_state_dict': agent.dqn.net.state_dict(),
                'target_state_dict': agent.dqn.target.state_dict(),
            },
            path,
        )
        # Per-episode reward/length history, written next to the checkpoint
        df = pd.DataFrame(np.stack((record.ep_rewards, record.ep_lengths), axis=1), columns=['Reward','Length'])
        df.to_csv(path + str(e))

    if (e+1) % record.log_interval == 0:
        record.record(episode=e, epsilon=agent.epsilon, step=agent.step_num)

Loaded agent
0.6352040095599248
Summary for episodes 23001 through 23200:
  - Step 14807
  - Epsilon 0.6328569895445961
  - Mean Reward 10.95
  - Mean Length 73.34
  - Mean Loss 0.006
  - Mean Q Value 2.212
  - Total time (s): 96.478

Summary for episodes 23201 through 23400:
  - Step 29830
  - Epsilon 0.6304845944593359
  - Mean Reward 11.16
  - Mean Length 76.41
  - Mean Loss 0.008
  - Mean Q Value 3.33
  - Total time (s): 128.113

Summary for episodes 23401 through 23600:
  - Step 44643
  - Epsilon 0.6281540700140065
  - Mean Reward 11.14
  - Mean Length 74.38
  - Mean Loss 0.008
  - Mean Q Value 3.267
  - Total time (s): 126.463

Summary for episodes 23601 through 23800:
  - Step 59834
  - Epsilon 0.6257730217714323
  - Mean Reward 10.97
  - Mean Length 73.74
  - Mean Loss 0.008
  - Mean Q Value 3.234
  - Total time (s): 129.439

Saving Model at Episode 23999
Summary for episodes 23801 through 24000:
  - Step 74380
  - Epsilon 0.6235015305356553
  - Mean Reward 10.37
  - Mean Lengt

In [None]:
# Play a short run of episodes, printing a summary after each one. The agent
# still acts epsilon-greedily, caches experiences and learns during this run.
episodes = 25

for e in range(episodes):
    state = env.reset()
    episode_over = False
    while not episode_over:

        # env.render()
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        agent.cache(state, next_state, action, reward, done)
        q, loss = agent.learn()
        record.log_step(reward, loss, q)
        state = next_state

        # The episode ends when the environment is done or no lives remain
        episode_over = done or (info["lives"] == 0)

    record.log_episode()
    record.record(episode=e, epsilon=agent.epsilon, step=agent.step_num)