This notebook follows the official pytorch tutorial:
[Mario-Playing RL Agent](https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html#train-a-mario-playing-rl-agent)

Useful resources:
- Model paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/pdf/1509.06461.pdf)
- Official repo of the code: [MadMario](https://github.com/yfeng997/MadMario)
- Reinforcement Learning cheatsheet: [RL Cheatsheet](https://colab.research.google.com/drive/1eN33dPVtdPViiS1njTW_-r-IYCDTFU7N)

In [7]:
import torch
from torch import nn
from torchvision import transforms as T

import numpy as np
import random, datetime, os, copy
from collections import deque
from pathlib import Path
from PIL import Image

import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
import gym_super_mario_bros

# NES Emulator for Gym
from nes_py.wrappers import JoypadSpace

# suppress warning for now
import warnings 
warnings.filterwarnings(action="once")

**Optimal Action-Value function** $Q^\star(s, a)$: Gives the expected return if you start in state $s$, take an arbitrary action $a$, and then for each future time step take the action that maximizes returns. $Q$ can be said to stand for the "quality" of the action in a state. We try to approximate this function.

In [5]:
# Create the Super Mario Bros environment from its registered id.
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

# Restrict the action space to two button combinations:
#    0. walk right
#    1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

# Reset, take a single step with action 0, and inspect what the
# environment hands back.
env.reset()
next_state, reward, done, info = env.step(action=0)

print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

  logger.warn(


(240, 256, 3),
 0,
 False,
 {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'y_pos': 79}


Preprocess the environment with different **wrappers**: `SkipFrame`, `GrayScaleObservation`, `ResizeObservation`, and `FrameStack`.

In [8]:
class SkipFrame(gym.Wrapper):
    """Wrapper that repeats each action for `skip` frames and returns only the last one."""

    def __init__(self, env, skip):
        """Wrap `env`; `skip` is how many frames each chosen action is held for."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times and return the summed reward.

        The observation and info returned are those of the last inner step
        that was executed. The loop stops early as soon as the wrapped
        environment reports `done`.
        """
        done = False
        reward_sum = 0.0
        for _ in range(self._skip):
            obs, step_reward, done, info = self.env.step(action)
            reward_sum += step_reward
            if done:
                # Episode ended mid-skip: do not step a finished environment.
                break
        return obs, reward_sum, done, info

In [12]:
class GrayScaleObservation(gym.ObservationWrapper):
    """Observation wrapper that converts colour frames to grayscale tensors."""

    def __init__(self, env):
        """Wrap `env` and redeclare the observation space without the channel axis."""
        super().__init__(env)
        # Keep only the first two dimensions of the wrapped space's shape.
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def permute_orientation(self, observation):
        """Turn an [H, W, C] array into a float [C, H, W] tensor."""
        observation = np.transpose(observation, (2, 0, 1))
        # copy() materialises the transposed view before the tensor is built.
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        """Return the grayscale version of `observation` as a torch tensor."""
        observation = self.permute_orientation(observation)
        # The torchvision transform is spelled `Grayscale`; `T.GrayScale`
        # does not exist and raised AttributeError on the first frame.
        transform = T.Grayscale()
        observation = transform(observation)
        return observation

In [10]:
class ResizeObservation(gym.ObservationWrapper):
    """Observation wrapper that resizes frames to a fixed size and rescales them."""

    def __init__(self, env, shape):
        """Wrap `env`.

        Inputs:
            env: the environment to wrap.
            shape(int or iterable): target size; an int `n` means `(n, n)`.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # Keep any trailing dimensions of the wrapped space beyond the first two.
        obs_shape = self.shape + self.observation_space.shape[2:]
        # 255 (not 225) is the maximum 8-bit pixel value, matching the Box
        # declared by GrayScaleObservation.
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        """Resize `observation` to `self.shape` and divide pixel values by 255."""
        # Normalize(0, 255) computes (x - 0) / 255; assuming incoming pixel
        # values lie in [0, 255], the result lies in [0, 1].
        transforms = T.Compose([T.Resize(self.shape), T.Normalize(0, 255)])
        observation = transforms(observation).squeeze(0)
        return observation

In [14]:
# Build the preprocessing pipeline inside-out; wrappers are applied in this order:
#   1. SkipFrame            - hold each action for 4 frames
#   2. GrayScaleObservation - convert frames to grayscale
#   3. ResizeObservation    - resize frames to 84 x 84
#   4. FrameStack           - stack 4 consecutive observations
env = FrameStack(
    ResizeObservation(
        GrayScaleObservation(SkipFrame(env, skip=4)),
        shape=84,
    ),
    num_stack=4,
)

Our agent should be able to:
- **Act**
- **Remember**
- **Learn**

In [17]:
class Mario:
    """Agent base: owns the Q-network and the epsilon-greedy acting policy."""

    def __init__(self, state_dim, action_dim, save_dir):
        """Store dimensions, build the network, and initialise exploration settings."""
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.save_dir = save_dir

        self.use_cuda = torch.cuda.is_available()

        # Build the network in float precision and move it to the GPU when one is available.
        network = MarioNet(self.state_dim, self.action_dim).float()
        self.net = network.to(device="cuda") if self.use_cuda else network

        # Epsilon-greedy schedule: start fully random, decay multiplicatively
        # on every call to `act`, never going below the minimum.
        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
        self.exploration_rate_min = 0.1
        self.curr_step = 0

        self.save_every = 5e5

    def act(self, state):
        """Pick an epsilon-greedy action for `state` and advance the step counter.

        Inputs:
            state(LazyFrame): A single observation of the current state, dimension is (state_dim).

        Outputs:
            action_idx(int): An integer representing which action Mario will perform.
        """
        explore = np.random.rand() < self.exploration_rate
        if explore:
            # EXPLORE: uniformly random action.
            action_idx = np.random.randint(self.action_dim)
        else:
            # EXPLOIT: highest-valued action according to the online network.
            frames = torch.tensor(state.__array__())
            if self.use_cuda:
                frames = frames.cuda()
            q_values = self.net(frames.unsqueeze(0), model="online")
            action_idx = torch.argmax(q_values, axis=1).item()

        # Decay epsilon, clamped from below by the configured minimum.
        self.exploration_rate = max(
            self.exploration_rate_min,
            self.exploration_rate * self.exploration_rate_decay,
        )

        self.curr_step += 1
        return action_idx
                
        

`cache()`: store an `experience` in the agent's memory.

`recall()`: randomly sample a batch of experiences from agent memory.

In [18]:
class Mario(Mario):
    """Extends the agent with a replay buffer: `cache` stores, `recall` samples."""

    def __init__(self, state_dim, action_dim, save_dir):
        """Initialise the base agent, then create the replay buffer."""
        super().__init__(state_dim, action_dim, save_dir)
        # Bounded buffer: once full, the oldest experiences are dropped first.
        self.memory = deque(maxlen=100000)
        self.batch_size = 32

    def cache(self, state, next_state, action, reward, done):
        """Store the experience to self.memory (replay buffer)

        Inputs:
            state(LazyFrame)
            next_state(LazyFrame)
            action(int)
            reward(float)
            done(bool)
        """
        # Scalars are wrapped in one-element lists so every field is a tensor
        # that `recall` can stack.
        experience = (
            torch.tensor(state.__array__()),
            torch.tensor(next_state.__array__()),
            torch.tensor([action]),
            torch.tensor([reward]),
            torch.tensor([done]),
        )
        if self.use_cuda:
            experience = tuple(field.cuda() for field in experience)

        self.memory.append(experience)

    def recall(self):
        """Retrieve a random batch of experiences from memory.

        Returns a tuple (state, next_state, action, reward, done) of stacked
        tensors; the last three are squeezed. `random.sample` raises
        ValueError if fewer than `batch_size` experiences are stored.
        """
        batch = random.sample(self.memory, self.batch_size)
        # zip(*batch) regroups the sampled tuples by field before stacking.
        state, next_state, action, reward, done = map(torch.stack, zip(*batch))
        return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

Our agent uses the [DDQN algorithm](https://arxiv.org/pdf/1509.06461) under the hood. DDQN uses two ConvNets - $Q_{\text{online}}$ and $Q_{\text{target}}$ - that independently approximate the optimal action-value function.

We share feature generator `features` across $Q_{\text{online}}$ and $Q_{\text{target}}$, but maintain separate FC classifiers for each. The parameters $\theta_{\text{target}}$ of $Q_{\text{target}}$ are frozen so that backpropagation does not update them.

In [19]:
class MarioNet(nn.Module):
    """Mini cnn structure.
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output

    Holds two copies of the network: `online` (trainable) and `target`
    (a deep copy whose parameters are frozen).
    """
    def __init__(self, input_dim, output_dim):
        """Build both networks.

        Inputs:
            input_dim(tuple): (channels, height, width); height and width must be 84.
            output_dim(int): number of output units of the final linear layer.

        Raises:
            ValueError: if height or width is not 84.
        """
        super().__init__()
        c, h, w = input_dim

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        # `Sequential` lives in torch.nn, not numpy; `np.Sequential` raised
        # AttributeError as soon as the network was constructed.
        self.online = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            # 84x84 input -> 20x20 -> 9x9 -> 7x7 feature maps; 64 * 7 * 7 = 3136.
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )

        self.target = copy.deepcopy(self.online)

        # freeze theta_target
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        """Run `input` through the network selected by `model`.

        Inputs:
            input(Tensor): batch of observations.
            model(str): "online" or "target".

        Raises:
            ValueError: if `model` is neither "online" nor "target"
                (previously this case silently returned None).
        """
        if model == "online":
            return self.online(input)
        if model == "target":
            return self.target(input)
        raise ValueError(f"Expecting model to be 'online' or 'target', got: {model!r}")