# Super Mario Bros RL Notebook
## Rohan H, Talaal S, Thang N, Yuet W
### University of Bath

![SegmentLocal](mario!.gif "segment")
(we are going to have to delete him sadly...)


### How to run:
The official website for the game environment can be found here: https://pypi.org/project/gym-super-mario-bros/

In a nutshell, you will need:
- Python 3.5/3.6/3.7/3.8 (I have tested on 3.7)
- gymnasium (gym is deprecated)
- ipykernel for running the notebook
- gym-super-mario-bros 
- other essential packages/libraries like NumPy
- an average computer

### Check if the environment works

In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env_example = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode='human')
env_example = JoypadSpace(env_example, SIMPLE_MOVEMENT)

test = False
done = True
if test:
    for step in range(1000):
        if done:
            obs, info = env_example.reset()
        obs, reward, terminated, truncated, info = env_example.step(env_example.action_space.sample())
        done = not terminated or truncated

env_example.close()

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  f"The environment {id} is out of date. You should consider "
  f"The environment creator metadata doesn't include `render_modes`, contains: {list(env_creator.metadata.keys())}"


# /// DQN Algorithm ///

## Importing Libraries, Defining Functions and Classes

In [20]:
# Rohan's Cell

# YOUR CODE HERE
# example here: https://docs.pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import gc
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import gym
from gym.spaces import Box
from gym.wrappers.frame_stack import FrameStack
from gym.core import ObservationWrapper
import gym_super_mario_bros.actions as actions

import torch
import torch.nn as nn
import torch.optim as opt
#import torch.nn.functional as func
#from torchvision import transforms as T

from collections import namedtuple, deque
from itertools import count
#import itertools

import cv2

def clean_mem():
    if torch.cuda.is_available():    
        torch.cuda.empty_cache()
    gc.collect()

def device():
    if torch.cuda.is_available():    
        torch.cuda.empty_cache()
        return torch.device("cuda")
    
    else:
        return torch.device("cpu")

system = device()

Transition = namedtuple('Transition',
    ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


class DQN_full(nn.Module):

    def __init__(self, shape, actions_n):
        super(DQN_full, self).__init__()
        self.convolution = nn.Sequential( # Convolutional layers
        nn.Conv2d(shape[0], 16, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, stride=1),
        nn.ReLU()
        )

        output_size = self._get_conv_out(shape)
        self.fully_connected = nn.Sequential( # 32 neurons -> 64 neurons -> "n" actions 
            nn.Linear(output_size, 64),
            nn.ReLU(),
            nn.Linear(64, actions_n)
        )
    def _get_conv_out(self, shape):
        with torch.no_grad():
            blank = torch.zeros(1, *shape)
            out = self.convolution(blank)
            return int(np.prod(out.size())) 

    def forward(self, x):
        out = self.convolution(x).view(x.size(0), -1)
        return self.fully_connected(out)
    

class ReShape(ObservationWrapper): # Put RGB channel as first argument
    def __init__(self, env):
        super().__init__(env)
        obs_shape = env.observation_space.shape
        self.observation_space = Box(
            low=self.observation_space.low.min(),
            high=self.observation_space.high.max(),
            shape=(obs_shape[2], obs_shape[0], obs_shape[1]),
            dtype=env.observation_space.dtype,
        )

    def observation(self, observation):
        return np.transpose(observation, (2, 0, 1))

class SmallBox84(ObservationWrapper): # Resize to 84x84
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(
            low=0,
            high=1.0,
            shape=(1, 84, 84),
            dtype=np.uint8,
        )
    def observation(self, observation):
        return cv2.resize(observation, (84, 84), interpolation=cv2.INTER_AREA)

class GreyScale(ObservationWrapper): # Convert to greyscale
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(
            low=0,
            high=1.0,
            shape=(1, env.observation_space.shape[1], env.observation_space.shape[2]),
            dtype=np.uint8,
        )
    def transform(self, observation):
        return np.dot(observation[..., :3], [0.299, 0.587, 0.114]).reshape(1, observation.shape[0], observation.shape[1])

    def observation(self, observation):
        observation = self.transform(observation)
        return observation.astype(np.float32) / 255.0
    

env = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode='robot')
env = JoypadSpace(env, actions.SIMPLE_MOVEMENT)
env = ReShape(env)
env = GreyScale(env)
env = SmallBox84(env)

  f"The environment {id} is out of date. You should consider "
  f"The environment creator metadata doesn't include `render_modes`, contains: {list(env_creator.metadata.keys())}"


In [21]:
gamma = 0.95 # discount reward
update = 0.001 # learning rate
epsilon_start = 0.99 # initial exploration rate
epsilon_end = 0.001 # final exploration rate
e_decay = 2500 # decay rate
action_n = env.action_space.n
obs, info = env.reset()

network = DQN_full(obs.shape, action_n).to(system)

optimiser = opt.AdamW(network.parameters(), lr=update)
memory = ReplayMemory(5000) 
steps_passed = 0

def select_action(state, network):
    global steps_passed
    sample = random.random()
    steps_passed += 1
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * steps_passed / e_decay)
    if sample > epsilon:
        with torch.no_grad():
            return network(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[env.action_space.sample()]], device=system, dtype=torch.long)


## Model Optimiser

In [22]:
batch_size = 32
clean_mem()

def optimize_model():
    if len(memory) < batch_size: # take 32 of 5000 to update model params
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=system, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = network(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(batch_size, device=system)
    with torch.no_grad():
        next_state_values[non_final_mask] = network(non_final_next_states).max(1).values
    
    # Compute expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimiser.zero_grad()
    loss.backward()
    
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(network.parameters(), 15)
    optimiser.step()


## Training Loop

In [24]:
if system == torch.device("cuda"):
    episodes = 600
else:
    episodes = 15

episode_rewards = []
episode_steps = []

clean_mem()
for episode_n in range(episodes):
    print("Starting Episode ", episode_n)
    state, info = env.reset()
    state = torch.tensor(state.copy(), dtype=torch.float32, device=system).unsqueeze(0)
    total_reward = 0
    done = None
    steps_passed = 0
    for i in range(1000): # or use count() until terminated
        action = select_action(state, network)
        obs, reward, terminated, truncated, info = env.step(action.item())
        total_reward += reward
        reward = torch.tensor([reward], device=system)
        done = terminated or truncated
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)

        memory.push(state, action, next_state, reward)
        state = next_state
        if i % 5 == 0:
            optimize_model()

        if i % 100 == 0:
            clean_mem()
        if done:
            break

    clean_mem()
        
    episode_rewards.append(total_reward)
    print(total_reward)

print('Done Training')

if system == torch.device("cuda"):
    torch.save(network.state_dict(), 'mario_DQN_full.pth')
else:
    torch.save(network.state_dict(), 'mario_DQN_short.pth')


Starting Episode  0
504.0
Starting Episode  1


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 227082240 bytes.

In [15]:
# Cleanup ready for DDQN
del memory, optimiser, network
clean_mem()

## /// DDQN ///

Since it is a variant of the DQN algorithm, some of the assets remain the same.


In [16]:
memory = ReplayMemory(5000)

class DDQN(nn.Module):
    def __init__(self, shape, actions_n):
        super(DDQN, self).__init__()
        self.convolution = nn.Sequential( # Convolutional layers
        nn.Conv2d(shape[0], 16, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, stride=1),
        nn.ReLU()
        )

        output_size = self._get_conv_out(shape)
        self.fully_connected = nn.Sequential( # 32 neurons -> 64 neurons -> "n" actions 
            nn.Linear(output_size, 64),
            nn.ReLU(),
            nn.Linear(64, actions_n)
        )
    def _get_conv_out(self, shape):
        with torch.no_grad():
            blank = torch.zeros(1, *shape)
            out = self.convolution(blank)
            return int(np.prod(out.size())) 

    def forward(self, x):
        out = self.convolution(x).view(x.size(0), -1)
        return self.fully_connected(out
        )

primary_network = DQN_full(obs.shape, action_n).to(system) 
target_network = DDQN(obs.shape, action_n).to(system)
target_network.load_state_dict(primary_network.state_dict())
target_network.eval()

optimiser = opt.AdamW(primary_network.parameters(), lr=update)

def optimize_ddqn():
    if len(memory) < batch_size: # take 32 of 5000 to update model params
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=system, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = primary_network(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(batch_size, device=system)
    with torch.no_grad():
        if len(non_final_next_states) > 0:
            # Select best actions using the primary network
            q_values_primary = primary_network(non_final_next_states)
            best_actions = q_values_primary.argmax(1).unsqueeze(1)

            q_values_target = target_network(non_final_next_states)
            next_state_values[non_final_mask] = q_values_target.gather(1, best_actions).squeeze(1)

    expected_state_action_values = (next_state_values * gamma) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimiser.zero_grad()
    loss.backward()
    
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(primary_network.parameters(), 15)
    optimiser.step()


def optimise_target_network(tau=0.001):
    for target_param, primary_param in zip(target_network.parameters(), primary_network.parameters()):
        target_param.data.copy_(tau * primary_param.data + (1.0 - tau) * target_param.data)

### Training Loop for DDQN

In [19]:
if system == torch.device("cuda"):
    episodes = 600
else:
    episodes = 15

episode_rewards = []
episode_steps = []
clean_mem()

for episode_n in range(episodes):
    print("Starting Episode ", episode_n)
    state, info = env.reset()
    state = torch.tensor(state.copy(), dtype=torch.float32, device=system).unsqueeze(0)
    total_reward = 0
    done = None
    steps_passed = 0
    for i in range(100): # or use count() to go until terminated
        action = select_action(state, primary_network)
        obs, reward, terminated, truncated, info = env.step(action.item())
        total_reward += reward
        reward = torch.tensor([reward], device=system)
        done = terminated or truncated
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)

        memory.push(state, action, next_state, reward)
        state = next_state
        if i % 5 == 0:
            optimize_ddqn()
            optimise_target_network(tau=0.001)

        if i % 100 == 0:
            clean_mem()
        if done:
            break
        
    episode_rewards.append(total_reward)
    print(total_reward)

    clean_mem()

print('Done Training')

if system == torch.device("cuda"):
    torch.save(primary_network.state_dict(), 'mario_DDQN_full.pth')
else:
    torch.save(primary_network.state_dict(), 'mario_DDQN_short.pth')

Starting Episode  0
113.0
Starting Episode  1
87.0
Starting Episode  2
122.0
Starting Episode  3
106.0
Starting Episode  4
127.0
Starting Episode  5
114.0
Starting Episode  6
134.0
Starting Episode  7


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2416640 bytes.

In [7]:
# Cleanup ready for A2C
del primary_network, target_network, optimiser, memory 
clean_mem()

NameError: name 'primary_network' is not defined

# /// A2C Algorithm ///

To save time, we will use some of the same classes and functions from DQN. Make sure you run all of the prior code (except the training loop for DQN).

In [11]:
steps_passed = 0
episode_rewards = []
episode_steps = []
gamma = 0.99
a2c_update = 0.001

if system == torch.device("cuda"):
    episodes = 600
else:
    episodes = 40


class CriticNetwork(nn.Module):

    def __init__(self, shape, actions_n):
        super(CriticNetwork, self).__init__()
        self.convolution = nn.Sequential( # combined convolutional layers
        nn.Conv2d(shape[0],16, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, stride=1),
        nn.ReLU()
        )

        output_size = self._get_conv_out(shape)
        self.actor = nn.Sequential(  
            nn.Linear(output_size, 64),
            nn.ReLU(),
            nn.Linear(64, actions_n),
            nn.ReLU()
        )
        self.critic = nn.Sequential(
            nn.Linear(output_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def _get_conv_out(self, shape):
        with torch.no_grad():
            blank = torch.zeros(1, *shape)
            out = self.convolution(blank)
            return int(np.prod(out.size())) 

    def get_values(self, x): # for debugging
        x = self.convolution(x)
        actor = self.actor(x).view(x.size(0), -1)
        critic = self.critic(x).view(x.size(0), -1)
        return actor, critic
    
    def forward(self, x):
        x = self.convolution(x).view(x.size(0), -1)
        probs = nn.Softmax(dim=1)(self.actor(x))
        state_values = self.critic(x)
        return probs, state_values


a2c_model = CriticNetwork(obs.shape, action_n).to(system)
a2c_optimiser = opt.Adam(a2c_model.parameters(), lr=a2c_update)

clean_mem()
for episode_n in range(episodes):
    print("Starting Episode ", episode_n)
    state, info = env.reset()
    state = torch.tensor(state.copy(), dtype=torch.float32, device=system).unsqueeze(0)
    total_reward = 0
    probability_logs = []
    rewards = []
    values = []
    for i in range(1000):  # or count():
        action_probability, value = a2c_model(state)
        action = torch.multinomial(action_probability, 1).item()
        prob_log = torch.log(action_probability.squeeze(0)[action])
        obs, reward, terminated, truncated, info = env.step(action)
        next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)
        probability_logs.append(prob_log)
        total_reward += reward
        rewards.append(reward)
        values.append(value)
        done = terminated or truncated
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)
        state = next_state

        if i % 100 == 0:
            clean_mem()

        if done:
            break

    episode_steps.append(i+1)
    episode_rewards.append(total_reward)
    print(total_reward)


    returns = []
    discounted_return = 0
    for reward in reversed(rewards):
        discounted_return = reward + gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns, dtype=torch.float32, device=system)

    probability_logs = torch.stack(probability_logs)
    values = torch.cat(values).squeeze()
    advantage = returns - values

    actor_loss = -(probability_logs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    loss = actor_loss + critic_loss
    a2c_optimiser.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(a2c_model.parameters(), 0.5)
    a2c_optimiser.step()

    clean_mem()

print('Done Training')

if system == torch.device("cuda"):
    torch.save(a2c_model.state_dict(), 'mario_A2C_full.pth')
else:
    torch.save(a2c_model.state_dict(), 'mario_A2C_short.pth')

Starting Episode  0
757.0
Starting Episode  1
504.0
Starting Episode  2
504.0
Starting Episode  3
-58.0
Starting Episode  4
-46.0
Starting Episode  5
-50.0
Starting Episode  6
-50.0
Starting Episode  7
-50.0
Starting Episode  8
-50.0
Starting Episode  9
-46.0
Starting Episode  10
52.0
Starting Episode  11
776.0
Starting Episode  12
504.0
Starting Episode  13
744.0
Starting Episode  14
504.0
Starting Episode  15
504.0
Starting Episode  16
507.0
Starting Episode  17
507.0
Starting Episode  18
241.0
Starting Episode  19
-11.0
Starting Episode  20
-32.0
Starting Episode  21
-40.0
Starting Episode  22
-44.0
Starting Episode  23
-53.0
Starting Episode  24
-45.0
Starting Episode  25
-41.0
Starting Episode  26
-25.0
Starting Episode  27
43.0
Starting Episode  28
632.0
Starting Episode  29
504.0
Starting Episode  30
504.0
Starting Episode  31
504.0
Starting Episode  32
504.0
Starting Episode  33
743.0
Starting Episode  34
504.0
Starting Episode  35
504.0
Starting Episode  36
631.0
Starting Epis