# Super Mario Bros RL Notebook
## Rohan H, Talaal S, Thang N, Yuet W
### University of Bath

![SegmentLocal](mario!.gif "segment")
(we are going to have to delete him sadly...)


### How to run:
The official website for the game environment can be found here: https://pypi.org/project/gym-super-mario-bros/

In a nutshell, you will need:
- Python 3.5/3.6/3.7/3.8 (I have tested on 3.7)
- gymnasium (gym is deprecated)
- ipykernel for running the notebook
- gym-super-mario-bros 
- other essential packages/libraries like NumPy
- an average computer

### Check if the environment works

In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env_example = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode='human')
env_example = JoypadSpace(env_example, SIMPLE_MOVEMENT)

test = False
done = True
if test:
    for step in range(1000):
        if done:
            obs, info = env_example.reset()
        obs, reward, terminated, truncated, info = env_example.step(env_example.action_space.sample())
        done = not terminated or truncated

env_example.close()

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  f"The environment {id} is out of date. You should consider "
  f"The environment creator metadata doesn't include `render_modes`, contains: {list(env_creator.metadata.keys())}"


# /// DQN Algorithm ///

## Importing Libraries, Defining Functions and Classes

In [None]:
# Rohan's Cell

# YOUR CODE HERE
# example here: https://docs.pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import gym
from gym.spaces import Box
from gym.wrappers.frame_stack import FrameStack
from gym.core import ObservationWrapper
import gym_super_mario_bros.actions as actions

import torch
import torch.nn as nn
import torch.optim as opt
#import torch.nn.functional as func
#from torchvision import transforms as T

from collections import namedtuple, deque
#from itertools import count
#import itertools

import cv2

def device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    else:
        return torch.device("cpu")

system = device()

Transition = namedtuple('Transition',
    ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


class DQN_full(nn.Module):

    def __init__(self, shape, actions_n):
        super(DQN_full, self).__init__()
        self.convolution = nn.Sequential( # Convolutional layers
        nn.Conv2d(shape[0], 16, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, stride=1),
        nn.ReLU()
        )

        output_size = self._get_conv_out(shape)
        self.fully_connected = nn.Sequential( # 64 neurons -> 512 neurons -> "n" actions 
            nn.Linear(output_size, 64),
            nn.ReLU(),
            nn.Linear(64, actions_n)
        )
    def _get_conv_out(self, shape):
        with torch.no_grad():
            blank = torch.zeros(1, *shape)
            out = self.convolution(blank)
            return int(np.prod(out.size())) 

    def forward(self, x):
        out = self.convolution(x).view(x.size(0), -1)
        return self.fully_connected(out)
    

class ReShape(ObservationWrapper): # Put RGB channel as first argument
    def __init__(self, env):
        super().__init__(env)
        obs_shape = env.observation_space.shape
        self.observation_space = Box(
            low=self.observation_space.low.min(),
            high=self.observation_space.high.max(),
            shape=(obs_shape[2], obs_shape[0], obs_shape[1]),
            dtype=env.observation_space.dtype,
        )

    def observation(self, observation):
        return np.transpose(observation, (2, 0, 1))

class SmallBox84(ObservationWrapper): # Resize to 84x84
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(
            low=0,
            high=1.0,
            shape=(1, 84, 84),
            dtype=np.uint8,
        )
    def observation(self, observation):
        return cv2.resize(observation, (84, 84), interpolation=cv2.INTER_AREA)

class GreyScale(ObservationWrapper): # Convert to greyscale
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(
            low=0,
            high=1.0,
            shape=(1, env.observation_space.shape[1], env.observation_space.shape[2]),
            dtype=np.uint8,
        )
    def transform(self, observation):
        return np.dot(observation[..., :3], [0.299, 0.587, 0.114]).reshape(1, observation.shape[0], observation.shape[1])

    def observation(self, observation):
        observation = self.transform(observation)
        return observation.astype(np.float32) / 255.0
    

env = gym_super_mario_bros.make('SuperMarioBros-v2', apply_api_compatibility=True, render_mode='robot')
env = JoypadSpace(env, actions.SIMPLE_MOVEMENT)
env = ReShape(env)
env = GreyScale(env)
env = SmallBox84(env)

In [7]:
gamma = 0.95 # discount reward
update = 0.004 # learning rate
epsilon_start = 0.9 # initial exploration rate
epsilon_end = 0.005 # final exploration rate
e_decay = 2500 # decay rate
action_n = env.action_space.n
obs, info = env.reset()

network = DQN_full(obs.shape, action_n).to(system)

optimiser = opt.AdamW(network.parameters(), lr=update)
memory = ReplayMemory(1000) 

steps_passed = 0

def select_action(state):
    global steps_passed
    sample = random.random()
    steps_passed += 1
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * steps_passed / e_decay)
    if sample > epsilon:
        with torch.no_grad():
            return network(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[env.action_space.sample()]], device=system, dtype=torch.long)


## Model Optimiser

In [8]:
batch_size = 50 #UPDATE

def optimize_model():
    if len(memory) < batch_size: # take 250 of 10000 to update model params
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=system, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = network(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(batch_size, device=system)
    with torch.no_grad():
        next_state_values[non_final_mask] = network(non_final_next_states).max(1).values
    
    # Compute expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimiser.zero_grad()
    loss.backward()
    
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(network.parameters(), 15)
    optimiser.step()


## Training Loop

In [None]:
if system == torch.device("cuda"):
    episodes = 600
else:
    episodes = 15

episode_rewards = []
episode_steps = []
steps_passed = 0

for episode_n in range(episodes):
    print("Starting Episode ", episode_n)
    state, info = env.reset()
    state = torch.tensor(state.copy(), dtype=torch.float32, device=system).unsqueeze(0)
    total_reward = 0
    done = None
    for i in range(1, 100):  # count():
        action = select_action(state)
        obs, reward, terminated, truncated, info = env.step(action.item())
        reward = torch.tensor([reward], device=system)
        done = terminated or truncated
        total_reward += reward
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)

        memory.push(state, action, next_state, reward)
        state = next_state

        optimize_model()

        if done:
            break
        
    episode_rewards.append(total_reward)
    print(total_reward)

print('Done Training')

torch.save(network.state_dict(), 'mario_DQN.pth')


Starting Episode  0
tensor([164.])
Starting Episode  1
tensor([175.])
Starting Episode  2
tensor([155.])
Starting Episode  3
tensor([148.])
Starting Episode  4
tensor([136.])
Starting Episode  5
tensor([141.])
Starting Episode  6
tensor([170.])
Starting Episode  7
tensor([171.])
Starting Episode  8
tensor([137.])
Starting Episode  9
tensor([215.])
Starting Episode  10
tensor([196.])
Starting Episode  11
tensor([197.])
Starting Episode  12
tensor([172.])
Starting Episode  13
tensor([168.])
Starting Episode  14
tensor([184.])
Done Training


# /// A2C Algorithm ///

To save time, we will use some of the same classes and functions from DQN. Make sure you run all of the prior code (except the training loop for DQN).

In [13]:
steps_passed = 0
episode_rewards = []
episode_steps = []
gamma = 0.99
a2c_update = 0.05

if system == torch.device("cuda"):
    episodes = 600
else:
    episodes = 40


class CriticNetwork(nn.Module):

    def __init__(self, shape, actions_n):
        super(CriticNetwork, self).__init__()
        self.convolution = nn.Sequential( # combined convolutional layers
        nn.Conv2d(shape[0],32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU()
        )

        output_size = self._get_conv_out(shape)
        self.actor = nn.Sequential(  
            nn.Linear(output_size, 512),
            nn.ReLU(),
            nn.Linear(512, actions_n),
            nn.ReLU()
        )
        self.critic = nn.Sequential(
            nn.Linear(output_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def _get_conv_out(self, shape):
        with torch.no_grad():
            blank = torch.zeros(1, *shape)
            out = self.convolution(blank)
            return int(np.prod(out.size())) 

    def get_values(self, x): # for debugging
        x = self.convolution(x)
        actor = self.actor(x).view(x.size(0), -1)
        critic = self.critic(x).view(x.size(0), -1)
        return actor, critic
    
    def forward(self, x):
        x = self.convolution(x).view(x.size(0), -1)
        probs = nn.Softmax(dim=1)(self.actor(x))
        state_values = self.critic(x)
        return probs, state_values


a2c_model = CriticNetwork(obs.shape, action_n).to(system)
a2c_optimiser = opt.Adam(a2c_model.parameters(), lr=a2c_update)


for episode_n in range(episodes):
    print("Starting Episode ", episode_n)
    state, info = env.reset()
    state = torch.tensor(state.copy(), dtype=torch.float32, device=system).unsqueeze(0)
    total_reward = 0
    probability_logs = []
    rewards = []
    values = []
    for i in range(1, 1000):  # count():
        action_probability, value = a2c_model(state)
        action = torch.multinomial(action_probability, 1).item()
        prob_log = torch.log(action_probability.squeeze(0)[action])
        obs, reward, terminated, truncated, info = env.step(action)
        next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)
        probability_logs.append(prob_log)
        total_reward += reward
        rewards.append(reward)
        values.append(value)
        done = terminated or truncated
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device=system).unsqueeze(0)
        state = next_state

        if done:
            break
    episode_steps.append(i+1)
    episode_rewards.append(total_reward)
    print(total_reward)


    returns = []
    discounted_return = 0
    for reward in reversed(rewards):
        discounted_return = reward + gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns, dtype=torch.float32, device=system)

    probability_logs = torch.stack(probability_logs)
    values = torch.cat(values).squeeze()
    advantage = returns - values

    actor_loss = -(probability_logs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    loss = actor_loss + critic_loss
    a2c_optimiser.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(a2c_model.parameters(), 0.5)
    a2c_optimiser.step()

print('Done Training')

torch.save(a2c_model.state_dict(), 'mario_A2C.pth')

Starting Episode  0
632.0
Starting Episode  1
680.0
Starting Episode  2
680.0
Starting Episode  3
680.0
Starting Episode  4
-90.0
Starting Episode  5
577.0
Starting Episode  6
297.0
Starting Episode  7
504.0
Starting Episode  8
632.0
Starting Episode  9
-43.0
Starting Episode  10
632.0
Starting Episode  11
632.0
Starting Episode  12
632.0
Starting Episode  13
1262.0
Starting Episode  14
1123.0
Starting Episode  15
1302.0
Starting Episode  16
1363.0
Starting Episode  17
1284.0
Starting Episode  18
1451.0
Starting Episode  19
680.0
Starting Episode  20
680.0
Starting Episode  21
825.0
Starting Episode  22
680.0
Starting Episode  23
680.0
Starting Episode  24
680.0
Starting Episode  25
680.0
Starting Episode  26
680.0
Starting Episode  27
680.0
Starting Episode  28


KeyboardInterrupt: 