# Super Mario Bros RL Notebook
## Rohan H, Talaal S, Thang N, Yuet W
### University of Bath

![SegmentLocal](mario!.gif "segment")
(we are going to have to delete him sadly...)


### How to run:
The official website for the game environment can be found here: https://pypi.org/project/gym-super-mario-bros/

In a nutshell, you will need:
- Python 3.5/3.6/3.7/3.8 (I have tested on 3.7)
- gymnasium (gym is deprecated)
- ipykernel for running the notebook
- gym-super-mario-bros 
- other essential packages/libraries like NumPy
- an average computer

In [4]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env_example = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode='human')
env_example = JoypadSpace(env_example, SIMPLE_MOVEMENT)

test = False
done = True
if test:
    for step in range(1000):
        if done:
            obs, info = env_example.reset()
        obs, reward, terminated, truncated, info = env_example.step(env_example.action_space.sample())
        done = terminated or truncated

    env_example.close()

  f"The environment {id} is out of date. You should consider "
  f"The environment creator metadata doesn't include `render_modes`, contains: {list(env_creator.metadata.keys())}"


## Independent Work

Each of the cells below will belong to each member of the group. If you have independent work to be getting on with, you should do it in one of the cells below. Branches are yet to be organised, either by member or by task. 

In [None]:
# Talaal's Cell

# YOUR CODE HERE

## Importing Libraries, Defining Functions and Classes

In [None]:
# Rohan's Cell

# YOUR CODE HERE
# example here: https://docs.pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as opt
import torch.nn.functional as F

env = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode='robot')
env = JoypadSpace(env_example, SIMPLE_MOVEMENT)

def device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    #elif torch.backends.mps.is_available():
    #   return torch.device("mps")
    else:
        return torch.device("cpu")

from collections import namedtuple, deque
from itertools import count
Transition = namedtuple('Transition',
    ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


class DQN(nn.Module):

    def __init__(self, observations_n, actions_n):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(observations_n, 32)
        self.layer2 = nn.Linear(32, 32)
        self.layer3 = nn.Linear(32, actions_n)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
    

batch_size = 50
gamma = 0.9 # discount reward
update = 0.05
epsilon = 0.15 # exploration

action_n = len(env.action_space)
state, info = env.reset()
observation_n = len(state)

network = DQN(observation_n, action_n).to(device)
optimiser = opt.AdamW(network.parameters(), lr=update, amsgrad=True)
memory = ReplayMemory(100) #increase to 10000 when training

steps_passed = 0

def select_action(state):
    global steps_passed
    sample = random.random()
    steps_passed += 1
    if sample > epsilon:
        with torch.no_grad():
            return network(state).max(1).indices.view(1, 1)
    else:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
    



TypeError: object of type 'Discrete' has no len()

## Model Optimiser

In [7]:
def optimize_model():
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device="cpu", dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = network(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(batch_size, device="cpu")
    with torch.no_grad():
        next_state_values[non_final_mask] = network(non_final_next_states).max(1).values
    
    # Compute expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimiser.zero_grad()
    loss.backward()
    
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(network.parameters(), 100)
    optimiser.step()


## Training Loop

In [8]:
if torch.cuda.is_available():
    episodes = 600
else:
    episodes = 50

# episodes = 1000

for episode_n in range(episodes):
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device="cpu").unsqueeze(0)
    for i in count():
        action = select_action(state)
        obs, reward, terminated, truncated = env.step(action.item())
        reward = torch.tensor([reward], device="cpu")

        done = terminated or truncated
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype=torch.float32, device="cpu").unsqueeze(0)

        memory.push(state, action, next_state, reward)
        state = next_state

        optimize_model()

        if done:
            #episode_durations.append(i+1)
            #plot_durations()
            break

print('Done Training')
print('Graph support not implemented yet')


ValueError: At least one stride in the given numpy array is negative, and tensors with negative strides are not currently supported. (You can probably work around this by making a copy of your array  with array.copy().) 

In [None]:
# Thang's Cell

# YOUR CODE HERE

In [None]:
# Yuet's Cell

# YOUR CODE HERE