# Super Mario Bros RL Notebook
## Rohan H, Talaal S, Thang N, Yuet W
### University of Bath

![SegmentLocal](mario!.gif "segment")
(we are going to have to delete him sadly...)


### How to run:
The official website for the game environment can be found here: https://pypi.org/project/gym-super-mario-bros/

In a nutshell, you will need:
- Python 3.5/3.6/3.7/3.8 (I have tested on 3.7)
- gymnasium (gym is deprecated)
- ipykernel for running the notebook
- gym-super-mario-bros 
- other essential packages/libraries like NumPy
- an average computer

In [2]:
from nes_py.wrappers import JoypadSpace
import time
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# Build the demo environment in one expression: the raw Super Mario Bros
# environment (rendered in a window via render_mode='human') wrapped in
# JoypadSpace so that only the SIMPLE_MOVEMENT action set is exposed.
env_example = JoypadSpace(
    gym_super_mario_bros.make(
        'SuperMarioBros-v0',
        apply_api_compatibility=True,
        render_mode='human',
    ),
    SIMPLE_MOVEMENT,
)

def env_run(env, steps, delay=0.01):
    """Drive ``env`` with randomly sampled actions, then close it.

    The environment is reset before the first step and again whenever the
    previous step reported ``terminated`` or ``truncated``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            5-tuple ``(obs, reward, terminated, truncated, info)``,
            ``action_space.sample()`` and ``close()``.
        steps: Total number of actions to take.
        delay: Seconds to sleep after every step so a rendered window stays
            watchable. Pass ``0`` to run as fast as possible.

    Raises:
        Whatever ``env.reset()`` / ``env.step()`` raise; the environment is
        still closed in that case.
    """
    try:
        done = True  # Forces a reset before the very first step.
        for _ in range(steps):
            if done:
                env.reset()
            _, _, terminated, truncated, _ = env.step(env.action_space.sample())
            done = terminated or truncated
            if delay > 0:
                time.sleep(delay)
    finally:
        # Always release the environment, even when a step fails part-way.
        env.close()

# Demo run: take 500 randomly sampled actions in the example environment.
env_run(env_example, steps=500)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  f"The environment {id} is out of date. You should consider "
  f"The environment creator metadata doesn't include `render_modes`, contains: {list(env_creator.metadata.keys())}"


## Independent Work

Each cell below belongs to one member of the group. If you have independent work to get on with, do it in your own cell. Branches have not been organised yet; they will be split either by member or by task.

In [None]:
# Talaal's Cell

# YOUR CODE HERE

In [None]:
# Rohan's Cell

# YOUR CODE HERE
import numpy as np
from statistics import mean
import random

class MCAgent:
    """First-visit Monte Carlo agent with an epsilon-greedy policy.

    Action values are estimated as the mean of the discounted returns
    recorded for each ``(state, action)`` pair in ``self.returns``.

    Attributes are plain instance fields:
        env: Environment exposing ``reset()``, ``step(action)`` (5-tuple
            result) and ``action_space.n``.
        returns: Maps ``(state, action)`` to the list of returns observed
            after first visits to that pair.
        epsilon: Probability of taking a random action when not greedy.
        gamma: Discount factor applied to future rewards.
        q_values: Initialised to an empty dict; not read by this class.
    """

    def __init__(self, env, epsilon = 0.15, gamma = 0.9) :
        self.env = env
        self.returns = {}
        self.epsilon = epsilon
        self.gamma = gamma
        self.q_values = {}

    def state(self, state):
        """Return a hashable version of ``state``.

        NumPy arrays are flattened into a tuple so they can be used as
        dictionary keys; any other value is returned unchanged.
        """
        if isinstance(state, np.ndarray):
            return tuple(state.flatten())
        return state

    def policy(self, state, greedy = False) :
        """Choose an action for ``state``.

        Args:
            state: Raw or already-hashable state.
            greedy: When true, never explore; always pick the action with
                the highest estimated value.

        Returns:
            An action index in ``range(self.env.action_space.n)``.
        """
        state = self.state(state)
        available_actions = list(range(self.env.action_space.n))

        # Shuffling first gives random tie-breaking: max() below returns the
        # first best action it meets.
        random.shuffle(available_actions)

        # Take random action.
        if not greedy and random.random() < self.epsilon:
            return random.choice(available_actions)

        # Take greedy action; unseen pairs are valued at 0.
        return max(
            available_actions,
            key=lambda a: mean(self.returns.get((state, a), [0])),
        )

    def learn(self, state_action_pairs, rewards, next_states) :
        """Record first-visit returns from one episode of experience.

        The three lists are consumed from the end (they are empty when this
        method returns), walking the episode backwards so the discounted
        return can be accumulated incrementally.

        Args:
            state_action_pairs: ``(state, action)`` per time-step, in order.
            rewards: Reward received after each time-step.
            next_states: State reached after each time-step (consumed in
                step with the other lists; not otherwise used here).
        """
        # Count how often each pair occurs so "is there an earlier visit?"
        # is a dictionary lookup rather than a scan of the remaining list.
        remaining_visits = {}
        for pair in state_action_pairs:
            remaining_visits[pair] = remaining_visits.get(pair, 0) + 1

        episode_return = 0

        # Loop through our episode experience backwards.
        while state_action_pairs:
            pair = state_action_pairs.pop()
            reward = rewards.pop()
            next_states.pop()

            # Update the return earned after this time-step.
            episode_return = reward + self.gamma * episode_return

            # Only the earliest occurrence in the episode (no visits left
            # before it) contributes a return.
            remaining_visits[pair] -= 1
            if remaining_visits[pair] == 0:
                self.returns.setdefault(pair, []).append(episode_return)

    def generate_episode(self, max_steps = 1000) :
        """Run one episode in ``self.env`` using the current policy.

        The episode stops as soon as the environment reports ``terminated``
        or ``truncated``, or after ``max_steps`` actions, whichever comes
        first.

        Args:
            max_steps: Upper bound on actions taken, to prevent infinite
                episodes.

        Returns:
            Tuple ``(state_action_pairs, rewards, next_states,
            summed_rewards)`` where the first three are per-step lists and
            the last is the undiscounted sum of rewards.
        """
        # Initialise variables for storing our agent's experience.
        state_action_pairs = []
        rewards = []
        next_states = []
        summed_rewards = 0

        # Initialise environment.
        state, info = self.env.reset()
        state = self.state(state)
        terminal = False
        limit = max_steps

        # Both conditions must hold to continue: stepping a finished
        # environment is invalid, and the cap must end endless episodes.
        while not terminal and limit > 0:
            action = self.policy(state)
            next_state, reward, terminated, truncated, info = self.env.step(action)
            terminal = terminated or truncated

            next_state = self.state(next_state)
            state_action_pairs.append((state, action))
            rewards.append(reward)
            next_states.append(next_state)

            state = next_state

            summed_rewards += reward
            limit -= 1

        return state_action_pairs, rewards, next_states, summed_rewards
    

# Disabled training driver: the code below is wrapped in a bare string
# literal so it does not execute. When enabled it trains `num_agents`
# independent MCAgent instances for `num_episodes` episodes each and stores
# every agent's per-episode summed rewards in `mc_rewards`.
# NOTE(review): render_mode='robot' and 'SuperMarioBros-v3' differ from the
# example cell ('human', 'SuperMarioBros-v0') -- confirm both are accepted
# by gym_super_mario_bros before re-enabling.
'''
num_agents = 4
num_episodes = 50
mc_rewards = []


env = gym_super_mario_bros.make('SuperMarioBros-v3', apply_api_compatibility=True, render_mode='robot')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

for agent in range(0, num_agents) :
    #print("Agent {}".format(agent + 1))
    episode_rewards = []

    mcagent = MCAgent(env)
    
    for episode in range(0, num_episodes) :

        # Update cumulative reward for episode.
        state_action_pairs, rewards, next_states, sum_rewards = mcagent.generate_episode()
        mcagent.learn(state_action_pairs, rewards, next_states)

        episode_rewards.append(sum_rewards)
    mc_rewards.append(episode_rewards)

'''


In [None]:
# Thang's Cell

# YOUR CODE HERE

In [None]:
# Yuet's Cell

# YOUR CODE HERE