In [1]:
!pip install gym_super_mario_bros==7.3.0 nes_py
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install stable-baselines3[extra]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym_super_mario_bros==7.3.0
  Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 14.1 MB/s 
[?25hCollecting nes_py
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
Collecting pyglet<=1.5.21,>=1.4.0
  Downloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 65.3 MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-8.2.1-cp37-cp37m-linux_x86_64.whl size=435617 sha256=00ca2eaed1bbd7f7ee6e90b31cd8e1bb0ea049ef9c9ab611b73c91aef905ae3a
  Stored in directory: /root/.cache/pip/wheels/17/96/0e/22a8c7dbdf412d8e988286f223b223baf0f4ad90c9e699c56d
Successfully built nes-py
Installing collected packages: pyglet, nes-py, gym-sup

In [2]:
import io
import base64
import os 
import gym_super_mario_bros

from matplotlib import pyplot as plt
from IPython.display import HTML
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

In [3]:
# Base emulator, with the controller restricted to the SIMPLE_MOVEMENT action list.
base_env = JoypadSpace(
    gym_super_mario_bros.make('SuperMarioBros-v3'),
    SIMPLE_MOVEMENT,
)

# Grayscale frames; keep_dim=True keeps the trailing channel axis.
gray_env = GrayScaleObservation(base_env, keep_dim=True)

# Vectorise the single env, then stack the 4 most recent frames along the
# last (channel) axis.
vec_env = DummyVecEnv([lambda: gray_env])
game_envi = VecFrameStack(vec_env, n_stack=4, channels_order='last')

# Smoke test: reset, then take one step with action index 5 (an index into
# SIMPLE_MOVEMENT). The list holds one action per vectorised env.
state = game_envi.reset()
state, reward, done, info = game_envi.step([5])

In [4]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically checkpoints the model being trained.

    Every ``check_freq`` calls to ``_on_step`` the current model is saved
    under ``save_path`` as ``best_model_<n_calls>``.

    Args:
        check_freq: Number of callback steps between two checkpoints.
            Must be a positive number.
        save_path: Directory the checkpoints are written to. ``None``
            disables checkpointing.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``check_freq`` is not positive.
    """

    def __init__(self, check_freq, save_path, verbose: int = 1):
        # Fail at construction instead of with a ZeroDivisionError on the
        # first training step.
        if check_freq <= 0:
            raise ValueError(
                'check_freq must be a positive number, got {!r}'.format(check_freq)
            )
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self) -> None:
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save a checkpoint every ``check_freq`` calls.

        Returns:
            Always ``True``, so this callback never stops training.
        """
        # Mirror the None check in _init_callback: without a save path there
        # is nowhere to write, and os.path.join(None, ...) would raise.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [5]:
# Output locations, relative to the notebook's working directory.
LOG_DIR = 'logs'
CHECKPOINT_DIR = 'checkpoint'

# Checkpoint the model every 10,000 callback steps.
CB = TrainAndLoggingCallback(save_path=CHECKPOINT_DIR, check_freq=10_000)

In [6]:
# PPO hyper-parameters gathered in one place so they are easy to tune.
ppo_settings = dict(
    learning_rate=1e-6,
    n_steps=512,
    tensorboard_log=LOG_DIR,
    verbose=1,
)

# 'CnnPolicy' because the observations are stacked image frames.
model = PPO('CnnPolicy', game_envi, **ppo_settings)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [7]:
# Train for 50,000 timesteps; CB writes periodic checkpoints along the way.
model.learn(callback=CB, total_timesteps=50_000)

Logging to logs/PPO_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


----------------------------
| time/              |     |
|    fps             | 42  |
|    iterations      | 1   |
|    time_elapsed    | 12  |
|    total_timesteps | 512 |
----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 50            |
|    iterations           | 2             |
|    time_elapsed         | 20            |
|    total_timesteps      | 1024          |
| train/                  |               |
|    approx_kl            | 1.0916963e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.95         |
|    explained_variance   | -0.000431     |
|    learning_rate        | 1e-06         |
|    loss                 | 62.5          |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.000128     |
|    value_loss           | 187           |
-------------------------------------------
-----

<stable_baselines3.ppo.ppo.PPO at 0x7f4d75be7fd0>

In [8]:
# Persist the final trained policy next to the notebook.
model.save(path='Cruz_Mario_Final_Model')

In [9]:
# Reset the environment and keep the returned initial observation.
state = game_envi.reset()

In [10]:
from gym import wrappers

# Record a rollout of up to 4000 steps into ./gym-results.
# NOTE: this builds a fresh 'SuperMarioBros2-v0' env and samples random
# actions from its action space; the trained `model` is not used here.
# NOTE: `game_envi` is rebound from the vectorised training env to the
# Monitor wrapper; the video cell below reads `game_envi.file_infix`.
game_envi = gym_super_mario_bros.make('SuperMarioBros2-v0')
game_envi = wrappers.Monitor(game_envi, "./gym-results", force=True)
try:
    game_envi.reset()
    for _ in range(4000):
        action = game_envi.action_space.sample()
        state, reward, done, info = game_envi.step(action)
        if done:
            break
finally:
    # Always close, even if reset/step raises, so the env is released.
    game_envi.close()

  return (self.ram[0x86] - self.ram[0x071c]) % 256


In [11]:
# Recording written by the Monitor wrapper for the first episode.
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % game_envi.file_infix

# Open read-only and close the handle deterministically; the file is only read.
with open(video_path, 'rb') as video_file:
    video = video_file.read()

# Embed the MP4 inline as a base64 data URI so it plays inside the notebook.
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))