In [1]:
# Environment setup: installs the Python packages imported below (nes-py,
# gym-super-mario-bros, opencv-python) plus system packages via apt.
# NOTE(review): ffmpeg/libsm6/libxext6/libgl1-mesa-glx are presumably native
# dependencies of opencv-python - confirm before trimming this list.
# The commented-out line is an older pinned nes-py version kept for reference.
#!pip install nes-py==0.2.6
!pip install nes-py
!pip install gym-super-mario-bros
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!apt install -y libgl1-mesa-glx
!pip install opencv-python

Collecting nes-py
  Downloading nes_py-8.1.8.tar.gz (76 kB)
[?25l[K     |████▎                           | 10 kB 29.5 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 10.7 MB/s eta 0:00:01[K     |████████████▉                   | 30 kB 9.0 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 8.1 MB/s eta 0:00:01[K     |█████████████████████▍          | 51 kB 4.4 MB/s eta 0:00:01[K     |█████████████████████████▋      | 61 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████████  | 71 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 76 kB 2.8 MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-8.1.8-cp37-cp37m-linux_x86_64.whl size=434263 sha256=4b717fa83413a3d2bda3b828bc73ebd98da08acc71cea667fa39c75e2f0bb2c4
  Stored in directory: /root/.cache/pip/wheels/f2/05/1f/608f15ab43187096eb5f3087506419c2d9772e97000f3ba025
Succes

In [2]:
import torch
import torch.nn as nn
import random
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from tqdm import tqdm
import pickle 
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import gym
import numpy as np
import collections 
import cv2
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [3]:
class MaxAndSkipEnv(gym.Wrapper):
    """Repeat every action for `skip` emulator frames.

    Rewards are summed over the repeated frames, and the observation handed
    back is the element-wise maximum over the most recent raw frames (at most
    two are retained).
    """

    def __init__(self, env=None, skip=4):
        super().__init__(env)
        self._skip = skip
        # Bounded deque: only the two latest raw frames are kept for pooling.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early if the episode ends."""
        done = None
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            self._obs_buffer.append(obs)
            if done:
                break
        pooled_frame = np.stack(self._obs_buffer).max(axis=0)
        return pooled_frame, total_reward, done, info

    def reset(self):
        """Drop buffered frames, reset the wrapped env and seed the buffer with its first frame."""
        self._obs_buffer.clear()
        first_obs = self.env.reset()
        self._obs_buffer.append(first_obs)
        return first_obs


class ProcessFrame84(gym.ObservationWrapper):
    """
    Converts a 240x256 RGB frame to greyscale, resizes it to 84 wide by
    110 tall, and keeps rows 18..101 so the result is 84x84.

    Returns a uint8 numpy array of shape (84, 84, 1).
    """
    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(84, 84, 1),
            dtype=np.uint8,
        )

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        # Only frames with exactly 240*256*3 elements are accepted.
        if frame.size == 240 * 256 * 3:
            rgb = np.reshape(frame, [240, 256, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        red = rgb[:, :, 0]
        green = rgb[:, :, 1]
        blue = rgb[:, :, 2]
        # Weighted sum of the three colour channels -> single luminance plane.
        grey = red * 0.299 + green * 0.587 + blue * 0.114
        # cv2 takes (width, height), so this yields a 110-row by 84-column image.
        shrunk = cv2.resize(grey, (84, 110), interpolation=cv2.INTER_AREA)
        cropped = shrunk[18:102, :]
        cropped = np.reshape(cropped, [84, 84, 1])
        return cropped.astype(np.uint8)


class ImageToPyTorch(gym.ObservationWrapper):
    """Move the last observation axis to the front (H, W, C -> C, H, W)."""

    def __init__(self, env):
        super().__init__(env)
        src_shape = self.observation_space.shape
        # Declared shape puts the former last axis first, followed by the first two.
        reordered_shape = (src_shape[-1], src_shape[0], src_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=reordered_shape,
            dtype=np.float32,
        )

    def observation(self, observation):
        return np.moveaxis(observation, source=2, destination=0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Cast each observation to float32 and divide it by 255."""

    def observation(self, obs):
        as_float = np.asarray(obs, dtype=np.float32)
        return as_float / 255.0


class BufferWrapper(gym.ObservationWrapper):
    """Keep a rolling stack of the latest observations along axis 0.

    The declared observation space is the wrapped space repeated `n_steps`
    times along its first axis. `reset` must be called before the first
    observation, because that is where the internal buffer is allocated.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        super().__init__(env)
        self.dtype = dtype
        base_space = env.observation_space
        stacked_low = base_space.low.repeat(n_steps, axis=0)
        stacked_high = base_space.high.repeat(n_steps, axis=0)
        self.observation_space = gym.spaces.Box(stacked_low, stacked_high, dtype=dtype)

    def reset(self):
        """Zero the stack, reset the wrapped env and push its first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        first_obs = self.env.reset()
        return self.observation(first_obs)

    def observation(self, observation):
        # Shift every entry one slot towards the front, then write the newest
        # observation into the last slot. The internal buffer itself is returned.
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


def make_env(env):
    """Wrap `env` with the preprocessing stack and the SIMPLE_MOVEMENT action set.

    Wrappers are applied innermost-first in the order listed below; the
    returned object is the outermost `JoypadSpace`.
    """
    stages = (
        MaxAndSkipEnv,
        ProcessFrame84,
        ImageToPyTorch,
        lambda inner: BufferWrapper(inner, 4),
        ScaledFloatFrame,
        lambda inner: JoypadSpace(inner, SIMPLE_MOVEMENT),
    )
    for wrap in stages:
        env = wrap(env)
    return env

In [4]:
from os import X_OK
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

class DDQNMario:
    """DQN-style agent with an online and a target CNN and a ring-buffer replay memory.

    The online network picks actions (epsilon-greedy) and is trained on
    mini-batches sampled from the replay memory; the target network supplies
    the bootstrap values and is overwritten with the online weights every
    `sync_rate` calls to `choose_action`.

    Args:
        state_space: shape of one state, channels first (e.g. ``(4, 84, 84)``),
            as produced by the `make_env` wrapper stack in this notebook.
        action_space: number of discrete actions.
        mem_size: capacity of the replay memory (number of transitions).
        buffer_sample_size: mini-batch size used by `experience_replay`.
        gamma: discount factor.
        lr: Adam learning rate.
        max_epsilon: initial exploration rate.
        min_epsilon: floor for the exploration rate.
        epsilon_decay: multiplicative decay applied after every training step.
        pretrained: if true, load the saved models and replay memory from the
            current working directory instead of starting from scratch.

    NOTE(review): the bootstrap target uses ``max_a Q_target(s', a)``; a true
    Double-DQN would pick the arg-max with the online network. Confirm which
    variant is intended.
    """

    def __init__(self, state_space, action_space, mem_size, buffer_sample_size, gamma, lr, max_epsilon, min_epsilon, epsilon_decay, pretrained):

        self.pretrained = pretrained

        # Learning hyperparameters
        self.gamma = gamma
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.epsilon = max_epsilon
        self.huber = keras.losses.Huber()

        # Env hyperparameters
        self.step = 0
        self.state_space = state_space
        self.num_actions = action_space

        # Experience replay
        self.mem_size = mem_size
        self.sync_rate = 5000
        self.buffer = 0  # number of valid transitions currently stored
        self.buffer_sample_size = buffer_sample_size

        # CNN network hyperparameters
        self.lr = lr
        self.optimiser = Adam(learning_rate=self.lr, clipnorm=1.0)

        # Init CNN networks
        if self.pretrained:
            # use pre-trained weights for the models
            self.online = load_model('online')
            self.target = load_model('target')
        else:
            self.online = self.create_model()
            self.target = self.create_model()

        # Init arrays for logging
        if self.pretrained:
            # SECURITY: pickle.load executes arbitrary code from the file; only
            # load checkpoints that this notebook wrote itself.
            with open("ending_position.pkl", "rb") as f:
                self.last_position = pickle.load(f)
            with open("buffer.pkl", "rb") as f:
                self.buffer = pickle.load(f)
            with open("state_hist.pkl", "rb") as f:
                self.state_hist = pickle.load(f)
            with open("action_hist.pkl", "rb") as f:
                self.action_hist = pickle.load(f)
            with open("reward_hist.pkl", "rb") as f:
                self.reward_hist = pickle.load(f)
            with open("nstate_hist.pkl", "rb") as f:
                self.next_state_hist = pickle.load(f)
            with open("done_hist.pkl", "rb") as f:
                self.done_hist = pickle.load(f)
        else:
            self.init_log()
            self.last_position = 0

    @staticmethod
    def _as_array(value):
        """Convert a torch tensor, numpy array or Python scalar to a float32 numpy array."""
        if hasattr(value, "detach"):  # torch.Tensor
            value = value.detach().cpu().numpy()
        return np.asarray(value, dtype=np.float32)

    def create_model(self):
        """Build and compile the Q-network: three conv layers, a 512-unit dense layer and a linear Q head."""
        # No fixed batch size: the same network serves single-state action
        # selection and mini-batch training.
        inputs = Input(shape=self.state_space)
        # States arrive channels-first, but Conv2D defaults to channels-last.
        # Without this permute the convolutions would slide over the
        # (frame, row) axes and treat image columns as channels.
        x = keras.layers.Permute((2, 3, 1), name='to_channels_last')(inputs)
        x = Conv2D(32, 8, strides=4, activation='relu', name='conv1', padding='same')(x)
        x = Conv2D(64, 4, strides=2, activation='relu', name='conv2', padding='same')(x)
        x = Conv2D(64, 3, strides=1, activation='relu', name='conv3', padding='same')(x)
        x = Flatten()(x)
        x = Dense(512, activation='relu')(x)
        # Q-values are unbounded regression targets, so the head must be linear
        # (a softmax would confine them to [0, 1] and make them sum to 1).
        action = Dense(self.num_actions, activation='linear', name='action')(x)
        model = tf.keras.Model(inputs=inputs, outputs=action)

        model.compile(optimizer=self.optimiser, loss=self.huber, metrics=["accuracy"])
        return model

    def choose_action(self, state):
        """Return an action index for `state` using the epsilon-greedy policy.

        `state` may be a single state or a batch of one; a missing batch axis
        is added automatically. Also advances the global step counter.
        """
        self.step += 1

        # Exploration
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)

        # Exploitation
        batch = self._as_array(state)
        if batch.ndim == len(self.state_space):
            batch = batch[np.newaxis, ...]
        q_values = self.online(tf.convert_to_tensor(batch), training=False)
        return int(tf.argmax(q_values[0]).numpy())

    def sync_models(self):
        """Copy the online network's weights into the target network."""
        self.target.set_weights(self.online.get_weights())

    def sample_exp(self):
        """Sample `buffer_sample_size` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of numpy arrays
            whose first axis is the batch axis.
        """
        sample_idx = random.choices(range(self.buffer), k=self.buffer_sample_size)

        state = self.state_hist[sample_idx]
        action = self.action_hist[sample_idx]
        reward = self.reward_hist[sample_idx]
        next_state = self.next_state_hist[sample_idx]
        done = self.done_hist[sample_idx]

        return state, action, reward, next_state, done

    def experience_replay(self):
        """Run one gradient step on a sampled mini-batch and decay epsilon.

        Does nothing (apart from the periodic target sync) until the replay
        memory holds at least one full mini-batch.
        """
        if self.step % self.sync_rate == 0:
            self.sync_models()

        if self.buffer < self.buffer_sample_size:
            return

        state_sample, action_sample, reward_sample, next_state_sample, done_sample = self.sample_exp()

        # Flatten the (batch, 1) bookkeeping columns to (batch,) so that every
        # term of the Bellman target has one entry per sampled transition.
        # Casts also cover replay arrays restored from older float64 pickles.
        states = tf.constant(np.asarray(state_sample, dtype=np.float32))
        next_states = tf.constant(np.asarray(next_state_sample, dtype=np.float32))
        rewards = tf.constant(np.asarray(reward_sample, dtype=np.float32).reshape(-1))
        dones = tf.constant(np.asarray(done_sample, dtype=np.float32).reshape(-1))
        actions = np.asarray(action_sample).reshape(-1).astype(np.int32)

        # Max over the ACTION axis (axis=1), one value per transition.
        future_q = self.target(next_states, training=False)
        max_future_q = tf.reduce_max(future_q, axis=1)
        # Terminal transitions have no future: mask the bootstrap term.
        updated_q_values = rewards + self.gamma * max_future_q * (1.0 - dones)

        # One-hot mask selecting the Q-value of the action actually taken.
        masked_actions = tf.one_hot(actions, self.num_actions)

        with tf.GradientTape() as tape:
            # Forward pass
            q_values = self.online(states, training=True)
            q_action = tf.reduce_sum(tf.multiply(q_values, masked_actions), axis=1)

            # Huber loss between the Bellman targets and the predicted Q-values
            loss = self.huber(updated_q_values, q_action)

        # Backpropagation
        grads = tape.gradient(loss, self.online.trainable_variables)
        self.optimiser.apply_gradients(zip(grads, self.online.trainable_variables))

        # Update epsilon
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def init_log(self):
        """Allocate empty (zeroed) replay arrays of capacity `mem_size`."""
        # float32 halves the footprint of the two large state arrays.
        self.state_hist = np.zeros((self.mem_size, *self.state_space), dtype=np.float32)
        self.action_hist = np.zeros((self.mem_size, 1), dtype=np.int64)
        self.reward_hist = np.zeros((self.mem_size, 1), dtype=np.float32)
        self.next_state_hist = np.zeros((self.mem_size, *self.state_space), dtype=np.float32)
        self.done_hist = np.zeros((self.mem_size, 1), dtype=np.float32)

    def log(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory.

        The memory is a ring buffer: once full, the oldest slot is overwritten.
        Inputs may be torch tensors, numpy arrays or scalars; a leading batch
        axis of size one on the states is dropped.
        """
        slot = self.last_position
        state_shape = self.state_hist.shape[1:]
        scalar_shape = self.reward_hist.shape[1:]

        self.state_hist[slot] = self._as_array(state).reshape(state_shape)
        self.action_hist[slot] = int(action)
        self.reward_hist[slot] = self._as_array(reward).reshape(scalar_shape)
        self.next_state_hist[slot] = self._as_array(next_state).reshape(state_shape)
        self.done_hist[slot] = self._as_array(done).reshape(scalar_shape)

        self.last_position = (self.last_position + 1) % self.mem_size
        self.buffer = min(self.buffer + 1, self.mem_size)

In [5]:
def show_state(env, ep=0, info=""):
    """Draw the environment's current RGB frame inline, replacing the previous cell output."""
    plt.figure(3)
    plt.clf()
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    caption = "Episode: %d %s" % (ep, info)
    plt.title(caption)
    plt.axis('off')

    # Clear the old output only once the new one is ready.
    display.clear_output(wait=True)
    current_figure = plt.gcf()
    display.display(current_figure)

In [6]:
def run(training_mode, pretrained):
    """Train or evaluate the Mario agent for a fixed number of episodes.

    Args:
        training_mode: if true, transitions are stored, the agent is trained
            after every step, and the replay memory and both networks are
            written to the working directory at the end. If false, the agent
            only acts and, after the burn-in episodes, each frame is rendered
            inline.
        pretrained: forwarded to `DDQNMario`; if true the agent is restored
            from the files written by a previous training run.
    """
    min_epsilon = 0.02
    # Epsilon only decays inside experience_replay(), which is not called when
    # evaluating. Starting an evaluation run at 1.0 would therefore make the
    # agent act purely at random, so evaluation starts at the exploration floor.
    start_epsilon = 1.0 if training_mode else min_epsilon

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')  # world 1, stage 1, standard ROM
    env = make_env(env)  # frame skipping, greyscale 84x84, 4-frame stack, scaling
    total_rewards = []

    try:
        observation_space = env.observation_space.shape
        action_space = env.action_space.n
        agent = DDQNMario(state_space=observation_space,
                          action_space=action_space,
                          mem_size=30000,
                          buffer_sample_size=32,
                          gamma=0.90,
                          lr=.00025,
                          max_epsilon=start_epsilon,
                          min_epsilon=min_epsilon,
                          epsilon_decay=0.99,
                          pretrained=pretrained)

        # Init
        num_episodes = 100
        # Window of the moving average; at least 1 so the average is defined.
        burnin = max(1, int(num_episodes * 0.05))

        for ep_num in tqdm(range(num_episodes)):
            # torch.tensor on a single ndarray avoids the slow list-of-arrays path.
            state = torch.tensor(np.asarray(env.reset()), dtype=torch.float32).unsqueeze(0)
            total_reward = 0
            while True:
                if ep_num > burnin and not training_mode:  # render while evaluating
                    show_state(env, ep_num)

                action = agent.choose_action(state)

                state_next, reward, terminal, info = env.step(int(action))
                total_reward += reward
                done = bool(terminal)

                state_next = torch.tensor(np.asarray(state_next), dtype=torch.float32).unsqueeze(0)

                if training_mode:
                    reward_t = torch.tensor([reward]).unsqueeze(0)
                    terminal_t = torch.tensor([int(done)]).unsqueeze(0)
                    agent.log(state, action, reward_t, state_next, terminal_t)
                    agent.experience_replay()

                state = state_next
                if done:
                    break

            total_rewards.append(total_reward)

            print("Total reward after episode {} is {}".format(ep_num + 1, total_rewards[-1]))

        if training_mode:
            checkpoints = {
                "ending_position.pkl": agent.last_position,
                "buffer.pkl": agent.buffer,
                "total_rewards.pkl": total_rewards,
                "state_hist.pkl": agent.state_hist,
                "action_hist.pkl": agent.action_hist,
                "reward_hist.pkl": agent.reward_hist,
                "nstate_hist.pkl": agent.next_state_hist,
                "done_hist.pkl": agent.done_hist,
            }
            for filename, payload in checkpoints.items():
                with open(filename, "wb") as f:
                    pickle.dump(payload, f, protocol=4)

            agent.online.save("online")
            agent.target.save("target")
    finally:
        # Release the emulator even if an episode raised.
        env.close()

    if len(total_rewards) >= burnin:
        moving_average = np.convolve(total_rewards, np.ones((burnin,)) / burnin, mode="valid")
        plt.title("Episodes trained vs. Average Rewards")
        # Pad with burnin-1 zeros so the curve has exactly one point per episode.
        plt.plot([0] * (burnin - 1) + moving_average.tolist())
        plt.show()

In [None]:
# Train a fresh agent (no saved models are loaded). When the run finishes it
# writes the replay-memory pickles and the "online"/"target" models to the
# current working directory.
run(training_mode=True, pretrained=False)



In [None]:
# Evaluate: restores the models and replay-memory pickles from the working
# directory (so the training cell must have completed first), performs no
# training, and renders frames inline once past the burn-in episodes.
run(training_mode=False, pretrained=True)