In [1]:
import numpy as np
import random
from __future__ import division
from keras.models import Sequential
from keras.layers import Dense, Input, Lambda, Dropout
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import keras.backend as K
from collections import deque
import h5py
import os
import gym
import cv2
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Number of training episodes run by the training loop below.
EPISODES = 10

class DQNAgent:
    """Deep Q-learning agent with replay memory and a separate target network.

    The agent keeps two identically structured Keras models: ``model`` is
    trained on replayed transitions, ``target_model`` supplies the bootstrap
    value for non-terminal transitions and is refreshed only when
    ``update_target_model`` is called.

    States are handed to ``model.predict`` / ``model.fit`` unchanged, and only
    row 0 of the prediction is used.
    NOTE(review): this presumably requires callers to pass states shaped
    ``(1, n_features)`` -- confirm against the training loop.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.99,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.95,
                 learning_rate=0.5):
        """Create the agent and build both networks.

        Args:
            state_size: Stored on the instance; not used by the network
                definition, which declares no input shape.
            action_size: Number of discrete actions; also the width of the
                network's output layer.
            memory_size: Maximum number of transitions kept for replay.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of choosing a random action.
            epsilon_min: Lower bound for ``epsilon``.
            epsilon_decay: Factor ``epsilon`` is multiplied by after each replay.
            learning_rate: Step size passed to the Adam optimizer.
                NOTE(review): the default of 0.5 is kept for backward
                compatibility but is far above Adam's usual 1e-3 -- confirm
                it is intended.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = self._build_model()
        self.target_model = self._build_model()
        # NOTE(review): the layers declare no input shape, so the models may
        # have no weights yet at this point and this copy may be a no-op --
        # confirm; the next update_target_model() call after a predict/fit
        # performs the real sync.
        self.update_target_model()

    # Loss function
    def _huber_loss(self, target, prediction):
        """Return the pseudo-Huber loss (delta = 1) averaged over the last axis.

        Computes ``sqrt(1 + error**2) - 1``, a smooth loss that is roughly
        quadratic for small errors and roughly linear for large ones.
        """
        error = prediction - target
        return K.mean(K.sqrt(1 + K.square(error)) - 1, axis=-1)

    # Model architecture
    def _build_model(self):
        """Build and compile the Q-network.

        The final layer has one linear unit per action so that
        ``argmax`` over a prediction row is always a valid action index and
        negative Q-values (e.g. from negative rewards) can be represented.
        """
        model = Sequential()
        model.add(Dense(24, activation='relu'))
        model.add(Dense(96, activation='tanh'))
        model.add(Dropout(0.1))
        model.add(Dense(72, activation='softmax'))
        model.add(Dropout(0.2))
        model.add(Dense(48, activation='linear'))
        # One Q-value per action; linear so the estimate is unbounded.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest value in row 0 of the
        online network's prediction.

        Returns:
            int: the chosen action index.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        # Plain int so the result can be used directly as an index/action id.
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train the online network on a random minibatch of stored transitions.

        Each sampled transition is fitted individually: the target for the
        taken action is the reward for terminal transitions, otherwise the
        reward plus ``gamma`` times the target network's best next-state value.
        After the batch, ``epsilon`` is decayed but never below ``epsilon_min``.

        Does nothing when fewer than ``batch_size`` transitions are stored
        (``random.sample`` would otherwise raise ``ValueError``).
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                next_q = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(next_q)
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load online-network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to the file ``name``."""
        self.model.save_weights(name)


In [None]:
if __name__ == "__main__":
    # Training script: plays EPISODES episodes of Super Mario Bros with an
    # epsilon-greedy DQN agent, learning from replayed transitions.

    # Side length of the square grayscale frame fed to the network.
    FRAME_SIDE = 84
    # Upper bound on environment steps per episode.
    MAX_STEPS = 250

    def preprocess(frame):
        """Turn a raw colour frame into a (1, FRAME_SIDE * FRAME_SIDE) float row.

        The dense network needs a flat feature vector with a leading batch
        axis, so the frame is converted to grayscale, downscaled, flattened
        and scaled to [0, 1].
        """
        # NOTE(review): assumes the environment yields RGB uint8 frames --
        # confirm against nes_py; for BGR frames use COLOR_BGR2GRAY.
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        small = cv2.resize(gray, (FRAME_SIDE, FRAME_SIDE),
                           interpolation=cv2.INTER_AREA)
        return small.reshape(1, -1).astype(np.float32) / 255.0

    env = gym.make("SuperMarioBros-v2")
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    # Number of features per preprocessed state (not the raw frame width).
    state_size = FRAME_SIDE * FRAME_SIDE
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    batch_size = 24

    try:
        for e in range(EPISODES):
            state = preprocess(env.reset())
            for step in range(MAX_STEPS):
                env.render()
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                # Penalise the transition that ends the episode.
                # NOTE(review): `done` may also signal level completion --
                # confirm that -50 is wanted in that case.
                reward = reward if not done else -50
                next_state = preprocess(next_state)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                print("episode: {}/{}, score: {}, e: {:.2}, r: {}".format(e, EPISODES, step, agent.epsilon, reward))
                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)
                if done:
                    # Do not keep stepping an environment whose episode ended.
                    break
            # Refresh the target network once per episode; syncing on every
            # step would make it identical to the online network.
            agent.update_target_model()
    finally:
        # Always release the emulator and its render window.
        env.close()
    

episode: 0/10, score: 0, e: 1.0, r: 0
episode: 0/10, score: 1, e: 1.0, r: 1
episode: 0/10, score: 2, e: 1.0, r: 2
episode: 0/10, score: 3, e: 1.0, r: 3
episode: 0/10, score: 4, e: 1.0, r: 2
episode: 0/10, score: 5, e: 1.0, r: 5
episode: 0/10, score: 6, e: 1.0, r: 5
episode: 0/10, score: 7, e: 1.0, r: 4
episode: 0/10, score: 8, e: 1.0, r: 2
episode: 0/10, score: 9, e: 1.0, r: 0
episode: 0/10, score: 10, e: 1.0, r: 2
episode: 0/10, score: 11, e: 1.0, r: 4
episode: 0/10, score: 12, e: 1.0, r: 6
episode: 0/10, score: 13, e: 1.0, r: 7
episode: 0/10, score: 14, e: 1.0, r: 6
episode: 0/10, score: 15, e: 1.0, r: 7
episode: 0/10, score: 16, e: 1.0, r: 5
episode: 0/10, score: 17, e: 1.0, r: 4
episode: 0/10, score: 18, e: 1.0, r: 6
episode: 0/10, score: 19, e: 1.0, r: 5
episode: 0/10, score: 20, e: 1.0, r: 5
episode: 0/10, score: 21, e: 1.0, r: 6
episode: 0/10, score: 22, e: 1.0, r: 6
episode: 0/10, score: 23, e: 1.0, r: 7
episode: 0/10, score: 24, e: 1.0, r: 6
episode: 0/10, score: 25, e: 0.95, 