In [11]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from pyboy.pyboy import *
from MarioAISettings import MarioAI
from CustomPyBoyGym import CustomPyBoyGym
from wrappers import SkipFrame, ResizeObservation
from gym.wrappers import FrameStack, NormalizeObservation

# The environment

In [18]:
gameDimentions = (20, 16)
frameStack = 4

pyboy = PyBoy("mario.gb", window_type="SDL2", window_scale=3, debug=False, game_wrapper=True)
aiSettings = MarioAI()
env = CustomPyBoyGym(pyboy, observation_type="tiles")
env.setAISettings(aiSettings)  # use this settings
env = SkipFrame(env, skip=4)
env = ResizeObservation(env, gameDimentions)  # transform MultiDiscreate to Box for framestack
env = NormalizeObservation(env)  # normalize the values
env = FrameStack(env, num_stack=frameStack)

pyboy.set_emulation_speed(0)

observation = env.reset()
filteredActions = aiSettings.GetActions()  # get possible actions

# The model

In [19]:
model = tf.keras.Sequential()

cnn = tf.keras.Sequential()
cnn.add(layers.Conv2D(20, (5,5), strides=(1,1), padding='same', activation='relu'))
cnn.add(layers.Conv2D(15, (4,4), strides=(1,1), padding='same', activation='relu'))
cnn.add(layers.Conv2D(10, (3,3), strides=(1,1), padding='same', activation='relu'))
cnn.add(layers.Flatten())
cnn.add(layers.Reshape((1, 200)))

rnn = tf.keras.Sequential()
rnn.add(layers.LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.3, stateful=True, batch_input_shape=(1, 1, 200)))

dense = tf.keras.Sequential()
dense.add(layers.Dense(len(filteredActions), activation='relu'))

main_input = layers.Input(shape=(1, 20, 16))

model = cnn(main_input)
model = rnn(model)
model = dense(model)



In [20]:
DRQL = tf.keras.Model(inputs=main_input, outputs=model)

In [21]:
DRQL.compile(loss="mean_squared_error")

In [22]:
DRQL.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1, 20, 16)]       0         
                                                                 
 sequential_13 (Sequential)  (None, 1, 200)            14195     
                                                                 
 sequential_14 (Sequential)  (1, 64)                   67840     
                                                                 
 sequential_15 (Sequential)  (1, 5)                    325       
                                                                 
Total params: 82,360
Trainable params: 82,360
Non-trainable params: 0
_________________________________________________________________


# The Q Learning algorithm

In [23]:
alpha = 0.001
gamma = 0.99

In [24]:
samples = []
targets = []
actions = None
state, reward, done, info = None, None, None, None

for i in range(100):
    q_values = None
    if actions is None:
        actions = filteredActions[0]
    else:
        parsed_state = np.array([[state._frames[0]]])
        q_values = DRQL.predict(parsed_state)[0]
        action_id = np.argmax(q_values)
        actions = filteredActions[action_id]
        
    state, reward, done, info = env.step(actions)
    
    new_parsed_state = np.array([[state._frames[0]]])
    new_predictions = DRQL.predict(new_parsed_state)[0]
    new_action = np.argmax(new_predictions)
    
    if i == 0:
        continue
        
    print(len(q_values), len(new_predictions))
    q_values[action_id] += alpha * (reward + gamma * new_predictions[new_action] - q_values[action_id])
    
    samples.append(np.array([parsed_state]))
    targets.append(np.array([q_values]))
    
    if len(samples) == 10:
        DRQL.fit(parsed_state, np.array([q_values]), epochs=1)
        samples = []
        targets = []

5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5
5 5


I could manage to send the frames one by one into the LSTM cell by using the stateful parameter

The problem is I didn't manage to send the sequences of states 10 by 10 during training...