In [2]:
import numpy as np
from collections import deque
from matplotlib import pyplot as plt
from keras.layers import Dense, Input, Reshape
from keras.models import Model
from keras.optimizers import SGD, Adam
import gym

env = gym.make('Breakout-v0')

Using TensorFlow backend.
  result = entry_point.load(False)


In [3]:
# Constants defining our neural network
learning_rate = 1e-1
input_size = env.observation_space.shape[0]
output_size = env.action_space.n


X = Input(shape=env.observation_space.shape)
h = Reshape((100800,))(X)
h = Dense(input_size, activation='relu')(h)
Qpred = Dense(output_size, activation='sigmoid')(h)

model =Model(X,Qpred)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 210, 160, 3)       0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 210)               21168210  
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 844       
Total params: 21,169,054
Trainable params: 21,169,054
Non-trainable params: 0
_________________________________________________________________


In [3]:
model.compile(optimizer = Adam(lr = learning_rate), loss ='mse')

max_episodes = 500
dis = 0.9
step_history = []

for episode in range(max_episodes):
    e = 1. / ((episode / 10) + 1)
    step_count = 0
    state = env.reset()
    done = False

    # The Q-Network training
    while not done:
        step_count += 1
        x = np.expand_dims(state, 0)
        # Choose an action by greedily (with e chance of random action) from
        # the Q-network
        Q = model.predict(x)
        
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q)

        # Get new state and reward from environment
        next_state, reward, done, _ = env.step(action)
        if done:
            Q[0, action] = -100
        else:
            x_next = np.expand_dims(state, 0)
            # Obtain the Q' values by feeding the new state through our network
            Q_next =model.predict(x_next)
            Q[0, action] = reward + dis * np.max(Q_next)

        # Train our network using target and predicted Q values on each episode
        model.fit(x,Q,verbose = 0)
        state = next_state

    step_history.append(step_count)
    print("Episode: {}  steps: {} score: {}".format(episode, step_count, reward))
    # If last 10's avg steps are 500, it's good enough
    if len(step_history) > 10 and np.mean(step_history[-10:]) > 500:
        break

# See our trained network in action
observation = env.reset()
reward_sum = 0
while True:
    env.render()

    x =  np.expand_dims(observation, 0)
    Q = model.predict(x)
    action = np.argmax(Q)

    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    #print("score: {}".format(reward))
    if done:
        print("Total score: {}".format(reward_sum))
        break

Episode: 0  steps: 260 score: 0.0
Episode: 1  steps: 232 score: 0.0
Episode: 2  steps: 249 score: 0.0
Episode: 3  steps: 174 score: 0.0
Episode: 4  steps: 274 score: 0.0
Episode: 5  steps: 298 score: 0.0
Episode: 6  steps: 235 score: 0.0
Episode: 7  steps: 175 score: 0.0
Episode: 8  steps: 184 score: 0.0
Episode: 9  steps: 227 score: 0.0
Episode: 10  steps: 195 score: 0.0
Episode: 11  steps: 235 score: 0.0
Episode: 12  steps: 176 score: 0.0
Episode: 13  steps: 196 score: 0.0
Episode: 14  steps: 220 score: 0.0
Episode: 15  steps: 180 score: 0.0
Episode: 16  steps: 251 score: 0.0
Episode: 17  steps: 250 score: 0.0
Episode: 18  steps: 318 score: 0.0
Episode: 19  steps: 343 score: 0.0
Episode: 20  steps: 193 score: 0.0
Episode: 21  steps: 311 score: 0.0
Episode: 22  steps: 212 score: 0.0
Episode: 23  steps: 241 score: 0.0
Episode: 24  steps: 339 score: 0.0
Episode: 25  steps: 187 score: 0.0
Episode: 26  steps: 459 score: 0.0
Episode: 27  steps: 326 score: 0.0
Episode: 28  steps: 412 score:

In [None]:
plt.plot(reward)