In [22]:
import numpy as np
from collections import deque
from matplotlib import pyplot as plt
from keras.layers import Dense, Input 
from keras.models import Model
from keras.optimizers import SGD, Adam, Adadelta
import random
import gym

In [23]:
# Environment and hyperparameters shared by the cells below.
env = gym.make('CartPole-v0')   # create the CartPole-v0 environment through gym
# Raise the per-episode step cap to 10001 (the original note says the default limit is 200 steps).
# NOTE(review): underscore-prefixed attribute, presumably gym-internal; confirm it exists in the installed gym version.
env._max_episode_steps = 10001   # new maximum number of steps per episode

input_size = env.observation_space.shape[0]   # length of the observation (state) vector
output_size = env.action_space.n   # number of discrete actions
h_size = 20   # units in each hidden layer of the DQN

dis = 0.95   # discount factor applied to the target network's best next-state Q-value
REPLAY_MEMORY = 50000   # maximum number of transitions kept in the replay buffer

  result = entry_point.load(False)


In [24]:
class DQN:
    """Fully connected Q-value network built with the Keras functional API.

    The network maps a state vector of length ``input_size`` to one value per
    action (``output_size`` outputs) through two ``tanh`` hidden layers of
    ``h_size`` units each. The compiled Keras model is exposed as ``self.model``.
    """

    def __init__(self, input_size, h_size, output_size, name="main"):
        """Store the layer sizes and the network name, then build the model."""
        self.net_name = name
        self.h_size = h_size
        self.output_size = output_size
        self.input_size = input_size
        self._build_network()

    def _build_network(self, l_rate=0.1):
        """Create and compile the Keras model.

        Args:
            l_rate: learning rate handed to the Adadelta optimizer.
        """
        inputs = Input(shape=(self.input_size,))

        # Two identical tanh hidden layers, chained one after the other.
        hidden = inputs
        for _ in range(2):
            hidden = Dense(self.h_size, activation='tanh')(hidden)

        # Output layer: one unit per action, no activation specified.
        q_values = Dense(self.output_size)(hidden)

        self._X = inputs
        self._Qpred = q_values

        network = Model(inputs, q_values)
        network.compile(optimizer=Adadelta(lr=l_rate), loss='mse')
        self.model = network

    def predict(self, state):
        """Return the model's prediction for a single state.

        The state is reshaped to a batch of one row with ``input_size``
        columns before it is passed to the model.
        """
        single_row_batch = np.reshape(state, (1, self.input_size))
        return self.model.predict(single_row_batch)

    def update(self, x_stack, y_stack):
        """Run one ``train_on_batch`` step on the given inputs/targets and return its result."""
        batch_loss = self.model.train_on_batch(x_stack, y_stack)
        return batch_loss

In [25]:
def replay_train(mainDQN, targetDQN, train_batch, discount=None):
    """Train the main network on one minibatch of replayed transitions.

    For every transition the main network's current Q-values for ``state`` are
    used as the regression target, except for the entry of the action that was
    actually taken, which is overwritten with the TD target:

    * ``reward`` when the transition ended the episode, otherwise
    * ``reward + discount * max(targetDQN.predict(next_state))``.

    Args:
        mainDQN: network being trained; must provide ``predict``, ``update``,
            ``input_size`` and ``output_size``.
        targetDQN: network used to evaluate ``next_state``; must provide ``predict``.
        train_batch: iterable of ``(state, action, reward, next_state, done)`` tuples.
        discount: discount factor; when ``None`` the module-level ``dis`` is used.

    Returns:
        Whatever ``mainDQN.update`` returns for the stacked inputs and targets.
    """
    if discount is None:
        discount = dis   # fall back to the notebook-wide discount factor

    states = []
    targets = []
    for state, action, reward, next_state, done in train_batch:
        q_row = mainDQN.predict(state)   # current estimates, one row

        if done:
            # Terminal transition: there is no future value to bootstrap from.
            q_row[0, action] = reward
        else:
            # Bootstrap from the target network's best next-state value.
            q_row[0, action] = reward + discount * np.max(targetDQN.predict(next_state))

        states.append(state)
        targets.append(q_row)

    # Stack once at the end instead of re-allocating both arrays on every
    # iteration. The leading empty array fixes the column count, so an empty
    # batch still yields arrays with zero rows and the right number of columns.
    x_stack = np.vstack([np.empty((0, mainDQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, mainDQN.output_size))] + targets)

    return mainDQN.update(x_stack, y_stack)

In [26]:
def bot_play(mainDQN):
    """Play one rendered episode in ``env`` by always taking the highest-valued action.

    Each step renders the environment, asks ``mainDQN`` for its prediction on
    the current observation and steps with the argmax action. The summed
    reward is printed once the environment reports the episode as done.
    """
    observation = env.reset()
    total_reward = 0
    finished = False
    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward
    print ("Total score {}". format(total_reward))

In [27]:
def main():
    """Train a DQN agent on ``env`` with experience replay and a target network.

    Every episode:
      1. Acts epsilon-greedily with the main network (epsilon decays as
         ``1 / (episode / 10 + 1)``) and stores each transition in a bounded
         replay buffer. The reward of the step that ends the episode is
         replaced by -500.
      2. Prints the episode length and runs one rendered greedy episode via
         ``bot_play``.
      3. On episodes where ``episode % 10 == 1``, trains the main network on
         50 random minibatches, prints the last loss and copies the main
         network's weights into the target network.
    """
    max_episodes = 5000
    batch_size = 10        # transitions per replay minibatch
    train_rounds = 50      # minibatches per training phase
    train_period = 10      # training happens when episode % train_period == 1
    terminal_reward = -500

    # A bounded deque discards the oldest transition automatically once
    # REPLAY_MEMORY entries are stored.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    mainDQN = DQN(input_size, h_size, output_size)   # network being trained
    targetDQN = DQN(input_size, h_size, output_size, name="target")   # frozen copy used for TD targets

    # Start the target network as an exact copy of the main network so the
    # first training phase does not bootstrap from unrelated random weights.
    targetDQN.model.set_weights(mainDQN.model.get_weights())

    for episode in range(max_episodes):
        e = 1. / ((episode / 10) + 1)   # exploration probability for this episode
        done = False
        step_count = 0
        state = env.reset()
        while not done:
            if np.random.rand(1) < e:
                action = env.action_space.sample()   # explore
            else:
                action = np.argmax(mainDQN.predict(state))   # exploit

            next_state, reward, done, _ = env.step(action)
            if done:
                # NOTE(review): this also penalises episodes that end only
                # because the step cap was reached; confirm that is intended.
                reward = terminal_reward

            replay_buffer.append((state, action, reward, next_state, done))

            state = next_state
            step_count += 1

        print ("Episode: {} steps {}".format(episode, step_count))
        bot_play(mainDQN)

        if episode % train_period == 1:
            # Never ask for more samples than the buffer holds
            # (random.sample raises ValueError otherwise).
            sample_size = min(batch_size, len(replay_buffer))
            for _ in range(train_rounds):
                minibatch = random.sample(replay_buffer, sample_size)
                loss = replay_train(mainDQN, targetDQN, minibatch)
            print ("Loss :" , loss)
            # Refresh the target network with the newly trained weights.
            targetDQN.model.set_weights(mainDQN.model.get_weights())

In [28]:
# Run the training loop; it prints per-episode step counts, bot_play scores and periodic losses.
main()

Episode: 0 steps 22
Total score 10.0
Episode: 1 steps 25
Total score 13.0
Loss : 0.8872568
Episode: 2 steps 13
Total score 9.0
Episode: 3 steps 17
Total score 12.0
Episode: 4 steps 17
Total score 12.0
Episode: 5 steps 13
Total score 12.0
Episode: 6 steps 11
Total score 10.0
Episode: 7 steps 16
Total score 10.0
Episode: 8 steps 11
Total score 11.0
Episode: 9 steps 20
Total score 10.0
Episode: 10 steps 16
Total score 12.0
Episode: 11 steps 11
Total score 13.0
Loss : 0.7092656
Episode: 12 steps 12
Total score 22.0
Episode: 13 steps 31
Total score 19.0
Episode: 14 steps 14
Total score 16.0
Episode: 15 steps 12
Total score 23.0
Episode: 16 steps 20
Total score 24.0
Episode: 17 steps 21
Total score 20.0
Episode: 18 steps 19
Total score 14.0
Episode: 19 steps 23
Total score 20.0
Episode: 20 steps 16
Total score 16.0
Episode: 21 steps 15
Total score 26.0
Loss : 24953.396
Episode: 22 steps 16
Total score 16.0
Episode: 23 steps 21
Total score 27.0
Episode: 24 steps 15
Total score 14.0
Episode: 2

KeyboardInterrupt: 

In [None]:
# No variable named `model` exists at notebook scope (the networks created in
# main() are local to it), so build a DQN here and print the layer summary of
# its underlying Keras model.
DQN(input_size, h_size, output_size).model.summary()

In [None]:
# Build a standalone network at notebook scope (main() keeps its own networks local).
mainDQN = DQN(input_size, h_size, output_size)