In [1]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as bckend
import tensorflow as tf

In [2]:
class DQNAgent:
    """Deep Q-learning agent with a replay buffer and a separate target network.

    Two networks of identical architecture are kept: ``model`` is the one that
    is trained and used to choose actions, while ``target_model`` provides the
    next-state value estimate inside ``replay`` and only changes when
    ``update_target_model`` copies the weights across.

    States are passed unchanged to ``predict`` / ``fit``; the notebook reshapes
    every observation to ``(1, state_size)`` before handing it to the agent.
    """

    def __init__(self, state_size, action_size):
        """Create the agent for a given observation width and action count."""
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.99   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Start both networks from the same weights.
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 24 units)."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # One linear output per action: the estimated Q-value of that action.
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept to match the TensorFlow version this notebook ran on.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the weights of ``model`` into ``target_model``."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        A uniformly random action is returned when a fresh random draw is
        ``<= epsilon``; otherwise the index of the largest value in the first
        row of ``model.predict(state)`` is returned.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train ``model`` on ``batch_size`` transitions sampled from memory.

        For each sampled transition the target for the taken action is the
        reward, plus (when the transition is not terminal) ``gamma`` times the
        largest next-state value predicted by ``target_model``. The other
        action outputs keep the values ``model`` currently predicts. After the
        batch, epsilon is decayed once but never below ``epsilon_min``.

        Does nothing when fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError on an under-filled buffer.
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q_values = self.target_model.predict(next_state)[0]
                target = reward + self.gamma * np.amax(next_q_values)
            # Only the entry for the action actually taken is overwritten.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Decay only while above the floor, and clamp so the last step cannot
        # undershoot it. An epsilon set below the floor by hand is left alone.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load weights for ``model`` from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Write the weights of ``model`` to the file ``name``."""
        self.model.save_weights(name)

In [3]:
# Watch our agent play before learning
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
random_agent = DQNAgent(state_size, action_size)

import time
from IPython.display import clear_output
for episode in range(3):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    print("====EPISODE ", episode+1, "====\n\n\n\n")
    print("=============================\n")
    time.sleep(1)    
    
    for step in range(200):        
        clear_output(wait=True)
        %matplotlib notebook
        env.render()
        #time.sleep(0.2)
        
        action = random_agent.act(state)       
        new_state, reward, done, info = env.step(action)
        new_state = np.reshape(new_state, [1, state_size])
        state = new_state
env.close()

In [4]:
# Fresh environment and agent for the training run.
env = gym.make('CartPole-v1')
state_size, action_size = env.observation_space.shape[0], env.action_space.n
agent = DQNAgent(state_size, action_size)

EPISODES = 100   # number of training episodes
batch_size = 32  # transitions sampled from memory per agent.replay() call

In [10]:
done = False

for e in range(EPISODES):
    cum_reward = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    # The step counter is called `t`, not `time`, so it does not rebind the
    # `time` module that other cells import and call.
    for t in range(500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        # The step that ends the episode is stored with a -10 penalty instead
        # of the environment's reward.
        reward = reward if not done else -10
        cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            # Sync the target network once per finished episode.
            agent.update_target_model()
            print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, cum_reward, agent.epsilon))
            break
        # Train only once the buffer holds more than one batch of transitions.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

# Persist the trained weights so the playback cell can load them on a fresh
# kernel. NOTE: this overwrites any existing "cartpole-dqn.h5".
agent.save("cartpole-dqn.h5")

episode: 0/100, score: 3.0, e: 1.0
episode: 1/100, score: 2.0, e: 1.0
Instructions for updating:
Use tf.cast instead.
episode: 2/100, score: 12.0, e: 0.92
episode: 3/100, score: 39.0, e: 0.72
episode: 4/100, score: 3.0, e: 0.67
episode: 5/100, score: 2.0, e: 0.63
episode: 6/100, score: 0.0, e: 0.6
episode: 7/100, score: 0.0, e: 0.57
episode: 8/100, score: 22.0, e: 0.49
episode: 9/100, score: 140.0, e: 0.23
episode: 10/100, score: 149.0, e: 0.1
episode: 11/100, score: 121.0, e: 0.054
episode: 12/100, score: 156.0, e: 0.023
episode: 13/100, score: 149.0, e: 0.011
episode: 14/100, score: 129.0, e: 0.01
episode: 15/100, score: 107.0, e: 0.01
episode: 16/100, score: 188.0, e: 0.01
episode: 17/100, score: 144.0, e: 0.01
episode: 18/100, score: 135.0, e: 0.01
episode: 19/100, score: 191.0, e: 0.01
episode: 20/100, score: 155.0, e: 0.01
episode: 21/100, score: 139.0, e: 0.01
episode: 22/100, score: 340.0, e: 0.01
episode: 23/100, score: 200.0, e: 0.01
episode: 24/100, score: 195.0, e: 0.01
epi

In [5]:
# Watch our agent play AFTER learning
import time
from IPython.display import clear_output
agent = DQNAgent(state_size, action_size)
agent.load("cartpole-dqn.h5")
agent.epsilon = 0.0
for episode in range(3):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    print("====EPISODE ", episode+1, "====\n\n\n\n")
    print("=============================\n")
    time.sleep(1)    
    
    for step in range(200):        
        clear_output(wait=True)
        %matplotlib notebook
        env.render()
        #time.sleep(0.2)
        
        action = agent.act(state)       
        new_state, reward, done, info = env.step(action)
        new_state = np.reshape(new_state, [1, state_size])
        
        #print(reward, done)
        if done:
#             clear_output(wait=True)
#             env.render()
            
#             if reward > 500:
#                 print("****Good Job!****")
#                 time.sleep(3)
#             else:
#                 print("****Learn Better!****")
#                 time.sleep(3)
                
#             clear_output(wait=True)
            break
        state = new_state
env.close()


**Code adapted from: Deep Q-Learning with Keras and Gym https://keon.io/deep-q-learning/**