In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style="dark")

Using TensorFlow backend.


### Environment

**STATE**

* Cart Position
* Cart Velocity
* Pole Angle
* Pole Velocity at tip

**ACTIONS**
* 0 - LEFT
* 1 - RIGHT

**REWARD**

* 1 for every survived step

### Build Agent

In [2]:
def build_Deep_Q_Agent():
    model = Sequential()
    model.add(Dense(32, input_dim=state_size, activation='relu'))
    model.add(Dense(8, input_dim=state_size, activation='relu'))
    model.add(Dense(action_size, activation='linear')) # LEFT OR RIGHT
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
    return model

### Perform action from given state

In [3]:
def perform_action(model, state, epsilon):
    action = -1
    
    if np.random.rand() <= epsilon:
        #print("Perform random action!")
        action = random.randrange(2)
    else:
        #print("Perform predicted action from current state!")
        # Predict the most likely reward value from the given state
        action = model.predict(state)
        # Perform the action based on the predicted reward
        action = np.argmax(action[0]) # [0.74, 0.2] LEFT / RIGHT
        
    return action

### Remember transition

In [4]:
def remember(state, action, reward, next_state, done):
    #print("Remember state, action performed, reward received, etc")
    memory.append((state, action, reward, next_state, done))

### Train agent to maximize future reward

In [5]:
def replay(batch_size, epsilon, done):
    
    # Get a minibatch of previous training memories
    minibatch = random.sample(memory, batch_size)
    
    #print("Replay: Train model on sample of previous memories")

    # For each memory
    for state, action, reward, next_state, done in minibatch:
        
        if not done:
            # Predict future disconted reward if not a terminating step
            target = (reward + gamma * np.amax(agent.predict(next_state)[0]))
        else:
            # Terminating step, no prediction needed
            target = reward
        
        # Map an approximation between current state to the future discounted reward
        target_f = agent.predict(state)
        target_f[0][action] = target
        
        # Train model to find map function from state to target
        # This will maximize the future reward based on current state
        agent.fit(state, target_f, epochs=1, verbose=0)
    
    # Decrease exploration successively
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    
    return epsilon

### Load and Save

In [6]:
def load(model, name):
    return model.load_weights(name)

def save(model, name):
    model.save_weights(name)

### Params

In [7]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n # left, right
memory = deque(maxlen=10000)
gamma = 0.95    # discount rate
epsilon = 0.5  # exploration rate
epsilon_min = 0.01 # min exploration rate
epsilon_decay = 0.995 # decrease exploration successively
learning_rate = 0.0005
episodes = 1000
batch_size = 25
max_score = 0
max_score_ep = 0

## Run

In [8]:
#sns.lineplot(range(len(results)), results)

In [None]:
agent = build_Deep_Q_Agent()
#agent = load(agent, 'weights/cartpole-dqn.h5')
#epsilon = 0.01
done = False
results = []

for episode in range(episodes):

    state = env.reset()
    state = np.reshape(state, [1, state_size])

    for frame in range(episodes):
        env.render()
        
        # ---------------------------------------------------------
        # Select action - combine prediction and randomness
        # ---------------------------------------------------------
        
        action = perform_action(agent, state, epsilon)
        next_state, reward, done, info = env.step(action)
        if done: reward = -1
            
        # ---------------------------------------------------------
        # Move to next state and save transition in memory
        # ---------------------------------------------------------
            
        next_state = np.reshape(next_state, [1, state_size])
        remember(state, action, reward, next_state, done) 
        state = next_state

        if done:
            print("\nEpisode: {}/{}, Score: {}, Epsilon: {:.3}, Top score: {}".format(episode, episodes, frame, float(epsilon), max_score))
            
            results.append(frame)
            
            if episode % 25 == 0:
                print("Average score: ", sum(results[-25:]) / 25.0)

            if frame > max_score:
                max_score = frame
                max_score_ep = episode
            
            break
            
        # ---------------------------------------------------------
        # Sample random minibatches from memory and train model to 
        # maximize future reward from current state
        # ---------------------------------------------------------

        if len(memory) > batch_size:
            epsilon = replay(batch_size, epsilon, done)

    if episode % 50 == 0:
        print("Save weights!")
        save(agent, "weights/cartpole-dqn-" + str(episode) + ".h5")


Episode: 0/1000, Score: 34, Epsilon: 0.422, Top score: 0
Average score:  1.36
Save weights!

Episode: 1/1000, Score: 19, Epsilon: 0.383, Top score: 34

Episode: 2/1000, Score: 9, Epsilon: 0.366, Top score: 34

Episode: 3/1000, Score: 10, Epsilon: 0.349, Top score: 34

Episode: 4/1000, Score: 14, Epsilon: 0.325, Top score: 34

Episode: 5/1000, Score: 9, Epsilon: 0.311, Top score: 34

Episode: 6/1000, Score: 10, Epsilon: 0.295, Top score: 34

Episode: 7/1000, Score: 10, Epsilon: 0.281, Top score: 34

Episode: 8/1000, Score: 9, Epsilon: 0.269, Top score: 34

Episode: 9/1000, Score: 11, Epsilon: 0.254, Top score: 34

Episode: 10/1000, Score: 16, Epsilon: 0.235, Top score: 34

Episode: 11/1000, Score: 13, Epsilon: 0.22, Top score: 34

Episode: 12/1000, Score: 8, Epsilon: 0.211, Top score: 34

Episode: 13/1000, Score: 14, Epsilon: 0.197, Top score: 34

Episode: 14/1000, Score: 9, Epsilon: 0.188, Top score: 34

Episode: 15/1000, Score: 8, Epsilon: 0.181, Top score: 34

Episode: 16/1000, Scor