# DQL Cartpole

## FIRST VERSION: With Keras

### Step 0: Import the dependencies and create the environement

In [1]:
import gym
import keras
import numpy as np
import random

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from collections import deque

env = gym.make("CartPole-v1")

Using TensorFlow backend.
[2018-01-11 20:00:34,919] Making new env: CartPole-v1


### Step 1: Set up the hyperparameters

In [2]:
num_episodes = 1000
num_steps = 500

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995 #we want to decrease the number of explorations as it gets good at playing games.
gamma = 0.95 # Discount rate
lr = 0.001
done = False
batch_size = 32



### Step 2: Construct the Q Neural Network

In [3]:
class QNetwork:
    def __init__(self, input_dim, output_dim, lr, gamma, epsilon, epsilon_min, epsilon_decay):
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.memory = deque(maxlen=2000)
        
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        
        self.lr = lr
        self.model = self.build_model()
        
    def build_model(self):
        model = Sequential()
        
        model.add(Dense(24, 
                        input_dim = self.input_dim, # 4 states
                       activation = "relu")) 
        
        model.add(Dense(24, 
                        
                       activation = "relu")) 
        
        model.add(Dense(self.output_dim, 
                        activation = "linear")) # Action size
        
        model.compile(loss="mse",
                     optimizer = Adam(lr=self.lr))
        
        model.summary()
        
        return model
    
    def remember(self, state, action, reward, next_state, done):
        # Save the experience in memory
        self.memory.append((state, action, reward, next_state, done))
        
    # Our agent will randomly select its action at first by a certain percentage, called ‘exploration rate’ or ‘epsilon’. 
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            # Random action
            # Exploration
            return random.randrange(self.output_dim)    
        
        # Non random
        # Exploitation
        # Predict the reward on being at that state
        action_reward = self.model.predict(state)
            
        # Pick the action based on the predicted reward
        return np.argmax(action_reward[0])
    

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
    
    
    
    
    def replay(self, batch_size):
        # Sample some examples of the memory and call them minibatch
        minibatch = random.sample(self.memory, batch_size)
        
        # Extract information for each memory
        for state, action, reward, next_state, done in minibatch:
            
            # If done, make our target reward
            target = reward
                
            if not done:
                # Predict the future discounted reward
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            
            # make the agent to approximately map
            # the current state to future discounted reward
            # We'll call that target_f
            target_f = self.model.predict(state)
            target_f[0][action] = target
                    
            # Train the Neural Net with the state and target_f
            self.model.fit(state, target_f, epochs=1, verbose=0)
        
        # Lower epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

### Step 3: Train the Q Neural Network

In [4]:
if __name__ == "__main__":
    # Init the agent
    agent = QNetwork(input_dim, output_dim, lr, gamma, epsilon, epsilon_min, epsilon_decay)
    done = False
    # Iterate the game
    for episode in range(num_episodes):
        # Restart our game
        state = env.reset()
        state = np.reshape(state, [1, input_dim])
        
        # For each frame of the game
        for step in range(num_steps):
            
            # Decide action
            action = agent.act(state)
            
            # Perform the action
            next_state, reward, done, info = env.step(action)
            
            reward = reward if not done else -10
            
            next_state = np.reshape(next_state, [1, input_dim])
            
            # Remember the previous state action, reward and done
            agent.remember(state, action, reward, next_state, done)
            
            # Make next state the current state
            state = next_state
            
            # done becomes True when the game ends
            # ex) The agent drops the pole
            if done:
                # print the score and break out of the loop
                print("episode: {}/{}, score: {}, e: {:2}"
                      .format(episode, num_episodes, step, agent.epsilon))
                break
        
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)       

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770.0
Trainable params: 770
Non-trainable params: 0.0
_________________________________________________________________
episode: 0/1000, score: 35, e: 1.0
episode: 1/1000, score: 32, e: 0.995
episode: 2/1000, score: 23, e: 0.990025
episode: 3/1000, score: 12, e: 0.985074875
episode: 4/1000, score: 31, e: 0.9801495006250001
episode: 5/1000, score: 48, e: 0.9752487531218751
episode: 6/1000, score: 14, e: 0.9703725093562657
episode: 7/1000, score: 21, e: 0.9655206468094844
episode: 8/1000, score: 19, e: 0.960693043575437


### Step 4: Make our agent play the game

In [5]:
env.reset()
num_steps = 1000

for episode in range(200):
    state = env.reset()
    
    done = False

    for step in range(num_steps):
        env.render()
        state = np.reshape(state, [1, input_dim])
        # Take the action (index) that have the maximum expected future reward given that state
        action = agent.act(state)
        
        new_state, reward, done, info = env.step(action)
    
        if done:
            break
        else:
            state = new_state
env.close()
    

AttributeError: 'NoneType' object has no attribute 'flip'