# Cartpole DQN

Deep Q-Learning Network with Keras and OpenAI Gym, based on [Keon Kim's code](https://github.com/keon/deep-q-learning/blob/master/dqn.py).

#### Select processing devices

In [1]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
!pip install keras

Collecting keras
  Using cached https://files.pythonhosted.org/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl
Collecting pyyaml (from keras)
Installing collected packages: pyyaml, keras
Successfully installed keras-2.2.4 pyyaml-5.1


#### Import dependencies

In [23]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os # for creating directories

#### Set parameters

In [6]:
env = gym.make('CartPole-v0') # initialise environment

In [7]:
state_size = env.observation_space.shape[0]
state_size

4

In [8]:
action_size = env.action_space.n
action_size

2

In [9]:
batch_size = 32

In [10]:
n_episodes = 1001 # n games we want agent to play (default 1001)

In [11]:
output_dir = 'model_output/cartpole/'

In [12]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#### Define agent

In [24]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # double-ended queue; acts like list, but elements can be added/removed from either end
        self.gamma = 0.95 # decay or discount rate: enables agent to take into account future actions in addition to the immediate ones, but discounted at this rate
        self.epsilon = 1.0 # exploration rate: how much to act randomly; more initially than later due to epsilon decay
        self.epsilon_decay = 0.995 # decrease number of random explorations as the agent's performance (hopefully) improves over time
        self.epsilon_min = 0.01 # minimum amount of random exploration permitted
        self.learning_rate = 0.001 # rate at which NN adjusts models parameters via SGD to reduce cost 
        self.model = self._build_model() # private method 
    
    def _build_model(self):
        # neural net to approximate Q-value function:
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu')) # 1st hidden layer; states as input
        model.add(Dense(24, activation='relu')) # 2nd hidden layer
        model.add(Dense(self.action_size, activation='linear')) # 2 actions, so 2 output neurons: 0 and 1 (L/R)
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later

    def act(self, state):
        if np.random.rand() <= self.epsilon: # if acting randomly, take random action
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        print("action 0"# if not acting randomly, predict reward value based on current state
        print(act_values[0])
        return np.argmax(act_values[0]) # pick the action that will give the highest reward (i.e., go left or right?)

    def replay(self, batch_size): # method that trains NN with experiences sampled from memory
        minibatch = random.sample(self.memory, batch_size) # sample a minibatch from memory
        for state, action, reward, next_state, done in minibatch: # extract data for each minibatch sample
            target = reward # if done (boolean whether game ended or not, i.e., whether final state or not), then target = reward
            if not done: # if not done, then predict future discounted reward
                target = (reward + self.gamma * # (target) = reward + (discount rate gamma) * 
                          np.amax(self.model.predict(next_state)[0])) # (maximum target Q based on future action a')
            target_f = self.model.predict(state) # approximately map current state to future discounted reward
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0) # single epoch of training with x=state, y=target_f; fit decreases loss btwn target_f and y_hat
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

#### Interact with environment

In [31]:
agent = DQNAgent(state_size, action_size) # initialise agent

In [32]:
done = False
for e in range(n_episodes): # iterate over new episodes of the game
    state = env.reset() # reset state at start of each new episode of the game
    state = np.reshape(state, [1, state_size])
    
    for time in range(2):  # time represents a frame of the game; goal is to keep pole upright as long as possible up to range, e.g., 500 or 5000 timesteps
#         env.render()
        action = agent.act(state)
      # action is either 0 or 1 (move cart left or right); decide on one or other here
        next_state, reward, done, _ = env.step(action) # agent interacts with env, gets feedback; 4 state data points, e.g., pole angle, cart position        
        reward = reward if not done else -10 # reward +1 for each additional frame with pole upright        
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done) # remember the previous timestep's state, actions, reward, etc.        
        state = next_state # set "current state" for upcoming iteration to the current next state        
        if done: # episode ends if agent drops pole or we reach timestep 5000
            print("episode: {}/{}, score: {}, e: {:.2}" # print the episode's score and agent's epsilon
                  .format(e, n_episodes, time, agent.epsilon))
            break # exit loop
    if len(agent.memory) > batch_size:
        agent.replay(batch_size) # train the agent by replaying the experiences of the episode
    if e % 50 == 0:
        agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")         

im state
[[-0.04031077  0.02444768  0.01443564  0.03365074]]
im state
[[-0.04904611  0.00636852 -0.0049215  -0.01607657]]
im state
[[ 0.01586473 -0.00492557  0.03279118 -0.0469909 ]]
im state
[[-0.04915721 -0.01539448  0.00352472 -0.00245312]]
im state
[[ 0.03486612 -0.02976177  0.03838348 -0.01632912]]
im state
[[-0.03962966 -0.02257931  0.04419673  0.01914151]]
im state
[[ 0.00116316 -0.0208033   0.01063772  0.03836153]]
im state
[[ 0.00746938 -0.03079816 -0.00134145 -0.00461728]]
im state
[[-0.04520609  0.00223286  0.00746196 -0.00085635]]
im state
[[-0.04772528 -0.04986958 -0.0421497  -0.02020933]]
im state
[[-0.02886132  0.04440323  0.0013066   0.00800775]]
im state
[[-0.00188277 -0.02386037 -0.04109984 -0.03027412]]
im state
[[ 0.02407163 -0.00280154  0.00712496  0.00890634]]
im state
[[-0.00568756 -0.00327722  0.03756241  0.00590454]]
im state
[[0.00619791 0.02632046 0.0499431  0.04902947]]
im state
[[ 0.02380381 -0.00538965  0.03738677 -0.01360934]]
im state
[[ 0.0039891  -0.04

im state
[[ 0.02667014 -0.00916468  0.01383086 -0.00311735]]
im state
[[ 0.0203891  -0.00393342  0.04658145 -0.03561608]]
[[80879.1  66873.94]]
[[87519.47 72363.86]]
im state
[[-0.01588092 -0.02476403 -0.04315949 -0.03688937]]
[[81761.52 68081.02]]
[[88192.36 73435.29]]
im state
[[ 0.04344881  0.0129211   0.03749694 -0.04384881]]
[[91069.53 75793.85]]
im state
[[-0.00356005 -0.04644265 -0.03738892 -0.03680776]]
im state
[[ 0.00890285 -0.02132451  0.04811612 -0.00145581]]
[[89540.164 74107.85 ]]
im state
[[ 0.03781106  0.0249022  -0.0363057  -0.00630094]]
[[89343.914 73997.31 ]]
[[96423.89  79860.586]]
im state
[[ 0.03472632  0.03772571  0.04562681 -0.02925374]]
[[92240.33  76523.555]]
im state
[[-0.0204299   0.00615785 -0.04420816 -0.01632581]]
im state
[[-0.01325014  0.04352146 -0.00041917 -0.01186587]]
im state
[[ 0.04760484 -0.02444701 -0.01069615 -0.03687736]]
[[98307.68  81526.234]]
[[106263.49  88123.38]]
im state
[[-0.03335154 -0.02805314 -0.00874519 -0.01844537]]
[[101124.28  8

im state
[[-0.0410289  -0.01548839  0.01476942 -0.01291023]]
[[446392.5  347214.62]]
im state
[[-0.00877227 -0.03789677  0.03278488 -0.02437884]]
[[413498.06 321367.25]]
im state
[[ 0.02996174  0.0269718   0.01930245 -0.03446252]]
[[411069.38 318990.88]]
[[451888.25 350665.88]]
im state
[[ 0.00263364 -0.04039739 -0.03836976 -0.04253326]]
[[374764.97 290144.5 ]]
im state
[[-0.02759009  0.00791233  0.0278366  -0.01292054]]
[[385323.94 298289.12]]
im state
[[-0.01365356 -0.04522176  0.01605828  0.02871219]]
[[437975.2  339054.47]]
[[480395.28 371893.1 ]]
im state
[[ 0.02551058  0.00186179  0.03247589 -0.01886263]]
[[394281.75 304749.9 ]]
im state
[[-0.0137878  -0.02388396 -0.00889374  0.03529601]]
[[445212.34 343634.62]]
[[488075.78 376718.03]]
im state
[[ 0.0192012  -0.02507347 -0.02074949  0.00255726]]
[[489323.6  377556.72]]
im state
[[-0.04986828 -0.02430111  0.01450601  0.00453357]]
[[411730.78 317734.5 ]]
im state
[[-0.04078714 -0.02914314 -0.02983543 -0.0053902 ]]
[[500570.62 38583

KeyboardInterrupt: 

In [13]:
# saved agents can be loaded with agent.load("./path/filename.hdf5") 