In [1]:
from keras.layers import Dense, Flatten
from keras.models import Sequential
from collections import deque
# deque gives double sided queue, delete and append from both sides
import numpy as np
import random

import gym

Using TensorFlow backend.


In [2]:
# definition for some hyper parameters

#episodes - a number of games we want the agent to play.

#gamma - aka decay or discount rate, to calculate the future discounted reward.

#epsilon - aka exploration rate, this is the rate in which an agent randomly decides its action rather than prediction.

#epsilon_decay - we want to decrease the number of explorations as it gets good at playing games.

#epsilon_min - we want the agent to explore at least this amount.

#learning_rate - Determines how much neural net learns in each iteration.

#mbSize - Size of mini batch that will be taken from memory to train model

#tsteps - Time steps in each episode

In [3]:
# the flow of this process goes like

# step 1 import dependencies, define hyperparameters
# step2 initialize environment
# step3 build neural net architecture
# step 4 make the main part, i.e, time steps and agent taking decision based on max q value given from network,
# depending on epsilon agent chooses to explore-exploit, first network is not trained, so predictes bad random maybe.
#step 5 result is stored in deque all along state, action, reward etc.
# step 6 mini batch is taken from stored memory and network is trained on it.
# step6 goes to step 4 and again predict

In [4]:
#INITIALIZING GYM ENVIRONMENT
print('initializing gym agent-environment')
env = gym.make('CartPole-v0')

[2017-10-08 01:19:34,455] Making new env: CartPole-v0


initializing gym agent-environment


In [19]:
print('setting hyper parameters')
D = deque()                                # Register where the actions will be stored
tSteps = 500                              # Number of timesteps we will be acting on the game and observing results
epsilon = 0.7                              # Probability of doing a random move
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mbSize = 50   

setting hyper parameters


In [6]:
# observation space for the environment

# Every environment comes with first-class Space objects that describe the valid actions and observations: 
# The observation_space defines the structure of the observations your environment will be returning. 
# Learning agents usually need to know this before they start running, in order to set up the policy function. 
# Some general-purpose learning agents can handle a wide range of observation types: Discrete, Box, or pixels 

# similarly we have action space which contains possible actions, in this case 2
# these operations can be used 

#print(env.observation_space)
#print(env.observation_space.shape)

#print(env.observation_space.high)
#print(env.observation_space.low)

#print(env.action_space.n)
# env.observation_space,n can't be used because action space is discrete but ob space is box type 

In [7]:
# BUILDING DEPP NEURAL NET FOR DQN
print('building neural net architecture')

model = Sequential()
model.add(Dense(20, input_shape = (2,) + env.observation_space.shape, kernel_initializer = 'uniform',
                activation = 'relu'))

#input is the state of the game which is described by observation space and action space
# here input from 2 consecutive states is fed as input

model.add(Flatten())  #as the dimension is not 1D, and we are operating with simple Dense units, so better to convert
                     # it into 1-d
model.add(Dense(18, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(10, kernel_initializer = 'uniform', activation = 'relu'))

model.add(Dense(env.action_space.n, kernel_initializer = 'uniform', activation = 'linear'))
# output no. will be equal to number of possible actions
# env.action_space.n here gives 2, as it is discrete

model.compile(loss = 'mse', optimizer = 'Adam', metrics = ['accuracy'])

building neural net architecture


In [25]:
# building main part
print('building main part of action- observation')

observation = env.reset()
#print(observation.shape)           # this observation is having parameters of state , 1 d array having 4 values,
                            # but is unstructured, (4,), it is restructuring to (1,4,) np.newaxis can also b used
obs = np.expand_dims(observation, axis = 0)
#print(obs.shape)

state = np.stack( (obs, obs), axis = 1)   #vertically stacking to make obs structure (1, 2, 4)
#print(state.shape)

# all these reshaping is done so it can be fed into neural net

done = False
for t in range(tSteps):
    if np.random.rand() <= epsilon:
        #70% probability chosing to explore
        action = np.random.randint(0, env.action_space.n, size = 1)[0]  #taking the first random int generated between
                                                        # 0 and 2, for taking tht action
    else:
        # choose to exploit best option, i.e make prediction from network and choose action with highest 
        # Q value
        
        Q = model.predict(state)
        action = np.argmax(Q)
        
    #now take that action
    obsNew, reward, done, info = env.step(action)
 
    #similarly the new observation will be unstructured, we ll re structure it and make new
    #state and save the previous state to deque (memory)
    
    obsNew = np.expand_dims(obsNew, axis = 0)  #(4,) to (1,4)

    #now we will append one last state to the the new state as in neural net we feed 2 consecutive states
       
    stateNew = np.append(np.expand_dims(obsNew, axis=0), state[:, :1, :], axis=1)
    
     #substitute to above line
    #state_new2 = np.append(obsNew[:,:,np.newaxis], state[:, :1, :], axis=1)
    #stateNew = np.stack( (np.expand_dims(obsNew, axis = 0), state[:, :1, :]) , axis = 1)
    
    #now we will save previous state
    D.append( (state, action, reward, stateNew, done) )
    state = stateNew
    if done:
        env.reset()        # Restart game if it's finished
        obs = np.expand_dims(observation, axis = 0)
        state = np.stack((obs, obs), axis=1)
print('Done observation')

building main part of action- observation
Done observation


In [47]:
# LEARNING FROM THE MISTAKES (Replayinf from memory)
miniBatch = random.sample(D, mbSize)    #random.choices(D, mbSize) can't b used as they provide with replacements

inputShape = (mbSize,) + state.shape[1:]  #(50, 2, 4)
# initializing inputs and targets
inputs = np.zeros(inputShape)
targets = np.zeros( (mbSize, env.action_space.n))

for i in range(mbSize):
    state = miniBatch[i][0]
    action = miniBatch[i][1]
    reward = miniBatch[i][2]
    stateNew = miniBatch[i][3]
    done = miniBatch[i][4]
    
    #inputs[i : i+1] = np.expand_dims(state, axis = 0)
    inputs[i] = state
    
    # the target to be achieved is also predicted by model
    targets[i] = model.predict(state)
    Q_sa = model.predict(stateNew)
    if done:
        targets[i, action] = reward   #this reward will be last
    else:
        targets[i, action] = reward + gamma *np.max(Q_sa)
        
# training on batch
# model.train_on_batch(self, x, y, sample_weight=None, class_weight=None)
# Runs a single gradient update on a single batch of data.

model.train_on_batch(inputs, targets)

print('Done Learning')

Done Learning


In [46]:
# Play !
# now that the model has trained 

observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()                    # Uncomment to see game running
    Q = model.predict(state)        
    action = np.argmax(Q)         
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)    
    tot_reward += reward
print('Game ended! Total reward: {}'.format(reward))

Game ended! Total reward: 1.0
