In [74]:
import gym
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

In [75]:
env = gym.make("CartPole-v1")
env.reset()
n_actions = env.action_space.n
input_dim = env.observation_space.shape[0]

In [76]:
def replay(replay_memory, minibatch_size=32):
    minibatch = np.random.choice(replay_memory, minibatch_size, replace=True)
    s_l =      np.array(list(map(lambda x: x['s'], minibatch)))
    a_l =      np.array(list(map(lambda x: x['a'], minibatch)))
    r_l =      np.array(list(map(lambda x: x['r'], minibatch)))
    sprime_l = np.array(list(map(lambda x: x['sprime'], minibatch)))
    done_l   = np.array(list(map(lambda x: x['done'], minibatch)))
    qvals_sprime_l = model.predict(sprime_l)
    target_f = model.predict(s_l) # includes the other actions, states
    # q-update
    for i,(s,a,r,qvals_sprime, done) in enumerate(zip(s_l,a_l,r_l,qvals_sprime_l, done_l)): 
        if not done:  target = r + gamma * np.max(qvals_sprime)
        else:         target = r
    target_f[i][a] = target
    model.fit(s_l,target_f, epochs=1, verbose=0)
    return model

In [77]:
# nnet to approximate q 
model = Sequential()
# 5 = 4 states + 1 action
model.add(Dense(64, input_dim = 4 , activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(2, activation = 'linear'))
model.compile(optimizer=Adam(), loss = 'mse')

In [79]:
n_episodes = 1200
gamma = 0.99
epsilon = 1
minibatch_size = 32
r_sums = []
# replay memory holds s, a, r, s'
replay_memory = []
for n in range(n_episodes): 
    s = env.reset()
    done=False
    r_sum = 0
    while not done: 
        #env.render()
        # 1. Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = model.predict(s.reshape(1,4))
        # 1a. choose action
        if np.random.random() < epsilon:  a = env.action_space.sample()
        else:                             a = np.argmax(qvals_s); # print(qvals_s, a)
        sprime, r, done, info = env.step(a)
        r_sum += r 
        # add to memory 
        if len(replay_memory) > 10000:
            replay_memory.pop(0)
            replay_memory.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
            
        s=sprime

        # Experience Replay - update weights after every episode
        if not done:
            model=replay(replay_memory, minibatch_size = minibatch_size)
        if epsilon > 0.01:      epsilon -= 0.001
    print("Total reward:", r_sum)
    r_sums.append(r_sum)
    if n % 100 == 0 and n != 0 : 
        print(n) 
        print(np.mean(r_sums[n-100:n]))
        
        

Total reward: 16.0
Total reward: 39.0
Total reward: 10.0
Total reward: 14.0
Total reward: 18.0
Total reward: 11.0
Total reward: 21.0
Total reward: 10.0
Total reward: 21.0
Total reward: 26.0
Total reward: 12.0
Total reward: 14.0
Total reward: 13.0
Total reward: 33.0
Total reward: 12.0
Total reward: 16.0
Total reward: 16.0
Total reward: 15.0
Total reward: 12.0
Total reward: 12.0
Total reward: 28.0
Total reward: 14.0
Total reward: 14.0
Total reward: 19.0
Total reward: 18.0
Total reward: 11.0
Total reward: 11.0
Total reward: 10.0
Total reward: 11.0
Total reward: 18.0
Total reward: 17.0
Total reward: 11.0
Total reward: 10.0
Total reward: 10.0
Total reward: 13.0
Total reward: 11.0
Total reward: 14.0
Total reward: 11.0
Total reward: 12.0
Total reward: 11.0
Total reward: 14.0
Total reward: 11.0
Total reward: 12.0
Total reward: 11.0
Total reward: 11.0
Total reward: 11.0
Total reward: 13.0
Total reward: 8.0
Total reward: 11.0
Total reward: 12.0
Total reward: 10.0
Total reward: 11.0
Total reward:

Total reward: 10.0
Total reward: 11.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 8.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 9.0
Total reward: 9.0
Total reward: 8.0
Total reward: 10.0
Total reward: 9.0
Total reward: 8.0
Total reward: 8.0
Total reward: 10.0
Total reward: 8.0
Total reward: 9.0
Total reward: 9.0
Total reward: 10.0
Total reward: 9.0
Total reward: 9.0
Total reward: 10.0
Total reward: 8.0
Total reward: 8.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 10.0
Total reward: 8.0
Total reward: 8.0
Total reward: 8.0
Total reward: 9.0
Total reward: 12.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Tot

Total reward: 8.0
Total reward: 8.0
Total reward: 9.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
900
9.3
Total reward: 10.0
Total reward: 8.0
Total reward: 8.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 9.0
Total reward: 8.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 12.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 9.0
Total reward: 11.0
Total reward: 10.0
Total reward: 9.0
Total reward: 10.0
Total reward: 10.0
Total reward: 8.0
Total reward: 10.0
Total reward: 9.0
Total reward: 8.0
Total reward: 9.0
Total rewa

In [None]:
replay_memory.pop(0)

In [None]:
for i in range(msize): 
    tmp = minibatch[i]
    s,a,r,sprime,done = tmp['s'],tmp['a'],tmp['r'],tmp['sprime'],tmp['done']
    # 2. Do a feedforward pass for the next state s’ and calculate 
    # maximum overall network outputs max a’ Q(s’, a’).
    
    # 3. Set Q-value target for action to r + γmax a’ Q(s’, a’), 
    # For all other actions, set the Q-value target to the same 
    # as originally returned from step 1, making the error 0 for those outputs.
    if not done: 
        target = r + gamma * np.max(qvals_sprime)
    else: 
        target = r 
    #print(target)
    target_f = model.predict(s.reshape((1,4)))
    target_f[0][a] = target
    


In [23]:
np.array(list(map(lambda x: x['s'], minibatch)))

(32, 4)

In [17]:
s

array([ 0.03509201, -0.02546408, -0.02205155,  0.00229339])

In [239]:
minibatch = np.random.choice(replay_memory, msize , replace=False)


In [248]:
target_f[0][1]

13232.123

In [None]:
minibatch = random.sample(self.memory, batch_size)
for s, a, r, sprime in minibatch:
   # target = r
   # if not done:
    target = r + gamma * np.amax(model.predict(sprime)[0])
    target_f = model.predict(s.reshape(1,4))
    target_f[0][a] = target
    model.fit(s, target_f, epochs=1, verbose=0)
if self.epsilon > self.epsilon_min:
    self.epsilon *= self.epsilon_decay

In [243]:
target_f = model.predict(s.reshape((1,4)))


In [244]:
target_f

array([[12929.587, 13232.123]], dtype=float32)

In [245]:
s

array([-0.04088862, -0.03568615,  0.0272934 , -0.02125022])