In [1]:
import gym 
import random
import numpy as np
import time
import tensorflow as tf
from gym.envs.registration import register 
from IPython.display import clear_output  # feature of jupyter notebook
from collections import deque

# What is a deque?
A deque (double-ended queue) is available in Python through the `collections` module. It is preferred over a list when we need fast append and pop operations at both ends of the container: a deque performs them in O(1) time at either end, whereas a list needs O(n) time to insert or pop at its front (only operations at the end of a list are O(1)).

In [2]:
# Record the library versions this notebook was executed with.
print(f"Using OpenAI Gym: {gym.__version__}")
print(f"Using Tensorflow: {tf.__version__}")

Using OpenAI Gym: 0.17.1
Using Tensorflow: 1.14.0


In [3]:
# Register a non-slippery 4x4 FrozenLake variant under a custom id and create it.
# The part of the id before "-v0" can be renamed freely.
env_name = "FrozenLakenoslip-v0"
try:
    register(
        id=env_name,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},  # deterministic moves: no slipping
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    # Gym raises gym.error.Error when an id is registered twice, which happens
    # whenever this cell is re-run. Only that error is ignored; anything else
    # (including KeyboardInterrupt) still propagates instead of being hidden.
    pass
env = gym.make(env_name)

In [4]:
# Observation space: one discrete state per cell of the 4x4 grid.
# Action space: 4 discrete moves.
print(env.observation_space, env.action_space, sep="\n")
type(env.action_space)

Discrete(16)
Discrete(4)


gym.spaces.discrete.Discrete

In [5]:
class Agent():
    """Baseline agent that picks actions at random, ignoring the state.

    Supports discrete action spaces (``gym.spaces.discrete.Discrete`` and its
    subclasses) as well as any other action space exposing ``low``, ``high``
    and ``shape`` attributes (presumably a Box-like space).
    """

    def __init__(self, env):
        """Read the action space of ``env`` and print a short summary of it.

        Args:
            env: Environment object exposing an ``action_space`` attribute.
        """
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of Discrete.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An int in ``[0, action_size)`` for discrete action spaces, otherwise
            an array of shape ``action_shape`` sampled uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [12]:
# Same network as the plain Q-network agent; the only difference is the
# replay buffer (a bounded deque) that training batches are sampled from.
class QNRAgent(Agent):
    """Q-network agent with experience replay (TensorFlow 1.x graph mode).

    The network is a single dense layer applied to the one-hot encoded state,
    producing one Q-value per action. Actions are chosen epsilon-greedily and
    epsilon is decayed each time a terminal experience is trained on.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.001,
                 buffer_size=1000, eps_decay=0.99):
        """Build the graph, open a session and create the replay buffer.

        Args:
            env: Environment with discrete observation and action spaces.
            discount_rate: Discount factor applied to the next state's best Q-value.
            learning_rate: Step size of the Adam optimizer.
            buffer_size: Maximum number of experiences kept in the replay buffer;
                the oldest entries are dropped once it is full.
            eps_decay: Factor epsilon is multiplied by after each terminal experience.
        """
        # Defined first so __del__ is safe even if construction fails part-way.
        self.sess = None
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.eps = 1.0  # start fully exploratory
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.replay_buffer = deque(maxlen=buffer_size)

    def build_model(self):
        """Reset the default graph and define the Q-network, loss and optimizer."""
        tf.reset_default_graph()
        # shape=[None]: one-dimensional batches of any length.
        self.state_in = tf.placeholder(tf.int32, shape=[None])
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        self.target_in = tf.placeholder(tf.float32, shape=[None])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # Q-values of every action for each state in the batch.
        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # The one-hot action mask keeps only the Q-value of the action taken.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience, batch_size=50):
        """Store ``experience`` and run one optimizer step on a replayed batch.

        Args:
            experience: Tuple ``(state, action, next_state, reward, done)``.
            batch_size: Number of experiences sampled (with replacement) from the
                replay buffer. The new experience is always added on top, so the
                batch actually trained on has ``batch_size + 1`` entries.
        """
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batch_size)
        # zip(...) transposes the list of experience tuples into five columns.
        state, action, next_state, reward, done = (
            list(col) for col in zip(experience, *samples)
        )

        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})
        # Explicit boolean mask: a plain list of 0/1 ints would otherwise be
        # interpreted as row indices instead of a mask.
        terminal = np.asarray(done, dtype=bool)
        q_next[terminal] = 0.0  # no future value after a terminal transition
        q_target = np.asarray(reward) + self.discount_rate * np.max(q_next, axis=1)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict=feed)

        # Decay exploration once per finished episode.
        if experience[4]:
            self.eps = self.eps * self.eps_decay

    def __del__(self):
        """Close the TensorFlow session if one was opened."""
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

agent = QNRAgent(env)

Action size: 4
State size: 16


In [16]:
# Look up the weight matrix ("kernel") of the "q_table" dense layer once.
# reuse=True makes tf.get_variable return the existing variable instead of
# trying to create a new one; it does not by itself carry weights over between
# runs. The lookup does not change between steps, so it stays out of the loop.
with tf.variable_scope("q_table", reuse=True):
    q_table_kernel = tf.get_variable("kernel")

total_reward = 0
for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward

        # Live view: latest transition, running totals, the board and the weights.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep, total_reward, agent.eps))
        env.render()

        weights = agent.sess.run(q_table_kernel)
        print(weights)
        time.sleep(0.05)  # slow down so each frame is visible
        clear_output(wait=True)  # replace the previous frame instead of appending

s: 15 a: 2
Episode: 99, Total reward: 98.0, eps: 0.017950553275045134
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 0.5293679   0.41516203  0.37216517  0.15255252]
 [ 0.5484979  -0.430597    0.34724542  0.04113482]
 [ 0.522624    0.28485516  0.00297938  0.12154437]
 [ 0.22556773 -0.27479848  0.06086236 -0.19075096]
 [ 0.57514614  0.4417374  -0.4362241   0.1737367 ]
 [-0.14803278  0.01598388  0.23613018  0.19616961]
 [-0.00995207  0.11141294 -0.43796554  0.10492181]
 [ 0.3080092   0.3474084  -0.24372712 -0.4915411 ]
 [ 0.6009202  -0.4376558   0.4766692   0.1355403 ]
 [ 0.60161     0.49733016  0.5048913  -0.6830667 ]
 [ 0.62823844  0.5264207  -0.43567204  0.05987294]
 [-0.4477314   0.11181992 -0.10622957 -0.09693518]
 [ 0.06005758  0.40364558 -0.0392949  -0.4964552 ]
 [-0.23923656  0.49711064  0.5339876   0.2303302 ]
 [ 0.6568191   0.52576923  0.5639878   0.257516  ]
 [-0.5440019   0.46472573  0.00205296 -0.13045731]]
