In [1]:
import numpy as np
import gym
from IPython.display import clear_output
import time
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Flat, row-major description of the grid, one letter per tile.
# NOTE(review): letters presumably follow gym's FrozenLake convention
# (S=start, F=frozen, H=hole, G=goal) -- confirm against the gym docs.
Map = 'SFFFHFFFFFFFFFFG'
env_name = "FrozenLake-v0"

# Side length of the square grid, derived from the number of tiles.
shape = int(np.sqrt(len(Map)))

# One single-character element per tile, folded into a (shape, shape) array.
amap = np.asarray(Map, dtype='c').reshape(shape, shape)

# `.unwrapped` hands back the raw environment without gym's wrappers.
env = gym.make(env_name, desc=amap).unwrapped

In [3]:
class FrozenLake():
    """Epsilon-greedy Q-learning agent whose Q-function is a TF1 dense layer.

    The Q-function is a single dense layer (named ``"q_table"``) applied to a
    one-hot encoding of the state, so its kernel has one row per state and one
    column per action. The agent owns a ``tf.Session`` that is closed when the
    object is garbage collected.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n`` (discrete state and action counts).
        discount_rate: Discount factor applied to the bootstrapped next-state
            value.
        learning_rate: Learning rate handed to the Adam optimizer.
        eps_decay: Multiplicative factor applied to the exploration rate
            ``eps`` each time a terminal transition is trained on.
    """

    def __init__(self, env, discount_rate= 1.0, learning_rate= 0.25, eps_decay= 0.99):
        self.state_space  = env.observation_space.n
        self.action_space = env.action_space.n
        self.discount_rate= discount_rate
        self.learning_rate= learning_rate
        self.eps_decay= eps_decay
        # Start fully exploratory; `train` decays this at the end of episodes.
        self.eps= 1.0
        self.build_model()

        self.sess= tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build_model (self):
        """Build the graph: placeholders, Q-value layer, loss and train op.

        Resets the default TensorFlow graph first, so any previously built
        graph in this process is discarded.
        """
        tf.reset_default_graph()

        # Each placeholder carries exactly one transition (shape [1]).
        self.state= tf.placeholder(tf.int32, shape= [1])
        self.action= tf.placeholder(tf.int32, shape= [1])
        self.target= tf.placeholder(tf.float32, shape= [1])

        self.state_b= tf.one_hot(self.state, depth=self.state_space)
        self.action_b= tf.one_hot(self.action, depth=self.action_space)

        # Q-values of every action in the fed state.
        self.q_state= tf.layers.dense(self.state_b, units=self.action_space, name="q_table")
        # Mask with the one-hot action to pick out Q(state, action).
        self.q_value= tf.reduce_sum(tf.multiply(self.q_state, self.action_b), axis=1)

        # Squared error between the fed target and the current estimate.
        self.loss= tf.reduce_sum(tf.square(self.target - self.q_value))
        self.optimizer= tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, cur_state):
        """Return an epsilon-greedy action for ``cur_state``.

        With probability ``self.eps`` a uniformly random action index is
        returned; otherwise the index of the largest Q-value for the state.
        """
        # Draw the exploratory action first so the numpy RNG is consumed in the
        # same order (randint, then random) on every call.
        random_action= np.random.randint(self.action_space)

        if np.random.random() < self.eps:
            # Exploring: the Q-values are not needed, so skip the forward pass.
            return random_action

        q_value= self.sess.run(self.q_state, feed_dict= {self.state: [cur_state]})
        return np.argmax(q_value)

    def train (self, experience):
        """Run one optimizer step on a single transition.

        Args:
            experience: Tuple ``(cur_state, action, next_state, reward, done)``.

        On a terminal transition the target is the immediate reward alone and
        ``self.eps`` is multiplied by ``self.eps_decay``. Otherwise the target
        is ``reward + discount_rate * max_a Q(next_state, a)``.
        """
        cur_state, action, next_state, reward, done= experience

        if done:
            # Nothing to bootstrap from after a terminal state, but the reward
            # received on this very step still has to be learned. Using 0 here
            # would throw away any reward that arrives together with `done`.
            target= reward
            self.eps= self.eps*self.eps_decay
        else:
            q_next= self.sess.run(self.q_state, feed_dict={self.state: [next_state]})
            target= reward + self.discount_rate*np.max(q_next)

        self.sess.run(self.optimizer, feed_dict= {self.state: [cur_state], self.action: [action], self.target: [target]})

    def __del__(self):
        # `sess` is assigned after `build_model()` in `__init__`; if that call
        # raised, the attribute does not exist and there is nothing to close.
        sess= getattr(self, "sess", None)
        if sess is not None:
            sess.close()
        

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [11]:
# Build the agent on the environment, keeping the constructor's default
# discount and learning rates.
agent = FrozenLake(env)

In [14]:
# Running sum of rewards over every step of every episode below.
Total_reward=0

# The kernel of the "q_table" dense layer is the same variable on every step,
# so look it up once instead of re-entering the variable scope per step.
with tf.variable_scope("q_table",reuse= True):
    q_kernel= tf.get_variable("kernel")

for _ in range(100):
    cur_state= env.reset()

    done= False
    while not done:
        action = agent.get_action(cur_state)
        next_state, reward, done, info= env.step(action)

        # Learn from the transition just observed, then advance the state.
        agent.train((cur_state, action, next_state, reward, done))
        Total_reward+=reward

        cur_state= next_state

        # Live progress display: exploration rate, current kernel weights,
        # the rendered environment and the accumulated reward.
        print(agent.eps)
        weights= agent.sess.run(q_kernel)
        print(weights)

        env.render()
        print(Total_reward)
        # wait=True defers the clear until the next output arrives, so the
        # display is replaced in place rather than flickering.
        clear_output(wait=True)

0.04904089407128576
[[ 1.39850521e+01  3.65275040e+01  3.93565903e+01  1.28192184e+02]
 [ 1.29851242e+02  1.47784378e+02  1.44239883e+02  1.30923798e+02]
 [ 1.37507828e+02  1.47232590e+02  1.46830338e+02  1.26600571e+02]
 [ 1.36428177e+02  1.48665955e+02  1.48389175e+02  1.26093880e+02]
 [ 9.88352299e-02  4.18891191e-01  5.35588622e-01 -2.54441500e-02]
 [ 6.77535095e+01  6.21759176e+00  1.47164505e+02  3.01646767e+01]
 [ 1.31871399e+02  1.39454391e+02  1.43808258e+02  1.27759476e+02]
 [ 1.34973846e+02  1.27332611e+02  1.46624512e+02  1.27776581e+02]
 [ 3.50320396e+01  1.42183350e+02  2.24239273e+01 -2.43800163e+01]
 [ 1.00050415e+02  1.24884964e+02  1.47258102e+02  1.20557541e+02]
 [ 1.15764038e+02  1.33322632e+02  1.41880600e+02  1.27798943e+02]
 [ 6.86162491e+01  4.31229324e+01  3.24232445e+01  1.27853355e+02]
 [ 3.52605553e+01  3.95696983e+01  1.24579636e+02  1.72731037e+01]
 [ 8.36522293e+01  1.00395988e+02  1.46731522e+02  1.12733154e+02]
 [ 1.38924210e+02  5.19734726e+01  5.37920