In [None]:
# import gym

# env = gym.make('CartPole-v1')
# state = env.reset()
# while True:
#     env.render()
#     action = model.predict(state)
#     next_state, reward, done, info = env.step(action)
#     if done:
#         break

In [3]:
import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque

num_episodes = 500
num_exploration_episodes = 100
max_len_episode = 1000
batch_size = 32
learning_rate = 1e-3
gamma = 1.
initial_epsilon = 1.
final_epsilon = 0.01

In [4]:
class QNetwork(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(units=2)
        
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        return x
    
    def predict(self, inputs):
        q_values = self(inputs)
        return tf.argmax(q_values, axis=-1)

In [5]:
env = gym.make('CartPole-v1')
model = QNetwork()

In [7]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
replay_buffer = deque(maxlen=10000)
epsilon = initial_epsilon
for episode_id in range(num_episodes):
    state = env.reset()
    epsilon = max(initial_epsilon * (num_exploration_episodes - episode_id) / num_exploration_episodes, final_epsilon)
    for t in range(max_len_episode):
        env.render()
        if random.random() < epsilon:
            # 这里都没有设置怎么采样的
            action = env.action_space.sample()
        else:
            action = model.predict(np.expand_dims(state, axis=0)).numpy()
            # 为什么取第一个项
            action = action[0]
        
        next_state, reward, done, info = env.step(action)
        reward = -10. if done else reward
        replay_buffer.append((state, action, reward, next_state, 1 if done else 0))
        state = next_state
        
        if done:
            # 这里不是score吧？
            print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
            break
        
        if len(replay_buffer) >= batch_size:
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
                *random.sample(replay_buffer, batch_size))
            batch_state, batch_reward, batch_next_state, batch_done = [
                np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
            batch_action = np.array(batch_action, dtype = np.int32)
        
            q_value = model(batch_next_state)
            y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)
            with tf.GradientTape() as tape:
                loss = tf.keras.losses.mean_squared_error(
                    y_true=y, 
                    y_pred=tf.reduce_sum(model(batch_state) * tf.one_hot(batch_action, depth=2), axis=1)
                )
            grads = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(grads_and_vars=zip(grads, model.variables))

episode 0, epsilon 1.000000, score 32


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

episode 1, epsilon 0.990000, score 39
episode 2, epsilon 0.980000, score 11
episode 3, epsilon 0.970000, score 12
episode 4, epsilon 0.960000, score 29
episode 5, epsilon 0.950000, score 13
episode 6, epsilon 0.940000, score 27
episode 7, epsilon 0.930000, score 30
episode 8, epsilon 0.920000, score 9
episode 9, epsilon 0.910000, score 23
episode 10, epsilon 0.900000, score 17
episode 11, epsilon 0.890000, score 44
episode 12, epsilon 0.880000, score 20
episode 13, epsilon 0.870000, score 21
episode 14, epsilon 0.860000, score 9
episode 15, epsilon 0.850000, score 44
episode 16, epsilon 0.840000, score 11
episode 17, epsilon 0.830000, score 33
episode 1