In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
import keras.backend as K
import numpy as np

In [None]:
class Agent(object):
    """One-step actor-critic agent with a shared fully connected trunk.

    Three Keras models are built over the same layers:

    * ``actor``  -- inputs ``[state, advantage]``, output action probabilities;
      trained with an advantage-weighted log-likelihood loss.
    * ``critic`` -- input ``state``, output a single scalar state value;
      trained with mean squared error against the TD target.
    * ``policy`` -- input ``state``, output action probabilities; used only
      for inference when sampling an action (it is never compiled).

    Args:
        input_dims: Length of the observation vector (default 8).
        n_actions: Number of discrete actions (default 4).
        gamma: Discount factor applied to the bootstrapped next-state value
            in the TD target (default 0.99). Pass 1.0 for undiscounted targets.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
    """

    def __init__(self, input_dims=8, n_actions=4, gamma=0.99,
                 actor_lr=0.00001, critic_lr=0.00005):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        # Derived from n_actions so the two can never disagree.
        self.action_space = list(range(n_actions))
        self.actor, self.critic, self.policy = self.build_actor_critic_network()

    def build_actor_critic_network(self):
        """Build and compile the actor and critic, plus an inference-only policy.

        Returns:
            Tuple ``(actor, critic, policy)`` of Keras models that share the
            two hidden Dense layers.
        """
        # Receives the state.
        inputs = Input(shape=(self.input_dims,))
        # Receives the advantage; it is consumed only by the actor's loss.
        advantages = Input(shape=[1])
        dense1 = Dense(units=1024,
                       activation='relu')(inputs)
        dense2 = Dense(units=512,
                       activation='relu')(dense1)
        # Probability of taking each action in the action space.
        outputs = Dense(units=self.n_actions,
                        activation='softmax')(dense2)
        # State-value estimate (linear output).
        values = Dense(units=1,
                       activation=None)(dense2)

        # Custom loss for the actor model.
        def custom_loss(y_true, y_pred):
            """Negative log-likelihood of the chosen action, scaled by advantage.

            ``y_true`` is the one-hot action fed by ``learn`` and ``y_pred`` the
            predicted probabilities. ``advantages`` is captured from the
            enclosing scope, which is why it must be a model input.
            """
            # Clip away exact 0 and 1 so the logarithm stays finite.
            y_pred = K.clip(y_pred, 1e-8, 1-1e-8)
            log_lik = y_true * K.log(y_pred)
            return K.sum(-log_lik * advantages)

        # Build the actor model.
        # NOTE(review): `lr` is the legacy argument name; newer Keras releases
        # expect `learning_rate` -- confirm against the installed version.
        actor = Model([inputs, advantages], outputs)
        actor.compile(optimizer=Adam(lr=self.actor_lr),
                      loss=custom_loss,
                      metrics=None)
        # Build the critic model.
        critic = Model(inputs, values)
        critic.compile(optimizer=Adam(lr=self.critic_lr),
                       loss='mse',
                       metrics=None)
        # Build the policy model (same weights as the actor, no advantage input).
        policy = Model(inputs, outputs)
        return actor, critic, policy

    def choose_action(self, state):
        """Sample an action from the policy's distribution for ``state``.

        Args:
            state: 1-D observation vector.

        Returns:
            One element of ``self.action_space``.
        """
        # Add a batch axis: the models expect (batch, input_dims).
        state = np.asarray(state)[np.newaxis, :]
        # Predict the probability of each action in the current state.
        probabilities = self.policy.predict(state, verbose=0)[0]
        # Draw one action according to those probabilities.
        action = np.random.choice(self.action_space, p=probabilities)
        return action

    def learn(self, state, action, reward, next_state, done):
        """Run one training step of the actor and the critic on a transition.

        Args:
            state: 1-D observation before the action.
            action: Integer index of the action that was taken.
            reward: Scalar reward received for the transition.
            next_state: 1-D observation after the action.
            done: Truthy when ``next_state`` is terminal; the bootstrapped
                next-state value is then dropped from the target.
        """
        # Current state, with a batch axis.
        state = np.asarray(state)[np.newaxis, :]
        # Next state, with a batch axis.
        next_state = np.asarray(next_state)[np.newaxis, :]
        # Value of the current state.
        critic_value = self.critic.predict(state, verbose=0)
        # Value of the next state.
        next_critic_value = self.critic.predict(next_state, verbose=0)
        # Training target of the critic: discounted one-step TD target.
        target = reward + self.gamma * next_critic_value * (1 - int(done))
        # Advantage = TD error.
        advantage = target - critic_value
        # One-hot encode the action directly as a (1, n_actions) batch so the
        # shape does not depend on how a helper treats a scalar label.
        action_one_hot = np.zeros((1, self.n_actions), dtype='float32')
        action_one_hot[0, int(action)] = 1.0
        # Train the actor model.
        self.actor.fit([state, advantage], action_one_hot, verbose=0)
        # Train the critic model.
        self.critic.fit(state, target, verbose=0)

In [None]:
import gym

# Training driver: run the agent on LunarLander-v2 and update it online,
# one transition at a time, printing the undiscounted return of each episode.
agent = Agent()
env = gym.make('LunarLander-v2')
n_episodes = 2000
try:
    for i in range(n_episodes):
        done = False
        total_reward = 0
        # Newer gym releases return (observation, info) from reset();
        # older ones return the observation alone.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        while not done:
            action = agent.choose_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: termination and time-limit truncation are reported
                # separately. The episode ends on either.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                # Legacy API: a single `done` flag.
                next_state, reward, done, _ = step_result
                terminated = done
            total_reward += reward
            # Only a true termination should cut the bootstrapped value.
            agent.learn(state, action, reward, next_state, terminated)
            state = next_state
        print(f'Episode {i}/{n_episodes} ---> Total Reward: {total_reward}')
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()