In [1]:
import numpy as np
from keras.layers import Dense, Conv1D
from keras.layers import Flatten
from keras.models import Sequential
from keras.optimizers import Adam

Using TensorFlow backend.


In [1]:
class A2CAgent:
    def __init__(self, game_length, action_size, n_options):
        self.render = True
        self.load_model = False
        self.game_length = game_length
        self.action_size = action_size
        self.value_size = 1
        self.n_options = n_options

        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.actor = self.build_actor()
        self.critic = self.build_critic()

        if self.load_model:
            self.actor.load_weights("./save_model/Cowbull_actor.h5")
            self.critic.load_weights("./save_model/Cowbull_critic.h5")

    # approximate policy and value using Neural Network
    # actor: state is input and probability of each action is output of model
    def build_actor(self):
        """Build and compile the policy (actor) network.

        The network takes inputs of shape
        ``(game_length, 2 * action_size + 1)`` and produces a softmax
        distribution over ``n_options``.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        actor = Sequential()

        actor.add(
            Conv1D(
                # Conv1D requires a filter count.
                # NOTE(review): 16 mirrors the hidden Dense width below and
                # is a tunable choice, not a derived value.
                filters=16,
                kernel_size=2,
                strides=1,
                padding='valid',
                activation='relu',
                kernel_initializer='he_uniform',
                # a shape tuple belongs in input_shape; input_dim is a
                # single integer
                input_shape=(self.game_length, 2 * self.action_size + 1)))
        # collapse the (steps, filters) feature map so the Dense head emits
        # one probability vector of length n_options per sample, which is
        # what get_action feeds to np.random.choice
        actor.add(Flatten())
        actor.add(
            Dense(16, activation='relu', kernel_initializer='he_uniform'))
        actor.add(
            Dense(16, activation='relu', kernel_initializer='he_uniform'))
        actor.add(
            Dense(
                self.n_options,
                activation='softmax',
                kernel_initializer='he_uniform'))
        actor.summary()
        # train_model fits this model against a vector that is zero except
        # for the advantage at the chosen option, so cross-entropy reduces
        # to -advantage * log(p[chosen option])
        actor.compile(
            loss='categorical_crossentropy', optimizer=Adam(lr=self.actor_lr))
        return actor

    # critic: state is input and value of state is output of model
    def build_critic(self):
        """Build and compile the state-value (critic) network.

        The network takes inputs of shape
        ``(game_length, 2 * action_size + 1)`` and produces ``value_size``
        linear outputs.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        critic = Sequential()

        critic.add(
            Conv1D(
                # Conv1D requires a filter count.
                # NOTE(review): 32 mirrors the hidden Dense width below and
                # is a tunable choice, not a derived value.
                filters=32,
                kernel_size=2,
                strides=1,
                padding='valid',
                activation='relu',
                kernel_initializer='he_uniform',
                # a shape tuple belongs in input_shape; input_dim is a
                # single integer
                input_shape=(self.game_length, 2 * self.action_size + 1)))
        # collapse the (steps, filters) feature map so the output matches
        # the (1, value_size) target that train_model fits against
        critic.add(Flatten())
        critic.add(
            Dense(32, activation='relu', kernel_initializer='he_uniform'))
        critic.add(
            Dense(32, activation='relu', kernel_initializer='he_uniform'))
        critic.add(
            Dense(
                self.value_size,
                activation='linear',
                kernel_initializer='he_uniform'))
        critic.summary()
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))

        return critic

    # using the output of policy network, pick action stochastically
    def get_action(self, state):

        action = []
        state[2 * self.action_size] = state[0]
        for i in len(self.action_size):
            state[2 * self.action_size] = state[i]
            policy = self.actor.predict(state, batch_size=1).flatten()
            action.append(
                np.random.choice(np.arange(0, self.n_options), p=policy))
        return action

    # update policy network every episode
    def train_model(self, state, action, reward, next_state):

        for i in range(self.action_size):
            
            target = np.zeros((1, self.value_size))
            advantages = np.zeros((1, self.n_options))
            
            state[2 * self.action_size] = state[i]
            next_state[2 * self.action_size] = state[i]
            
            value = self.critic.predict(state)[0]
            next_value = self.critic.predict(next_state)[0]

            advantages[0][action[i]] = reward + self.discount_factor * (
                next_value) - value
            target[0][0] = reward + self.discount_factor * next_value

            self.actor.fit(state, advantages, epochs=1, verbose=0)
            self.critic.fit(state, target, epochs=1, verbose=0)