# 深度學習 

# 伯伯會社 第5組 Competition04 報告

## 組員:江伯耕、蔣嘉霖、周秉儒、陳炘昱


In [1]:
# %%
import os 
import tensorflow as tf
import numpy as np
from ple.games.flappybird import FlappyBird
from ple import PLE
import matplotlib.pyplot as plt
import copy
import wandb
from tensorflow.keras.layers import Input, Dense
import argparse
from threading import Thread, Lock
from multiprocessing import cpu_count
os.environ["SDL_VIDEODRIVER"] = "dummy" 


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
cpu_count()

20

In [None]:
#%%
tf.keras.backend.set_floatx('float32')
wandb.init(name='A3C_Dominic_CPU=1_update_interval=4000', project="DL_comp4")


In [4]:
import tensorflow as tf
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
    except RuntimeError as e:
        print(e)

# Model: A3C

https://arxiv.org/pdf/1602.01783.pdf

* A2C為policy method的一種，由actor更新policy distribution，由critic估計value function。
* A3C則是A2C開多核平行進行參數更新的版本。

## 本次比賽我們採用A3C的架構，但由於多核心的training需花較長的時間，這次競賽我們交的模型只用單核(i.e. A2C)進行training，從結果來看，即使只用單核，效果也是不錯的。


## (一)、Parameter

* actor 和 critic的 learning rate 調小一點(小數點第四位)的training效果比較好。

* update_interval的參數基本上越大越好，盡量讓bird在死亡或收集許多動作和獎勵後再進行更新。


In [5]:
#%%
args = {
  'gamma' : 0.95,
  'update_interval':4000,
  'actor_lr':0.0006,
  'critic_lr':0.0008,
  'entropy_beta':0,
  'reward_no_die':0,
  'reward_die':-5,
  'reward_through':1,
  'p-threshold':0.49,
  'MIN_EXPLORING_RATE':0.01
}
CUR_EPISODE = 0


In [6]:
#%%
class Actor:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.RMSprop(args['actor_lr'])
        self.entropy_beta = args['entropy_beta']

    def create_model(self):
        return tf.keras.Sequential([
            tf.keras.Input((self.state_dim,)),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_dim, activation='softmax')
        ])

    def compute_loss(self, actions, logits, advantages):
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)
        entropy_loss = tf.keras.losses.CategoricalCrossentropy(
            from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(
            actions, logits, sample_weight=tf.stop_gradient(advantages))
        entropy = entropy_loss(logits, logits)
        return policy_loss - self.entropy_beta * entropy

    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(
                actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


In [7]:
#%%
class Critic:
    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.RMSprop(args['critic_lr'])

    def create_model(self):
        return tf.keras.Sequential([
            tf.keras.Input((self.state_dim,)),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1, activation='linear')
        ])

    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


In [8]:
#%%
def make_new():
    game = FlappyBird()
    env = PLE(game)
    return env


## (二)、Preprocess

* 我們有對TA_state做normalization的處理，最後會存進model裡，讓model在執行助教環境時，會自動進行normalization。

In [9]:
#%%
def TA_state(game):
    state = copy.deepcopy(game.getGameState())
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']
    relative_state = list(state.values())
    #return the state in tensor type, with batch dimension
    relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
    relative_state = tf.expand_dims(relative_state, axis=0)
    return relative_state/[512,18,288,100,100,288,100,100]
    #return relative_state/[512,10,36,100,100,36,100,100]

In [10]:
#%%
class Agent:
    def __init__(self, env_name):
        env = make_new()
        self.env_name = env_name
        self.state_dim = TA_state(env).shape[1]
        self.action_dim = len(env.getActionSet())

        self.global_actor = Actor(self.state_dim, self.action_dim)
        self.global_critic = Critic(self.state_dim)
        #self.num_workers = cpu_count()
        self.num_workers = 1
        #self.num_workers = 1
    def train(self, max_episodes=1000000):
        workers = []
        for i in range(self.num_workers):
            env = make_new()
            workers.append(WorkerAgent(
                env, self.global_actor, self.global_critic, max_episodes))

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()


## (三)、Reward
* 原本有嘗試更改reward，變成只要存活就給reward，但這樣容易造成機率傾斜到固定一個動作的問題(e.g. 一直no flap，reward仍一直增加，造成no flap的機率大到接近1，讓flap的可能性變很低)，後來還是維持遊戲原本的reward計算方式。

In [11]:
#%%
def reward_trans(reward):
    if reward == 0:
        reward = args['reward_no_die'] 
    elif reward == 1:
        reward = args['reward_through']
    elif reward == -5:
        reward = args['reward_die']
    return reward 



## (四)、Training
* 可看到train到後面，reward常出現1000以上。
* 我們嘗試在train的過程中，對actor產出動作的機率設置threshold，希望讓model能多做一點探索，結果確實能加速model train得更好，但其實沒加也可達到同樣的結果，只是需要train的episode變多。

In [12]:
#%%
import time
class WorkerAgent(Thread):
    def __init__(self, env, global_actor, global_critic, max_episodes):
        Thread.__init__(self)
        self.lock = Lock()
        self.env = env
        self.state_dim = TA_state(self.env).shape[1]
        self.action_dim = len(self.env.getActionSet())

        self.max_episodes = max_episodes
        self.global_actor = global_actor
        self.global_critic = global_critic
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

        self.actor.model.set_weights(self.global_actor.model.get_weights())
        self.critic.model.set_weights(self.global_critic.model.get_weights())

    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value

        for k in reversed(range(0, len(rewards))):
            cumulative = args['gamma'] * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets

    def advatnage(self, td_targets, baselines):
        return td_targets - baselines

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def train(self):
        global CUR_EPISODE

        while self.max_episodes >= CUR_EPISODE:
            t1 = time.time()
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False
            self.env.reset_game()
            
            state = TA_state(self.env)
            total_loss =0
            step = 0
            while not done:
                probs = self.actor.model.predict(state)

#                 if CUR_EPISODE<1000:
#                     if 0.5-abs(probs[0][0]-0.5)<max(args['MIN_EXPLORING_RATE'], min(0.5, 0.99**((CUR_EPISODE)))):
#                         if probs[0][0]>0.5:
#                             probs[0][0] = 1-max(args['MIN_EXPLORING_RATE'], min(0.5, 0.99**((CUR_EPISODE))))
#                             probs[0][1] = max(args['MIN_EXPLORING_RATE'], min(0.5, 0.99**((CUR_EPISODE))))
#                         else:
#                             probs[0][0] = max(args['MIN_EXPLORING_RATE'], min(0.5, 0.99**((CUR_EPISODE))))
#                             probs[0][1] = 1-max(args['MIN_EXPLORING_RATE'], min(0.5, 0.99**((CUR_EPISODE))))
#                     else :
#                         probs[0][0] = probs[0][0]
#                         probs[0][1] = probs[0][1]
#                 else:
#                     probs[0][0] = probs[0][0]
#                     probs[0][1] = probs[0][1]
                action = np.random.choice(self.action_dim, p=probs[0])
                #print(probs[0])
                #action = np.random.choice(self.action_dim, p=[0.5,0.5])
                reward = reward_trans(self.env.act(self.env.getActionSet()[action])) 
                step+=1
                next_state = TA_state(self.env)
                done = self.env.game_over()

                action = np.reshape(action, [1, 1])
                reward = np.reshape(reward, [1, 1])

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if len(state_batch) >= args['update_interval'] or done:
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)

                    next_v_value = self.critic.model.predict(next_state)
                    td_targets = self.n_step_td_target(
                        rewards, next_v_value, done)
                    advantages = td_targets - self.critic.model.predict(states)
                    
                    with self.lock:
                        actor_loss = self.global_actor.train(
                            states, actions, advantages)
                        critic_loss = self.global_critic.train(
                            states, td_targets)

                        self.actor.model.set_weights(
                            self.global_actor.model.get_weights())
                        self.critic.model.set_weights(
                            self.global_critic.model.get_weights())
                    
                    total_loss+=actor_loss
                    total_loss+=critic_loss

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    td_target_batch = []
                    advatnage_batch = []

                episode_reward += reward[0][0]
                state = next_state
            if CUR_EPISODE % 1 == 0:
                t2 = time.time()
                print('EP{} EpisodeReward={} TotalLoss={} Step={} Time={}\n'.format(CUR_EPISODE, episode_reward,total_loss,step,t2-t1))
            if episode_reward >=100:
                modelactor = self.global_actor.model
                modelactor.save("/tf/Competition04/state_a3c_cpu=1_interval=4000/a3c_cpu=1_trainreward="+str(episode_reward)+"_CUR_EPISODE="+str(CUR_EPISODE))
            wandb.log({'Reward': episode_reward,'Total Loss':total_loss})
            CUR_EPISODE += 1

    def run(self):
        self.train()


In [None]:
#%%
env_name = 'Flappy_bird_v2'
agent = Agent(env_name)
agent.train(100000)
#%%

EP0 EpisodeReward=-5 TotalLoss=4.128190517425537 Step=38 Time=1.0922420024871826

EP1 EpisodeReward=-5 TotalLoss=2.52353835105896 Step=45 Time=0.9741809368133545

EP2 EpisodeReward=-5 TotalLoss=2.382324457168579 Step=45 Time=1.0742008686065674

EP3 EpisodeReward=-5 TotalLoss=1.723806381225586 Step=56 Time=1.157773494720459

EP4 EpisodeReward=-5 TotalLoss=1.69681715965271 Step=48 Time=0.9412837028503418

EP5 EpisodeReward=-5 TotalLoss=1.6354947090148926 Step=48 Time=1.0291063785552979

EP6 EpisodeReward=-5 TotalLoss=1.1407008171081543 Step=49 Time=1.0345277786254883

EP7 EpisodeReward=-5 TotalLoss=0.7829761505126953 Step=47 Time=0.945026159286499

EP8 EpisodeReward=-5 TotalLoss=0.9279395341873169 Step=48 Time=0.9406614303588867

EP9 EpisodeReward=-5 TotalLoss=0.7882435321807861 Step=62 Time=1.3607454299926758

EP10 EpisodeReward=-5 TotalLoss=0.5492180585861206 Step=53 Time=1.308349370956421

EP11 EpisodeReward=-5 TotalLoss=0.25456753373146057 Step=54 Time=1.0657684803009033

EP12 Episod

# (五)、Conclusion

* 本次比賽的模型雖然是A3C的架構，但整個training的過程，只用單核(A2C)進行training。
* 在training時要有耐心，episode在1000次以前不容易train起來，在1000次以後整體reward會提升許多。
* 即便不調整原先reward的設計，用A2C也能train得不錯。
* 最後模型在TA環境實測，一個episode的reward可以達150萬以上。
