* Learn Q function by DQN with Ape-X (3 explorers + 1 learner)
  * Encode board
    * The player's head is always placed at the center cell
    * The player's length is stored at the center cell
    * Cells around enemy heads are marked as RISK
    * Geese tails are marked as RISK (when a goose eats food, its tail stays where it is)
  * Each explorer has 2 NN actors and 2 rule based greedy actors
* Rule based Safe Guard to avoid Body Hit and Collision
* Submit multiple files (code + model file) under this instruction https://www.kaggle.com/c/google-football/discussion/191257

In [None]:
!pip install -U kaggle_environments cpprb

In [None]:
import gc
from multiprocessing import set_start_method, cpu_count, Process, Event, SimpleQueue
import time

import numpy as np
import tensorflow as tf
import cpprb # Replay Buffer Library: https://ymd_h.gitlab.io/cpprb/
from tqdm.notebook import tqdm

from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col
from kaggle_environments import make

# %load_ext tensorboard
# %tensorboard --logdir logs

In [None]:
# Global config
#RIGHT = 0
#GO = 1
#LEFT = 2

# Cell values written into the encoded board
GOOSE = -1.0
RISK = GOOSE/2 # Half of GOOSE (= -0.5)
NONE = 0.0
FOOD = 1.0

# Number of actions (= outputs of the Q-network)
act_shape = 4

# Board size in cells
WIDTH, HEIGHT = 11, 7

# Cell where the player's head is placed by encode_board.
# With WIDTH=11 / HEIGHT=7 this is (6, 4).
xc, yc = WIDTH//2 + 1, HEIGHT//2 + 1

# The four neighbours of (xc, yc), as (x, y) board indices
EAST_idx  = (xc+1,yc  )
NORTH_idx = (xc  ,yc-1)
WEST_idx  = (xc-1,yc  )
SOUTH_idx = (xc  ,yc+1)

# Same neighbours as a fancy index ([xs], [ys]); order matches the action codes below
AROUND = tuple(list(axis) for axis in zip(EAST_idx, NORTH_idx, WEST_idx, SOUTH_idx))

# Action code <-> direction name
code2dir = dict(enumerate(('EAST', 'NORTH', 'WEST', 'SOUTH')))

dir2code = {name: code for code, name in code2dir.items()}

In [None]:
def create_model():
    """
    Build the Q-network.

    Input is a flattened board of WIDTH*HEIGHT values; the network has three
    hidden ReLU layers of 100 units and a linear output with one value per
    action (``act_shape``).
    """
    Dense = tf.keras.layers.Dense

    stack = [Dense(100,activation="relu",input_shape=(WIDTH*HEIGHT,))]
    stack += [Dense(100,activation="relu") for _ in range(2)]
    stack.append(Dense(act_shape))

    return tf.keras.Sequential(stack)

In [None]:
def Q_func(model,obs,act):
    """
    Q(s, a) of the given actions.

    ``model(obs)`` is masked with a one-hot encoding of ``act`` (depth
    ``act_shape``) and summed over axis 1, leaving one value per row.
    """
    q_all = model(obs)
    picked = tf.one_hot(act,depth=act_shape)
    return tf.reduce_sum(q_all * picked, axis=1)

def Q1_func(model,next_obs,rew,done):
    """
    One-step TD target: ``rew + gamma * max_a Q(next_obs, a) * (1 - done)``.

    The discount factor is fixed to 0.99; ``done`` switches off the
    bootstrap term for terminal transitions.
    """
    gamma = 0.99
    best_next = tf.reduce_max(model(next_obs),axis=1)
    alive = 1.0-done
    return gamma*best_next*alive + rew

#@tf.function
def train_then_absTD(model,target,obs,act,rew,next_obs,done,weights):
    """
    Run one gradient step on ``model`` and return the new absolute TD errors.

    The loss is the mean of ``weights * (Q(obs, act) - TD target)^2`` where the
    TD target is computed with ``target``. Gradients are applied with the
    module-level optimizer ``opt``. After the update, Q is re-evaluated and
    ``|Q_new - TD target|`` (one value per row) is returned.
    """
    params = model.trainable_weights

    with tf.GradientTape() as tape:
        tape.watch(params)
        q_taken = Q_func(model,obs,act)
        td_target = Q1_func(target,next_obs,rew,done)
        loss = tf.reduce_mean(tf.square(q_taken - td_target) * weights)

    grads = tape.gradient(loss,params)
    opt.apply_gradients(zip(grads,params))

    # Re-evaluate with the updated weights against the same (already computed) target
    return tf.abs(Q_func(model,obs,act) - td_target)

#@tf.function
def abs_TD(model,target,obs,act,rew,next_obs,done):
    """
    Absolute TD error ``|Q(obs, act) - TD target|`` without any training.

    Q comes from ``model`` and the TD target from ``target``.
    """
    q_taken = Q_func(model,obs,act)
    td_target = Q1_func(target,next_obs,rew,done)
    td_error = q_taken - td_target
    return tf.abs(td_error)

In [None]:
def pos(index):
    """Split a flat cell index into ``(x, y)`` = (index % WIDTH, index // WIDTH)."""
    y, x = divmod(index, WIDTH)
    return x, y

def centering(z,dz,Z):
    """
    Shift coordinate ``z`` by ``dz`` on a ring of ``Z`` cells.

    The result is always wrapped into ``[0, Z)``.

    The previous implementation tested ``Z >= Z`` (always true) instead of
    ``z >= Z``, so in-range results came back as ``z - Z`` (negative). That
    only worked because numpy treats negative indices as wrapping; the
    selected cell is the same, but the returned coordinate is now a proper
    non-negative index.
    """
    return (z + dz) % Z
    

def encode_board(obs,act="NORTH",idx=0):
    """
    Encode an observation as a (WIDTH, HEIGHT) board seen from goose ``idx``.

    Every cell is shifted with wrap-around so that the player's head lands on
    ``(xc, yc)``. Values are written in this order (later writes win):

    * ``GOOSE`` for every goose segment except the tail, ``RISK`` for tails
    * ``FOOD`` for food
    * ``RISK`` is *added* to the four neighbours of each enemy head
    * the player's length at ``(xc, yc)``
    * ``GOOSE`` on the neighbour opposite to ``act`` (moving back is a body hit)

    obs: mapping with "geese" (lists of flat cell indices, head first) and "food"
    act: the player's last action ("EAST", "NORTH", "WEST" or "SOUTH")
    idx: index of the player in ``obs["geese"]``

    Returns the board; it is all zeros when the player goose is empty.
    Raises ValueError when ``act`` is not one of the four directions
    (previously a bare ``raise`` produced an uninformative RuntimeError).
    """
    board = np.zeros((WIDTH,HEIGHT))

    geese = obs["geese"]
    if len(geese[idx]) == 0:
        return board

    x0, y0 = pos(geese[idx][0])
    dx = xc - x0
    dy = yc - y0

    def shifted(cell):
        # Flat index -> board index in the player-centered frame
        x, y = pos(cell)
        return centering(x,dx,WIDTH), centering(y,dy,HEIGHT)

    for goose in geese:
        if len(goose) == 0:
            continue

        for g in goose[:-1]:
            board[shifted(g)] = GOOSE

        # Tail as Risk
        board[shifted(goose[-1])] = RISK

    for food in obs["food"]:
        board[shifted(food)] = FOOD

    # Set RISK for around enemy geese head
    # NOTE(review): the head position is used in absolute coordinates (no
    # centering, no wrap-around) although the board is player-centered. This
    # looks unintended, but it is kept as is because the submission-side
    # encoder has to produce the very same encoding the model is trained on.
    for i, goose in enumerate(geese):
        if (i == idx) or (len(goose) == 0):
            continue
        x, y = pos(goose[0])
        if (y < HEIGHT-1) and (board[x,y+1] != GOOSE):
            board[x,y+1] += RISK
        if (y > 0) and (board[x,y-1] != GOOSE):
            board[x,y-1] += RISK
        if (x < WIDTH-1) and (board[x+1,y] != GOOSE):
            board[x+1,y] += RISK
        if (x > 0) and (board[x-1,y] != GOOSE):
            board[x-1,y] += RISK

    board[xc,yc] = len(geese[idx]) # self length

    # Avoid Body Hit: add pseudo GOOSE on the cell we just came from
    behind = {"EAST": WEST_idx,
              "NORTH": SOUTH_idx,
              "WEST": EAST_idx,
              "SOUTH": NORTH_idx}
    if act not in behind:
        raise ValueError("Unknown action: {!r}".format(act))
    board[behind[act]] = GOOSE

    return board

In [None]:
def get_obs_action(model,states,idx=0, train=False):
    """
    Pick the next action of goose ``idx`` with the Q-network.

    Returns ``(board, action_name)``. For a goose that is not "ACTIVE" the
    board is ``None`` and its previous action is returned unchanged.

    * ``train=True``: epsilon-greedy (10% uniformly random action, otherwise
      the argmax of the Q-values) without any safe guard.
    * ``train=False``: the best Q-value among the neighbours that are not
      ``GOOSE`` on the encoded board; action 0 when none qualifies.
    """
    me = states[idx]
    act = me["action"]

    if me["status"] != "ACTIVE":
        return None, act

    board = encode_board(states[0]["observation"],act=act,idx=idx)

    if train:
        # e-greedy
        if np.random.random() < 0.1:
            return board, code2dir[np.random.randint(4)]
        greedy = int(tf.math.argmax(tf.squeeze(model(board.reshape(1,-1)))))
        return board, code2dir[greedy]

    q_values = tf.squeeze(model(board.reshape(1,-1))).numpy()
    free = (board[AROUND] != GOOSE)

    # Safe guard: first strictly-best Q among the free neighbours
    best_code, best_q = 0, -99999
    for code, (q_val, is_free) in enumerate(zip(q_values,free)):
        if is_free and (q_val > best_q):
            best_code, best_q = code, q_val

    return board, code2dir[best_code]

In [None]:
def get_obs_action_greedy(states,idx=0):
    """
    Rule based actor: head for the nearest food.

    Returns ``(board, action_name)``. A goose that is not "ACTIVE" gets
    ``(None, previous action)``; with no body or no food on the board the
    previous action is kept.

    "Nearest" is the Manhattan distance in absolute board coordinates, plus 2
    when the food lies straight along an axis and the first cell in that
    direction is ``GOOSE`` on the encoded board. The goose first closes the
    x gap (EAST/WEST), then the y gap (SOUTH/NORTH).
    """
    me = states[idx]
    act = me["action"]

    if me["status"] != "ACTIVE":
        return None, act

    obs = states[0]["observation"]
    board = encode_board(obs,act=act,idx=idx)

    foods = obs["food"]
    if len(obs["geese"][idx]) == 0 or len(foods) == 0:
        return board, act

    x0, y0 = pos(obs["geese"][idx][0])
    blocked = (board[AROUND] == GOOSE)

    def detour(dx, dy):
        # Extra steps needed when the straight line to the food starts on a GOOSE cell
        extra = 0
        if dx == 0:
            if ((dy > 0) and blocked[dir2code["SOUTH"]]) or ((dy < 0) and blocked[dir2code["NORTH"]]):
                extra += 2
        if dy == 0:
            if ((dx > 0) and blocked[dir2code["EAST"]]) or ((dx < 0) and blocked[dir2code["WEST"]]):
                extra += 2
        return extra

    best_len = WIDTH + HEIGHT
    best_i = 0
    for i, food in enumerate(foods):
        fx, fy = pos(food)
        dx, dy = fx - x0, fy - y0
        length = abs(dx) + abs(dy) + detour(dx, dy)
        if length < best_len:
            best_len, best_i = length, i

    tx, ty = pos(foods[best_i])

    if tx != x0:
        return board, ("EAST" if tx > x0 else "WEST")

    if ty != y0:
        return board, ("SOUTH" if ty > y0 else "NORTH")

    return board, act

In [None]:
def create_buffer(buffer_size,env_dict,alpha):
    """Create the shared multi-process prioritized replay buffer (cpprb) with priority exponent ``alpha``."""
    shared_rb = cpprb.MPPrioritizedReplayBuffer(buffer_size,env_dict,alpha=alpha)
    return shared_rb

In [None]:
def explorer(global_rb,env_dict,is_training_done,queue):
    """
    Explorer process body: play games and feed transitions to ``global_rb``.

    global_rb: shared replay buffer; transitions are added together with
        their initial priorities
    env_dict: cpprb field specification, also used for the local buffer
    is_training_done: event set by the learner; the loop stops once it is set
    queue: queue on which the learner sends ``(model weights, target weights)``

    Geese 0 and 1 act with the Q-network (epsilon-greedy), geese 2 and 3 with
    the rule based greedy actor. Runs until ``is_training_done`` is set.
    """
    # Flush to the global buffer once this many transitions are stored locally.
    # The local capacity is 4 larger - presumably because one step can add up
    # to 4 transitions (one per goose) past the threshold.
    local_buffer_size = int(1e+2)
    local_rb = cpprb.ReplayBuffer(local_buffer_size+4,env_dict)

    # Local copies of the networks; their weights are replaced whenever the
    # learner sends new ones through the queue.
    model = create_model()
    target = tf.keras.models.clone_model(model)
    env = make("hungry_geese", debug=False)

    states = env.reset(4)
    while not is_training_done.is_set():
        # Pick up new weights when the learner has sent some
        if not queue.empty():
            w,wt = queue.get()
            model.set_weights(w)
            target.set_weights(wt)

        # (board, action) per goose: i < 2 -> NN actor, otherwise greedy actor
        board_act = [get_obs_action(model,states,i,train=True) if i < 2 else get_obs_action_greedy(states,i)
                     for i in range(4)]

        states = env.step([a for b,a in board_act])

        for i, (b, a) in enumerate(board_act):
            # b is None for geese that were not ACTIVE before this step
            if b is None:
                continue

            local_rb.add(obs=b.ravel(),
                         act=dir2code[a],
                         next_obs=encode_board(states[0]["observation"],act=a,idx=i).ravel(),
                         rew=states[i]["reward"],
                         done=(states[i]["status"] != "ACTIVE"))

        # Start a new game once no goose is ACTIVE any more
        if all(s["status"] != "ACTIVE" for s in states):
            states = env.reset(4)
            local_rb.on_episode_end()

        # Move the local transitions to the global buffer, using the absolute
        # TD error from the local network copies as initial priorities
        if local_rb.get_stored_size() >= local_buffer_size:
            sample = local_rb.get_all_transitions()
            global_rb.add(**sample,
                          priorities=abs_TD(model,target,
                                            tf.constant(sample["obs"]),
                                            tf.constant(sample["act"].ravel()),
                                            tf.constant(sample["rew"].ravel()),
                                            tf.constant(sample["next_obs"]),
                                            tf.constant(sample["done"].ravel())))
            local_rb.clear()

In [None]:
%%time

# Training
# The learner waits until n_warming transitions are stored, then runs
# n_train_step gradient steps on mini-batches of batch_size.
n_warming = 100
n_train_step = int(1e+4)
batch_size = 64

writer = tf.summary.create_file_writer("./logs")

# Replay Buffer 
# NOTE: 10e+5 is the float 1000000.0 (i.e. 1e6)
buffer_size = 10e+5
env_dict = {"obs": {"shape": (WIDTH*HEIGHT)},
            "act": {"dtype": int},
            "next_obs": {"shape": (WIDTH*HEIGHT)},
            "rew": {},
            "done": {}}
alpha = 0.5
rb = create_buffer(buffer_size, env_dict,alpha)

# Model
# The target network is synchronised with the model every target_update steps
target_update = 50


model = create_model()
target = tf.keras.models.clone_model(model)

# Read as a global by train_then_absTD
opt = tf.keras.optimizers.Adam()

# Ape-X
# Weights are sent to the explorers every explorer_update_freq steps.
# NOTE(review): with a single CPU n_explorer is 0, no explorer fills the
# buffer and the warm-up loop below never finishes.
explorer_update_freq = 100
n_explorer = cpu_count() - 1


is_training_done = Event()
is_training_done.clear()

# One weight queue and one process per explorer
qs = [SimpleQueue() for _ in range(n_explorer)]
ps = [Process(target=explorer,
              args=[rb,env_dict,is_training_done,q])
      for q in qs]

for p in ps:
    p.start()

# Block until the explorers have stored enough transitions
print("warm-up")
while rb.get_stored_size() < n_warming:
    time.sleep(1)


print("training")

epoch = 0
for i in tqdm(range(n_train_step)):        
    # Prioritized sample; "weights" and "indexes" come back with the transitions
    sample = rb.sample(batch_size,beta=0.4)
    
    absTD = train_then_absTD(model,target,
                             tf.constant(sample["obs"]),
                             tf.constant(sample["act"].ravel()),
                             tf.constant(sample["rew"].ravel()),
                             tf.constant(sample["next_obs"]),
                             tf.constant(sample["done"].ravel()),
                             tf.constant(sample["weights"].ravel()))
    # Feed the post-update TD errors back as the new priorities
    rb.update_priorities(sample["indexes"],absTD)
        
    if i % target_update == 0:
        target.set_weights(model.get_weights())
        
    if i % explorer_update_freq == 0:
        w = model.get_weights()
        wt = target.get_weights()
        for q in qs:
            q.put((w,wt))

    
# Tell the explorers to leave their loop
is_training_done.set()

# Save the trained model where sub/main.py looks for it
!mkdir -p sub
model.save("sub/model")

for p in ps:
    p.join()

In [None]:
# Play 4 games in which all four geese use the trained network
# (train=False, i.e. with the rule based safe guard) and render each game.
test_env = make("hungry_geese", debug=True)

for _ in range(4):
    states = test_env.reset(4)
    # Keep stepping until no goose is ACTIVE any more
    while not all(s["status"] != "ACTIVE" for s in states):
        board_act = [get_obs_action(model,states,i) for i in range(4)]
        #board_act = [get_obs_action_greedy(states,i) for i in range(4)]
        states = test_env.step([pair[1] for pair in board_act])

    test_env.render(mode='ipython')

In [None]:
%%writefile sub/main.py

import sys
import os

# Agent directory (presumably where Kaggle unpacks the submission - confirm);
# it is appended to the import path and searched for the model.
working_dir = "/kaggle_simulations/agent"
sys.path.append(working_dir)

# Prefer the model saved next to the notebook, then the one in the agent directory
model_f = "sub/model"
if not os.path.exists(model_f):
    model_f = os.path.join(working_dir,"model")
    if not os.path.exists(model_f):
        raise ValueError("No model file")

print(model_f)


import numpy as np
import tensorflow as tf

from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col

# Cell values written into the encoded board
GOOSE = -1.0
RISK = GOOSE/2 # Half of GOOSE (= -0.5)
NONE = 0.0
FOOD = 1.0

# Number of actions (= outputs of the Q-network)
act_shape = 4

# Board size in cells
WIDTH, HEIGHT = 11, 7

# Cell where the player's head is placed by encode_board: (6, 4) for an 11x7 board
xc, yc = WIDTH//2 + 1, HEIGHT//2 + 1

# The four neighbours of (xc, yc), as (x, y) board indices
EAST_idx  = (xc+1,yc  )
NORTH_idx = (xc  ,yc-1)
WEST_idx  = (xc-1,yc  )
SOUTH_idx = (xc  ,yc+1)


# Same neighbours as a fancy index ([xs], [ys]); order matches the action codes below
AROUND = tuple(list(axis) for axis in zip(EAST_idx, NORTH_idx, WEST_idx, SOUTH_idx))


# Action code <-> direction name
code2dir = dict(enumerate(('EAST', 'NORTH', 'WEST', 'SOUTH')))
dir2code = {name: code for code, name in code2dir.items()}


# Load the trained Q-network once at import time; get_action evaluates it on every call
policy = tf.keras.models.load_model(model_f)
# Last action returned by get_action; encode_board marks the cell opposite to it as GOOSE
LAST_ACT = "NORTH"

def pos(index):
    """Split a flat cell index into ``(x, y)`` = (index % WIDTH, index // WIDTH)."""
    y, x = divmod(index, WIDTH)
    return x, y

def centering(z,dz,Z):
    """
    Shift coordinate ``z`` by ``dz`` on a ring of ``Z`` cells.

    The result is always wrapped into ``[0, Z)``.

    The previous implementation tested ``Z >= Z`` (always true) instead of
    ``z >= Z``, so in-range results came back as ``z - Z`` (negative). That
    only worked because numpy treats negative indices as wrapping; the
    selected cell is the same, but the returned coordinate is now a proper
    non-negative index.
    """
    return (z + dz) % Z
    

def encode_board(obs,idx=0):
    """
    Encode an observation as a (WIDTH, HEIGHT) board seen from goose ``idx``.

    Every cell is shifted with wrap-around so that the player's head lands on
    ``(xc, yc)``. Values are written in this order (later writes win):

    * ``GOOSE`` for every goose segment except the tail, ``RISK`` for tails
    * ``FOOD`` for food
    * ``RISK`` is *added* to the four neighbours of each enemy head
    * the player's length at ``(xc, yc)``
    * ``GOOSE`` on the neighbour opposite to the module-level ``LAST_ACT``
      (moving back is a body hit)

    obs: mapping with "geese" (lists of flat cell indices, head first) and "food"
    idx: index of the player in ``obs["geese"]``

    Returns the board; it is all zeros when the player goose is empty.
    Raises ValueError when ``LAST_ACT`` is not one of the four directions
    (previously a bare ``raise`` produced an uninformative RuntimeError).
    """
    # Only read here, so no ``global`` declaration is needed
    act = LAST_ACT

    board = np.zeros((WIDTH,HEIGHT))

    geese = obs["geese"]
    if len(geese[idx]) == 0:
        return board

    x0, y0 = pos(geese[idx][0])
    dx = xc - x0
    dy = yc - y0

    def shifted(cell):
        # Flat index -> board index in the player-centered frame
        x, y = pos(cell)
        return centering(x,dx,WIDTH), centering(y,dy,HEIGHT)

    for goose in geese:
        if len(goose) == 0:
            continue

        for g in goose[:-1]:
            board[shifted(g)] = GOOSE

        # Tail as Risk
        board[shifted(goose[-1])] = RISK

    for food in obs["food"]:
        board[shifted(food)] = FOOD

    # Set RISK for around enemy geese head
    # NOTE(review): the head position is used in absolute coordinates (no
    # centering, no wrap-around) although the board is player-centered. This
    # looks unintended, but it is kept as is because this encoder has to
    # produce the very same encoding the loaded model was trained on.
    for i, goose in enumerate(geese):
        if (i == idx) or (len(goose) == 0):
            continue
        x, y = pos(goose[0])
        if (y < HEIGHT-1) and (board[x,y+1] != GOOSE):
            board[x,y+1] += RISK
        if (y > 0) and (board[x,y-1] != GOOSE):
            board[x,y-1] += RISK
        if (x < WIDTH-1) and (board[x+1,y] != GOOSE):
            board[x+1,y] += RISK
        if (x > 0) and (board[x-1,y] != GOOSE):
            board[x-1,y] += RISK

    board[xc,yc] = len(geese[idx]) # self length

    # Avoid Body Hit: add pseudo GOOSE on the cell we just came from
    behind = {"EAST": WEST_idx,
              "NORTH": SOUTH_idx,
              "WEST": EAST_idx,
              "SOUTH": NORTH_idx}
    if act not in behind:
        raise ValueError("Unknown action: {!r}".format(act))
    board[behind[act]] = GOOSE

    return board


def get_action(obs_dict,config_dict):
    """
    Agent function: return the next direction name for this goose.

    The board is encoded from ``obs_dict`` for the goose at
    ``Observation(obs_dict).index`` and evaluated with ``policy``. The action
    with the best Q-value among the neighbours that are not ``GOOSE`` is
    chosen (action 0 when none qualifies) and remembered in ``LAST_ACT``.
    ``config_dict`` is not used.
    """
    global LAST_ACT

    idx = Observation(obs_dict).index
    board = encode_board(obs_dict,idx)

    q_values = tf.squeeze(policy(board.reshape(1,-1))).numpy()
    free = (board[AROUND] != GOOSE)

    # Safe guard: first strictly-best Q among the free neighbours
    best_code, best_q = 0, -99999
    for code, (q_val, is_free) in enumerate(zip(q_values,free)):
        if is_free and (q_val > best_q):
            best_code, best_q = code, q_val

    LAST_ACT = code2dir[best_code]

    return LAST_ACT

In [None]:
# Test with self: all four geese are played by the submission file

test_env.run(["sub/main.py"] * 4)
test_env.render(mode='ipython')

In [None]:
# Test with sample agent: two submission geese against two sample agents

test_env.run(["sub/main.py"] * 2 + ["../input/hungry-geese/agent.py"] * 2)
test_env.render(mode='ipython')

In [None]:
import tarfile
import os.path

def make_tarfile(output_filename, source_dir):
    """
    Pack ``source_dir`` recursively into the gzip-compressed tar ``output_filename``.

    Inside the archive the directory is stored under
    ``os.path.basename(source_dir)``. For a path with a trailing slash that
    name is empty, so the directory contents end up at the archive root.
    """
    with tarfile.open(output_filename, "w:gz") as archive:
        top_level = os.path.basename(source_dir)
        archive.add(source_dir, arcname=top_level)

# NOTE(review): the trailing slash is significant. os.path.basename('./sub/') is '',
# so the archive name passed to tar.add is empty and the contents of sub/ appear to be
# stored at the archive root instead of under "sub/" - confirm before normalizing the path.
make_tarfile('submission.tar.gz', './sub/')