I made a Q-learning starter notebook.

This is my first time trying reinforcement learning, so if you find a mistake, please let me know in the comments.
I would also be glad to hear your ideas, in the comments, for building a good state key.

In [None]:
from kaggle_environments import make

# Create the Kore Fleets environment (debug mode on) and print its name and version.
env = make("kore_fleets", debug=True)
print(env.name, env.version)

In [None]:
from kaggle_environments.envs.kore_fleets.helpers import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import random
import seaborn as sns
from tqdm import tqdm
import itertools
import pickle

The agent below is copied from [Kore Intro Part 3: Expanding the Empire!](https://www.kaggle.com/code/bovard/kore-intro-part-3-expanding-the-empire) and is used as the opponent during training.

In [None]:
%%writefile pilot.py
   
from kaggle_environments.envs.kore_fleets.helpers import *
from random import randint

# Build a four-leg flight plan.
def build_flight_plan(dir_idx, size):
    """Return a flight plan that visits four consecutive directions.

    Starting from direction index ``dir_idx``, each following leg uses the
    next direction index (modulo 4).  ``size`` is written after each of the
    first three direction characters; the last leg carries no number.
    """
    pieces = []
    for leg in range(4):
        pieces.append(Direction.from_index((dir_idx + leg) % 4).to_char())
        if leg != 3:
            pieces.append(str(size))
    return "".join(pieces)

def agent(obs, config):
    """Scripted pilot: launch 50 ships on a random loop, otherwise spawn one ship.

    For every owned shipyard: with at least 50 ships it launches a fleet of
    50 on a flight plan from ``build_flight_plan`` with a random starting
    direction and leg size; otherwise it spawns a single ship when the
    player's kore at the start of the turn covers the spawn cost.
    """
    board = Board(obs, config)
    me = board.current_player
    # Evaluated once: the kore total is not reduced while looping over shipyards.
    can_afford_spawn = me.kore >= board.configuration.spawn_cost

    for shipyard in me.shipyards:
        if shipyard.ship_count >= 50:
            plan = build_flight_plan(randint(0, 3), randint(2, 9))
            shipyard.next_action = ShipyardAction.launch_fleet_with_flight_plan(50, plan)
        elif can_afford_spawn:
            shipyard.next_action = ShipyardAction.spawn_ships(1)

    return me.next_actions

# Start Q-learning

I referred to this article, [kaggleで強化学習をやってみた](https://yukoishizaki.hatenablog.com/entry/2020/04/05/202935), when building the QTable and QLearningAgent.

In [None]:
# Build the list of action names a shipyard can choose from.
# Fleet sizes that launch-type actions may use.
num_ship_list = [3, 5, 8, 13, 21, 34, 55, 91]

# Name formats: spawn_<count>, straight_<length>_<ships>,
# cycle_<length>_<ships>_<direction index>, invade_<ships>, build_<ships>.
actions_list = [
    *(f'spawn_{count}' for count in range(1, 9)),
    *(f'straight_{reach}_{ships}'
      for reach in (2, 5, 9)
      for ships in num_ship_list),
    *(f'cycle_{reach}_{ships}_{heading}'
      for reach in (2, 5, 9)
      for ships in num_ship_list[4:]
      for heading in range(4)),
    *(f'invade_{ships}' for ships in num_ship_list[4:]),
    *(f'build_{ships}' for ships in num_ship_list[6:]),
]
actions_list

In [None]:
# Q table
class QTable():
    """Tabular Q-value store keyed by a coarse summary of the player's state.

    ``Q`` maps a state key (see ``get_state_key``) to a list holding one
    Q-value per entry of ``actions``, in the same order.
    """

    def __init__(self, actions, configuration=None):
        """Create an empty table.

        Args:
            actions: list of action names; fixes the length and order of
                every Q-value row.
            configuration: environment configuration handed to ``Board``.
                When omitted, the module-level ``env.configuration`` is used.
        """
        self.Q = {}
        self.actions = actions
        self.configuration = configuration

    def get_state_key(self, state):
        """Return the state key ``'<int(log(kore + 1))>_<int(ships / 2)>'``.

        ``ships`` sums ``value[1]`` over the player's raw shipyard entries
        (presumably the ship count held in each shipyard -- confirm against
        the observation format).
        """
        configuration = self.configuration
        if configuration is None:
            configuration = env.configuration
        me = Board(state, configuration).current_player

        ship_number = sum(value[1] for value in me._observation[1].values())
        return f'{int(np.log(me.kore+1))}_{int(ship_number / 2)}'

    def get_q_values(self, state):
        """Return the Q-value row for ``state``, creating a zero row if new."""
        state_key = self.get_state_key(state)
        return self.Q.setdefault(state_key, [0] * len(self.actions))

    def update(self, state, action_str, add_q):
        """Add ``add_q`` to the Q-value of ``action_str`` in ``state``.

        A state that has not been seen before starts from a zero row instead
        of raising ``KeyError``.
        """
        state_key = self.get_state_key(state)
        current = self.Q.setdefault(state_key, [0] * len(self.actions))
        self.Q[state_key] = [
            q + add_q if action == action_str else q
            for action, q in zip(self.actions, current)
        ]

In [None]:
# Helper of actions
def pos_to_index(pos, size=None):
    """Convert an ``(x, y)`` position to its index in the flat kore list.

    Uses the same layout as ``Point.to_index`` in the kaggle_environments
    helpers, ``(size - y - 1) * size + x``.  Coordinates are wrapped with
    modulo so positions any distance off the edge map back onto the board.

    Args:
        pos: indexable pair ``(x, y)``.
        size: board edge length.  When omitted it is read from the
            module-level ``env.configuration.size``.

    Returns:
        Integer index into ``observation['kore']``.
    """
    if size is None:
        size = env.configuration.size
    x = pos[0] % size
    y = pos[1] % size
    return (size - y - 1) * size + x

# Score of a straight path
def get_straight_score(pos, kore, length, direction):
    """Sum the kore on the ``length`` cells reached by stepping from ``pos``.

    The starting cell itself is not counted; each step moves one
    ``direction.to_point()`` offset and looks the cell up through
    ``pos_to_index``.
    """
    step = direction.to_point()
    cursor = pos
    total = 0
    for _ in range(length):
        cursor = cursor + step
        total += kore[pos_to_index(cursor)]
    return total

def get_max_straight_plan(pos, kore, length):
    """Return an out-and-back flight plan along the richest straight line.

    Each of the four directions is scored with ``get_straight_score``.  The
    plan is ``<dir><length><opposite dir>`` for the best-scoring direction;
    ties go to the lowest direction index.

    The scores are kept in direction-index order.  Sorting them before
    looking up the winner would always select index 0.
    """
    scores = [
        get_straight_score(pos, kore, length, Direction.from_index(i))
        for i in range(4)
    ]
    direction = Direction.from_index(scores.index(max(scores)))
    return direction.to_char() + str(length) + direction.opposite().to_char()


# Best rectangular loop
def get_max_cycle_plan(pos, kore, length, direct_i):
    """Return the flight plan of the highest-scoring rectangular loop.

    Candidate loops start in direction index ``direct_i`` and have four legs
    of ``(a, b, a, b)`` steps for every ``1 <= a, b < length``, turning
    either right or left after each leg.  A loop's score is the kore summed
    over every cell it steps on (looked up through ``pos_to_index``).  On
    ties the first candidate wins: right-turning loops are scored before
    left-turning ones.

    In the returned plan each of the first three legs is written as its
    direction character followed by ``steps - 1`` when that is positive;
    the final leg is the direction character only.
    """
    start_direction = Direction.from_index(direct_i)
    leg_options = [
        (a, b, a, b)
        for a, b in itertools.product(range(1, length), repeat=2)
    ]

    def turn(heading, turn_right):
        return heading.rotate_right() if turn_right else heading.rotate_left()

    def loop_score(legs, turn_right):
        cursor = pos
        heading = start_direction
        total = 0
        for steps in legs:
            for _ in range(steps):
                cursor = cursor + heading.to_point()
                total += kore[pos_to_index(cursor)]
            heading = turn(heading, turn_right)
        return total

    # Right-turning candidates first, then left-turning, each in leg_options order.
    scores = [
        loop_score(legs, turn_right)
        for turn_right in (True, False)
        for legs in leg_options
    ]

    best = scores.index(max(scores))
    turn_right = best < len(leg_options)
    best_legs = leg_options[best if turn_right else best - len(leg_options)]

    pieces = []
    heading = start_direction
    for leg_no, steps in enumerate(best_legs):
        pieces.append(heading.to_char())
        extra = steps - 1
        if leg_no != 3 and extra > 0:
            pieces.append(str(extra))
        heading = turn(heading, turn_right)
    return "".join(pieces)


def get_closest_enemy_shipyard(board, position, me):
    """Return the shipyard not owned by ``me`` that is nearest to ``position``.

    Distance is ``position.distance_to(shipyard.position, board size)``.
    The first shipyard found at the smallest distance wins; ``None`` is
    returned when no other player's shipyard is closer than 1000000.
    """
    size = board.configuration.size
    best_dist, best_yard = 1000000, None
    for candidate in board.shipyards.values():
        if candidate.player_id != me.id:
            gap = position.distance_to(candidate.position, size)
            if gap < best_dist:
                best_dist, best_yard = gap, candidate
    return best_yard

In [None]:
# Translate an action name into a ShipyardAction.
def shipyard_action(action_str, pos, kore, board=None, me=None):
    """Build the ``ShipyardAction`` described by ``action_str``.

    Supported names (fields separated by ``_``):
      * ``spawn_<n>``: spawn ``n`` ships.
      * ``straight_<length>_<ships>``: launch on ``get_max_straight_plan``.
      * ``cycle_<length>_<ships>_<dir>``: launch on ``get_max_cycle_plan``.
      * ``invade_<ships>``: launch towards the closest enemy shipyard
        (needs ``board`` and ``me``).
      * ``build_<ships>``: launch on a straight plan of length 6 whose return
        leg is replaced by a right turn followed by ``C``.

    Returns ``None`` for an unrecognised name, and for ``invade`` when there
    is no enemy shipyard or the target shares this shipyard's ``y``.
    """
    kind, *fields = action_str.split('_')

    if kind == 'spawn':
        return ShipyardAction.spawn_ships(int(fields[0]))

    if kind == 'straight':
        plan = get_max_straight_plan(pos, kore, int(fields[0]))
        return ShipyardAction.launch_fleet_with_flight_plan(int(fields[1]), plan)

    if kind == 'cycle':
        plan = get_max_cycle_plan(pos, kore, int(fields[0]), int(fields[2]))
        return ShipyardAction.launch_fleet_with_flight_plan(int(fields[1]), plan)

    if kind == 'invade':
        target = get_closest_enemy_shipyard(board, pos, me)
        if not target:
            return None
        enemy_pos = target.position
        vertical_gap = abs(enemy_pos.y - pos.y) - 1
        plan = (
            ("N" if enemy_pos.y > pos.y else "S")
            + str(vertical_gap)
            + ("W" if enemy_pos.x < pos.x else "E")
        )
        # A negative gap means the target is on the same row; no plan is issued.
        if vertical_gap < 0:
            return None
        if not all(c in "NESWC0123456789" for c in plan):
            return None
        return ShipyardAction.launch_fleet_with_flight_plan(int(fields[0]), plan)

    if kind == 'build':
        # Drop the return leg of the straight plan, then turn right and add 'C'.
        outbound = get_max_straight_plan(pos, kore, 6)[:-1]
        side = Direction.from_char(outbound[0]).rotate_right()
        plan = outbound + side.to_char() + 'C'
        return ShipyardAction.launch_fleet_with_flight_plan(int(fields[0]), plan)

    return None

In [None]:
# Fresh environment plus a trainer: the None entry is the seat driven by our own
# code through trainer.step(); the other three seats run the scripted pilot.py agent.
env = make("kore_fleets", debug=True)
trainer = env.train([None, "./pilot.py", "./pilot.py", "./pilot.py"])

class QLearningAgent():
    """Epsilon-greedy tabular Q-learning agent for Kore Fleets shipyards.

    All shipyards share a single ``QTable``.  ``reward_log`` receives the
    last reward of every training episode.
    """

    def __init__(self, env, epsilon=0.99):
        """Store the environment and start with an empty Q-table.

        Args:
            env: environment whose ``configuration`` and ``done`` flag are
                used during training.
            epsilon: initial probability of choosing a random action.
        """
        self.env = env
        self.actions = actions_list.copy()
        self.q_table = QTable(self.actions)
        self.epsilon = epsilon
        self.reward_log = []

    @staticmethod
    def _is_feasible(action, shipyard, kore_left, spawn_cost):
        """Return True when ``shipyard`` can currently perform ``action``."""
        kind, *fields = action.split('_')
        if kind == 'spawn':
            count = int(fields[0])
            # A spawn costing exactly the available kore is still affordable.
            return count * spawn_cost <= kore_left and count <= shipyard.max_spawn
        if kind in ('straight', 'cycle'):
            return int(fields[1]) <= shipyard.ship_count
        if kind in ('invade', 'build'):
            return int(fields[0]) <= shipyard.ship_count
        return True

    def policy(self, state, shipyard, kore_left, spawn_cost=10):
        """Choose an action name for ``shipyard`` with epsilon-greedy selection.

        Args:
            state: raw observation, used to look up Q-values.
            shipyard: shipyard the action is for (``ship_count`` and
                ``max_spawn`` are read).
            kore_left: kore available to the player.
            spawn_cost: kore cost of spawning one ship.

        Returns:
            An action name, or ``None`` when no action is feasible.
        """
        possible_actions = [
            action for action in self.actions
            if self._is_feasible(action, shipyard, kore_left, spawn_cost)
        ]
        if not possible_actions:
            return None
        # Epsilon-greedy: explore with probability epsilon.
        if np.random.random() < self.epsilon:
            return random.choice(possible_actions)
        # Otherwise take the feasible action with the highest Q-value.
        q_values = self.q_table.get_q_values(state)
        allowed = set(possible_actions)
        masked = [
            q if action in allowed else -1e7
            for action, q in zip(self.actions, q_values)
        ]
        return self.actions[int(np.argmax(masked))]

    def learn(self, trainer, episode_cnt=100, gamma=0.6,
              learn_rate=0.3, epsilon_decay_rate=0.99, min_epsilon=0.1):
        """Run ``episode_cnt`` training episodes and update the Q-table.

        Args:
            trainer: object returned by ``env.train``; provides ``reset()``
                and ``step(actions)``.
            episode_cnt: number of episodes to play.
            gamma: discount applied to the best Q-value of the next state.
            learn_rate: step size of each Q-value update.
            epsilon_decay_rate: factor applied to ``epsilon`` at the start
                of every episode.
            min_epsilon: lower bound for ``epsilon``.
        """
        for episode in tqdm(range(episode_cnt)):
            state = trainer.reset()
            self.epsilon = max(min_epsilon, self.epsilon * epsilon_decay_rate)
            reward = 0
            # Poll the environment this agent was given, not a module global.
            while not self.env.done:
                board = Board(state, self.env.configuration)
                me = board.current_player
                kore_left = me.kore
                kore = board.observation['kore']
                spawn_cost = board.configuration.spawn_cost

                # Only actions actually issued are recorded, so a shipyard
                # with no feasible action cannot shift the others' entries.
                chosen_actions = []
                for shipyard in me.shipyards:
                    action_str = self.policy(state, shipyard, kore_left, spawn_cost)
                    if action_str is None:
                        continue
                    chosen_actions.append(action_str)
                    shipyard.next_action = shipyard_action(
                        action_str, shipyard.position, kore, board, me)
                next_state, _, _, _ = trainer.step(me.next_actions)

                # Reward is computed from the board the actions were chosen on:
                # kore + 12 * value[2] of every raw fleet entry (presumably the
                # fleet's ship count) + 60 per shipyard.
                reward = kore_left
                for value in me._observation[2].values():
                    reward += value[2] * 12
                reward += len(me.shipyards) * 60
                gain = reward + gamma * max(self.q_table.get_q_values(next_state))

                # Move the Q-value of every issued action towards the target.
                for action_str in chosen_actions:
                    estimate = self.q_table.get_q_values(state)[self.actions.index(action_str)]
                    self.q_table.update(state, action_str, learn_rate * (gain - estimate))

                state = next_state
            self.reward_log.append(reward)

In [None]:
# Preview the epsilon schedule: 0.99 decayed once per episode, floored at 0.1.
episode_cnt = 100
epsilon_decay_rate = 0.995
x = list(range(episode_cnt))
y = [max(0.99 * epsilon_decay_rate ** step, 0.1) for step in x]
plt.plot(x, y)

In [None]:
# Training
qa = QLearningAgent(env)
qa.learn(trainer, episode_cnt=episode_cnt, epsilon_decay_rate=epsilon_decay_rate)
# qa.learn(trainer, episode_cnt=1000)

# Plot the reward recorded at the end of each episode.
sns.set(style='darkgrid')
# pd.DataFrame({'Average Reward': qa.reward_log}).rolling(10).mean().plot(figsize=(10,5))
pd.DataFrame({'Average Reward': qa.reward_log}).plot(figsize=(10,5))
plt.show()

In [None]:
# For every state with at least one non-zero Q-value, replace the Q-values by
# their ranks (0 = smallest), keeping one entry per action.
tmp_dict_q_table = qa.q_table.Q.copy()
dict_q_table = {
    state_key: np.argsort(q_row).argsort().tolist()
    for state_key, q_row in tmp_dict_q_table.items()
    if np.count_nonzero(q_row) > 0
}

In [None]:
# Save the rank table so it can be reloaded without retraining.
with open('dict.pkl','wb') as f:
    pickle.dump(dict_q_table, f)

In [None]:
# with open("../input/kore2022dict/dict.pkl", mode="rb") as f:
#     dict_q_table = pickle.load(f)

In [None]:
# Source code of the submission agent.  The trained rank table is embedded as
# a dict literal inside agent(); everything else is a self-contained copy of
# the helpers, because the submission file cannot import from this notebook.
my_agent = '''
from kaggle_environments.envs.kore_fleets.helpers import *
import numpy as np
import itertools
import random


# Helper
def board_size(kore):
    # The kore observation is a flat list holding one value per board cell.
    return int(round(len(kore) ** 0.5))


def pos_to_index(pos, size):
    # Same layout as helpers.Point.to_index; modulo wraps around the edges.
    x = pos[0] % size
    y = pos[1] % size
    return (size - y - 1) * size + x


# Get straight score
def get_straight_score(pos, kore, length, direction):
    size = board_size(kore)
    score = 0
    for i in range(length):
        pos += direction.to_point()
        score += kore[pos_to_index(pos, size)]
    return score


def get_max_straight_plan(pos, kore, length):
    scores = []
    for i in range(4):
        direction = Direction.from_index(i)
        scores.append(get_straight_score(pos, kore, length, direction))
    # Scores stay in direction-index order so the winner maps to its direction.
    direction_index = scores.index(max(scores))

    direction = Direction.from_index(direction_index)
    flight_plan = direction.to_char()
    flight_plan += str(length)
    flight_plan += direction.opposite().to_char()
    return flight_plan


# Get maximum cycle
def get_max_cycle_plan(pos, kore, length, direct_i):
    size = board_size(kore)
    direction = Direction.from_index(direct_i)
    each_lengths = [i for i in range(1, length)]
    each_lengths = [i for i in itertools.product(each_lengths, repeat=2)]
    each_lengths = [i + tuple([i[0], i[1]]) for i in each_lengths]

    # Get max score
    scores = []
    for rotate_i in range(2):
        for each_length in each_lengths:
            each_pos = pos
            each_direction = direction
            score = 0
            for leg_i in range(4):
                for i in range(each_length[leg_i]):
                    each_pos += each_direction.to_point()
                    score += kore[pos_to_index(each_pos, size)]
                if rotate_i == 0:
                    each_direction = each_direction.rotate_right()
                else:
                    each_direction = each_direction.rotate_left()
            scores.append(score)

    # Get flight plan
    rotate_right = True
    best_index = scores.index(max(scores))
    if best_index >= len(each_lengths):
        best_index -= len(each_lengths)
        rotate_right = False

    best_each_length = each_lengths[best_index]
    each_direction = direction
    flight_plan = ""
    for i in range(4):
        flight_plan += each_direction.to_char()

        extra = best_each_length[i] - 1
        if i != 3 and extra > 0:
            flight_plan += str(extra)
        if rotate_right:
            each_direction = each_direction.rotate_right()
        else:
            each_direction = each_direction.rotate_left()
    return flight_plan


def get_closest_enemy_shipyard(board, position, me):
    min_dist = 1000000
    enemy_shipyard = None
    for shipyard in board.shipyards.values():
        if shipyard.player_id == me.id:
            continue
        dist = position.distance_to(shipyard.position, board.configuration.size)
        if dist < min_dist:
            min_dist = dist
            enemy_shipyard = shipyard
    return enemy_shipyard


def shipyard_action(action_str, pos, kore, board, me):
    inst_list = action_str.split('_')
    if inst_list[0] == 'spawn':
        return ShipyardAction.spawn_ships(int(inst_list[1]))

    elif inst_list[0] == 'straight':
        flight_plan = get_max_straight_plan(pos, kore, int(inst_list[1]))
        return ShipyardAction.launch_fleet_with_flight_plan(int(inst_list[2]), flight_plan)

    elif inst_list[0] == 'cycle':
        flight_plan = get_max_cycle_plan(pos, kore, int(inst_list[1]), int(inst_list[3]))
        return ShipyardAction.launch_fleet_with_flight_plan(int(inst_list[2]), flight_plan)

    elif inst_list[0] == 'invade':
        closest_enemy_shipyard = get_closest_enemy_shipyard(board, pos, me)
        if not closest_enemy_shipyard:
            return None
        enemy_pos = closest_enemy_shipyard.position
        my_pos = pos
        flight_plan = "N" if enemy_pos.y > my_pos.y else "S"
        flight_plan += str(abs(enemy_pos.y - my_pos.y) - 1)
        flight_plan += "W" if enemy_pos.x < my_pos.x else "E"
        if (abs(enemy_pos.y - my_pos.y) - 1) < 0:
            return None
        if not all([c in "NESWC0123456789" for c in flight_plan]):
            return None

        return ShipyardAction.launch_fleet_with_flight_plan(int(inst_list[1]), flight_plan)

    elif inst_list[0] == 'build':
        length = 6
        flight_plan = get_max_straight_plan(pos, kore, length)
        flight_plan = flight_plan[:-1]
        direction = Direction.from_char(flight_plan[0])
        direction = direction.rotate_right()
        flight_plan += direction.to_char()
        flight_plan += 'C'
        return ShipyardAction.launch_fleet_with_flight_plan(int(inst_list[1]), flight_plan)


num_ship_list = [3,5,8,13,21,34,55,91]

actions_list = [f'spawn_{i}' for i in range(1, 9)]
actions_list += [f'straight_{i}_{s}' for i in [2, 5, 9] for s in num_ship_list]
actions_list += [f'cycle_{i}_{s}_{d}' for i in [2, 5, 9] for s in num_ship_list[4:] for d in range(4) ]
actions_list += [f'invade_{s}' for s in num_ship_list[4:]]
actions_list += [f'build_{s}' for s in num_ship_list[6:]]


def is_possible(action, shipyard, kore_left, spawn_cost):
    inst_list = action.split('_')
    if inst_list[0] == 'spawn':
        return int(inst_list[1]) * spawn_cost <= kore_left and int(inst_list[1]) <= shipyard.max_spawn
    if inst_list[0] == 'straight' or inst_list[0] == 'cycle':
        return int(inst_list[2]) <= shipyard.ship_count
    if inst_list[0] == 'invade' or inst_list[0] == 'build':
        return int(inst_list[1]) <= shipyard.ship_count
    return True


def agent(obs, config):
    q_table = ''' \
    + str(dict_q_table).replace(' ', '') \
    + '''

    board = Board(obs, config)
    me = board.current_player
    kore_left = me.kore
    spawn_cost = board.configuration.spawn_cost
    kore = board.observation['kore']

    # The key must be built exactly as during training:
    # int(log(kore + 1)) and int(ships held in shipyards / 2).
    ship_number = 0
    for value in me._observation[1].values():
        ship_number += value[1]
    state_key = f'{int(np.log(kore_left+1))}_{int(ship_number / 2)}'

    # loop through all shipyards you control
    for shipyard in me.shipyards:
        possible_actions = [
            action for action in actions_list
            if is_possible(action, shipyard, kore_left, spawn_cost)
        ]
        if len(possible_actions) == 0:
            continue

        if state_key in q_table:
            # Pick the feasible action with the highest stored rank.
            q_values = q_table[state_key]
            selected_items = [q if actions_list[idx] in possible_actions else -100 for idx, q in enumerate(q_values)]
            action_str = actions_list[int(np.argmax(selected_items))]
        else:
            # State never seen in training: fall back to a random feasible action.
            action_str = random.choice(possible_actions)
        shipyard.next_action = shipyard_action(action_str, shipyard.position, kore, board, me)
    return me.next_actions
'''

with open('submission.py', 'w') as f:
    f.write(my_agent)

In [None]:
# Play one episode with the generated submission against three pilot agents,
# then render the result in the notebook.
env.run(["./submission.py", "./pilot.py", "./pilot.py", "./pilot.py"])
env.render(mode="ipython", width=1000, height=800)

I made a Q-learning starter notebook, but I couldn't train this agent well.
I trained it for 700 episodes over 11 hours against my best model, which doesn't include combat actions. However, the resulting agent's public score was only 874.7.

<br />

There are several hypothetical reasons why it did not work.
1. The way the state is created is wrong.
2. I need to train for many more episodes, such as 10,000.
3. I need to let the agent observe more of the board, for example by using deep Q-learning.
4. In the end, the rule-based approach is simply stronger.