# Version History

This is a kernel that shows how to run reinforcement learning algorithms in Halite IV. A colleague of mine asked me to provide something so he could get started, so I decided to make it publicly available. I will see how far I can push this kernel in terms of usefulness. Currently it is really just a toy example and is meant to be minimal.

## Version 1 (03. Aug 2020)

This is a first sketch of a simple DQN baseline. It has:
* replay buffer
* target network
* simple reward function
* self play 
* A couple convolutional layers


### References

There are plenty of good resources out there. First and foremost, check out a couple of the other notebooks in this competition, since there are other RL starter notebooks.

* [Simple Reinforcement Learning with Tensorflow - Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0)
* [Official Pytorch: Reinforcement Learning (DQN) Tutorial - Adam Paszke](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)
* [Couple of good ipynb notebooks for various rainbow DQN features higgsfield/RL-Adventure](https://github.com/higgsfield/RL-Adventure)
* [Pytorch Examples on Github](https://github.com/pytorch/examples/tree/master/reinforcement_learning)


In [None]:
from kaggle_environments import make
from kaggle_environments.envs.halite.helpers import *

from random import seed
import time

import numpy as np

# Print arrays with three decimals and without scientific notation
np.set_printoptions(precision=3, suppress=True)

In [None]:
# Toy-sized game settings: number of players, board edge length and episode length
NUM_AGENTS = 2
BOARD_SIZE = 7
TURNS = 6

# Moves a ship can be given; the position in this list is the action index
ACTIONS = [
    ShipAction.NORTH,
    ShipAction.EAST,
    ShipAction.SOUTH,
    ShipAction.WEST,
    # ShipAction.CONVERT is left out: for our toy example we don't want to build new shipyards
    None,  # None for collecting
]

env = make(
    'halite',
    configuration={"randomSeed": 1, "episodeSteps": TURNS, "size": BOARD_SIZE},
    debug=True,
)
_ = env.reset(num_agents=NUM_AGENTS)

In [None]:
# Render the current state of the environment using its 'ipython' mode
env.render(mode='ipython')

In [None]:
# This will create an output of
#  features x size x size (channels first)
#  features are 
#   "how much halite on the board", 
#   "where are ships",
#   "how full are ships",
#   "where are bases",

# could add features around how many turns are left
# how much halite players have on the bank etc

def world_feature(board):
    """Encode a board as a (4, size, size) array from the current player's view.

    Channels, in order:
      0. halite on each cell, divided by 1000
      1. ships: 1 for the current player's ships, -1 for all other ships
      2. ship cargo, divided by 1000
      3. shipyards: 1 for the current player's yards, -1 for all other yards

    Every channel is indexed as [channel, position.y, position.x].

    Args:
        board: object exposing configuration.size, current_player.id,
            observation['halite'], ships and shipyards (a Halite Board).

    Returns:
        np.ndarray of shape (4, size, size).
    """
    size = board.configuration.size
    me = board.current_player

    ships = np.zeros((1, size, size))
    ship_cargo = np.zeros((1, size, size))
    bases = np.zeros((1, size, size))

    # observation['halite'] is a flat list that starts at the top-left cell,
    # while Point.y counts upwards from the bottom row. Reverse the rows so the
    # halite channel uses the same [y, x] indexing as the ship/yard channels;
    # otherwise the channels are vertically mirrored relative to each other.
    halite_rows = np.array(board.observation['halite']).reshape(size, size)
    map_halite = halite_rows[::-1][np.newaxis] / 1000

    for ship in board.ships.values():
        x, y = ship.position[0], ship.position[1]
        ships[0, y, x] = 1 if ship.player_id == me.id else -1
        ship_cargo[0, y, x] = ship.halite / 1000

    for yard in board.shipyards.values():
        x, y = yard.position[0], yard.position[1]
        bases[0, y, x] = 1 if yard.player_id == me.id else -1

    return np.concatenate([
        map_halite,
        ships,
        ship_cargo,
        bases,
    ], axis=0)


#As an example, take the first frame of the game
sample_obs = env.state[0].observation
board = Board(sample_obs, env.configuration)

#Encode that frame; the bare expression below displays the array as the cell output
feature = world_feature(board)
feature

In [None]:
#Now we define an agent consisting of a neural network
import random
import torch
import torch.nn as nn

torch.set_printoptions(profile="short")


#Simple network with three conv layers and a linear output
#We will interpret the output as a (n_actions, size, size) array and 
#do the .argmax over the actions to find the move we would take for every location on the board, if there was a ship

#You want to change this to your liking e.g.
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#q-network
#or search for various pytorch DQN implementations
class SmallModel(nn.Module):
    """Three 3x3 conv layers followed by one linear layer producing per-cell Q-values.

    The first convolution pads the board circularly by 3 cells on every side;
    the two following unpadded convolutions shrink the map by 2 cells each, so
    the spatial size ends up equal to the input size again.

    Args:
        input_channels: number of feature channels of the input.
        num_actions: number of Q-values predicted for every board cell.
        board_size: edge length of the (square) board. Defaults to the
            notebook-level BOARD_SIZE so existing two-argument calls keep
            working. Circular padding of 3 needs a board of at least 3x3.
    """

    def __init__(self, input_channels, num_actions, board_size=None):
        super().__init__()
        self.input_channels = input_channels
        self.num_actions = num_actions
        self.board_size = BOARD_SIZE if board_size is None else board_size

        hidden_channels = 16

        self.network = nn.Sequential(
            nn.Conv2d(
                in_channels=input_channels,
                out_channels=hidden_channels,
                kernel_size=(3, 3),
                stride=1,
                padding=3, #use this padding to give each pixel more vision
                padding_mode='circular'
            ),
            nn.ReLU(),
            nn.BatchNorm2d(hidden_channels),
            nn.Conv2d(
                in_channels=hidden_channels,
                out_channels=hidden_channels,
                kernel_size=(3, 3),
                stride=1,
                padding=0, #this will make the padded first layer smaller again
                padding_mode='circular'
            ),
            nn.ReLU(),
            nn.BatchNorm2d(hidden_channels),
            nn.Conv2d(
                in_channels=hidden_channels,
                out_channels=hidden_channels,
                kernel_size=(3, 3),
                stride=1,
                padding=0, #this will make the padded first layer smaller again
                padding_mode='circular'
            ),
            nn.ReLU(),
            nn.BatchNorm2d(hidden_channels),
        )

        cells = self.board_size * self.board_size
        # Size the head from num_actions (not a global) so it always matches the reshape in forward()
        self.output = nn.Linear(cells * hidden_channels, cells * num_actions)

    def forward(self, features):
        """Map (batch, input_channels, size, size) features to (batch, num_actions, size, size) Q-values."""
        batch_size = features.shape[0]
        x = self.network(features)
        x = x.reshape(batch_size, -1) #flatten
        x = self.output(x) #pass through linear layer

        return x.reshape(batch_size, self.num_actions, self.board_size, self.board_size)
                          

#Online network: the one that picks moves and gets trained
model = SmallModel(
    input_channels=4, #needs to be equal to the number of feature channels we have
    num_actions=len(ACTIONS)
)

#Second network with the same architecture, used as the DQN target network
target_model = SmallModel(
    input_channels=4, #needs to be equal to the number of feature channels we have
    num_actions=len(ACTIONS)
)

#predicting the feature from the cell above
feature_tensor = torch.from_numpy(feature).float().unsqueeze(0)
prediction = model(feature_tensor) 
print(prediction.shape) 
# Shape: (1, 5, 7, 7) = (batch_size, n_actions, y, x) for BOARD_SIZE = 7
# move would then be model(feature_tensor).argmax(dim=1)
# to select the Q-Values for a specific location you can do prediction[0, :, y, x]  where y,x will be the coordinates of our ships
prediction

In [None]:
def make_move(model, obs, configuration, EPSILON):
    """Choose epsilon-greedy actions for every ship of the current player.

    Args:
        model: network mapping a (1, channels, size, size) tensor to
            (1, len(ACTIONS), size, size) Q-values.
        obs: observation passed to Board.
        configuration: environment configuration; only .size is read here.
        EPSILON: probability with which a ship takes a random action.

    Returns:
        A tuple (next_actions, state, actions):
          * next_actions: the current player's next_actions as built by Board
          * state: float32 feature array fed to the model
          * actions: (size, size) array holding the chosen action index at each
            ship's [y, x] cell and -1 everywhere else (also -1 for a ship that
            is converted into a shipyard)
    """
    size = configuration.size
    board = Board(obs, configuration)
    me = board.current_player

    #if we do not have ships but a shipyard build 1 ship
    if len(me.ships) == 0 and len(me.shipyards) > 0:
        me.shipyards[0].next_action = ShipyardAction.SPAWN

    state = world_feature(board).astype(np.float32)
    state_tensor = torch.from_numpy(state).unsqueeze(0)

    # Only the argmax is needed here, so skip building the autograd graph
    with torch.no_grad():
        q_values = model(state_tensor)
    # Index the batch dimension explicitly; squeeze() would also drop the board axes on a 1x1 board
    action_indices = q_values.numpy()[0].argmax(0)
    # Draw from the real action count instead of a hard-coded 5 so ACTIONS can change
    random_indices = np.random.choice(len(ACTIONS), (size, size))
    actions = np.full((size, size), -1.0)

    for ship in me.ships:
        x, y = ship.position[0], ship.position[1]
        if len(me.shipyards) == 0:
            #if we have no shipyard build one; in our toy example we handle this manually
            action_index = -1
            ship.next_action = ShipAction.CONVERT
        else:
            if random.random() < EPSILON:
                action_index = random_indices[y, x]
            else:
                action_index = action_indices[y, x]

            ship.next_action = ACTIONS[action_index]

        actions[y, x] = action_index

    return me.next_actions, state, actions

make_move(model, sample_obs, env.configuration, 0.0)   

make_move(model, sample_obs, env.configuration, 0.0)   

In [None]:
import torch.optim as optim
#Adam optimizer over the parameters of the online network
optimizer = optim.Adam(model.parameters(), lr=1e-4)
#Bare expression so the notebook shows the model as the cell output
model

In [None]:
#Taken and slightly modified from here: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#replay-memory

class ReplayMemory(object):
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Once `capacity` transitions are stored, each new one overwrites the oldest.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` transitions.

        Raises:
            ValueError: if capacity is not positive.
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.memory = []
        self.position = 0  # index that the next push writes to

    def push(self, *args):
        """Saves a transition.

        Accepts either one pre-built transition, e.g. push((s, a, r, s2, done)),
        or the fields as separate arguments, e.g. push(s, a, r, s2, done), which
        are stored together as one tuple.

        Raises:
            TypeError: if called without arguments.
        """
        # Validate before touching the buffer so a bad call cannot leave a None placeholder behind
        if not args:
            raise TypeError("push() requires a transition")
        transition = args[0] if len(args) == 1 else args

        if len(self.memory) < self.capacity:
            self.memory.append(None)

        self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
from collections import defaultdict
import torch.nn.functional as F
import copy

EPSILON = 1.0  #exploration rate, we start with fully random moves
EPSILON_DECAY = 0.998  #multiplied into EPSILON after every episode that ran a training step
TRAINING_ITERATIONS = 2300
EPOCHS = 1  #gradient steps per episode
REPLACE_TARGET_INTERVAL = 10  #refresh the target network every this many episodes
LEARNING_RATE = 0.1  #NOTE(review): not used in this cell; the optimizer is created with its own lr
REPLAY_CAPACITY = 100000
WARM_START_SAMPLES = 32*20  #transitions to collect before training starts
BATCH_SIZE = 32
GAMMA = 0.99
PRINT_INTERVAL = 100

running_avg_reward = []
episode_rewards = []

memory = ReplayMemory(REPLAY_CAPACITY)

#Start the target network from the online weights instead of an unrelated random initialization
target_model.load_state_dict(model.state_dict())

#A bunch of stuff is happening here
#We initialize the environment, set random seeds and define some variables we need for storing all the necessary information
#We run one full game with NUM_AGENTS copies of the same agent (initially making random moves, EPSILON=1)
#We store all the data in the replay buffer
#After the last round we need to properly set the done flag, calculate the last rewards and the next states for everything
#We then store this in the replay buffer
#If we have enough samples, we run EPOCHS times the training loop, random sampling data and training the network with it (r+GAMMA*target_q)

for episode in range(TRAINING_ITERATIONS+1): #+1 so its inclusive and we print a statement at the end
    print(f'{episode} - {round(EPSILON,3)} - {len(memory)}', end='\r')
    _ = env.reset(num_agents=NUM_AGENTS)

    #When we call env.reset it will set the random seeds of both python and numpy to our fixed value
    #We want to do some real random exploration though, otherwise we will always end up with the same game
    seed_time = int(time.time()*1000)%1000000000
    np.random.seed(seed_time)
    seed(seed_time)

    player2states = defaultdict(list)
    player2actions = defaultdict(list)

    player2halite = defaultdict(list)
    player2rewards = defaultdict(list)

    player2dones = defaultdict(list)

    #The gist of this loop is copied from Tom Van de Wiele's answer here: https://www.kaggle.com/c/halite/discussion/144844
    while not env.done:
        observation = env.state[0].observation
        player_mapped_actions = []
        for active_id in range(NUM_AGENTS):
            agent_status = env.state[active_id].status
            if agent_status == 'ACTIVE':
                player_obs = env.state[0].observation.players[active_id]
                observation['player'] = active_id
                engine_commands, state, actions = make_move(model, observation, env.configuration, EPSILON)

                player2states[active_id].append(state)
                player2actions[active_id].append(actions)

                #in the first round there was no previous reward, we will use 5000 so the diff is 0 and drop it later in post processing
                #one big thing is that we only generate one reward per frame, probably we should generate a (size, size) reward array to 
                #properly attribute the rewards to the ships if we have a multi ship scenario later
                prev_halite = 5000 if len(player2halite[active_id]) == 0 else player2halite[active_id][-1]
                #only increases of banked halite count as reward, losses are clipped to 0
                reward = max(player_obs[0] - prev_halite, 0)
                player2rewards[active_id].append(reward)
                player2halite[active_id].append(player_obs[0])

                player2dones[active_id].append(env.done)

                player_mapped_actions.append(engine_commands)
            else:
                player_mapped_actions.append({})
        env.step(player_mapped_actions)


    #Postprocessing:
    #We need to build (state(t), actions(t), reward(t+1), state(t+1), dones(t+1)) tuples
    #After the env finished we want to set the last done to true
    #We want to add the last reward and remove the reward t=0 (since we always need reward(t+1))
    for active_id in range(NUM_AGENTS):
        states = player2states[active_id]
        if not states:
            #this player never acted in this episode, so there is nothing to store
            continue

        player_obs = env.state[0].observation.players[active_id]
        player2dones[active_id][-1] = True #the main loop does not get called again when the env is done so we set it manually

        reward = max(player_obs[0] - player2halite[active_id][-1], 0)
        player2rewards[active_id].append(reward) #append reward t+1
        player2rewards[active_id].pop(0) #remove reward t=0

        #For debugging: Make sure we have the same number of samples everywhere
        #print(len(player2states[active_id]), len(player2actions[active_id]),len(player2rewards[active_id]),len(player2dones[active_id]),)
        #Look at your rewards and compare with the replay below whether the reward matches the games that you see
        #print(player2rewards[active_id])

        #state(t+1) for every t; the final state is repeated as its own successor (it is flagged done)
        next_states = states[1:] + states[-1:]

        for state, action, reward, next_state, done in zip(states, player2actions[active_id], player2rewards[active_id], next_states, player2dones[active_id]):
            memory.push((state, action, reward, next_state, done))


    episode_rewards.append(np.array([x for y in player2rewards.values() for x in y]))

    #mean reward per stored step of this episode (a scalar; dividing by .shape would give a 1-element array)
    running_avg_reward.append(episode_rewards[-1].mean())
    if episode % PRINT_INTERVAL == 0:
        interval_rewards = np.concatenate(episode_rewards)
        print(f'ep:{episode}, '
               f'mem_size:{len(memory)}, '
               f'rew:{interval_rewards.sum()}, '
               f'avg:{round(interval_rewards.sum()/interval_rewards.shape[0], 3)}, '
               f'eps:{round(EPSILON, 2)}, '
               f'running_avg_rew:{round(np.mean(running_avg_reward), 3)}'
              )
        episode_rewards = []

    #keep collecting until the replay buffer holds more than WARM_START_SAMPLES transitions
    if len(memory) <= WARM_START_SAMPLES:
        continue

    for epoch in range(EPOCHS):
        sample = memory.sample(BATCH_SIZE)

        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for detailed explanation). 
        # This converts [(state1, action1, reward1, next_state1, done1), (state2, action2, reward2, next_state2, done2)]
        # To:[(state1, state2), (action1, action2), (reward1, reward2), (next_state1, next_state2), (done1, done2)]
        states, actions, rewards, next_states, dones = zip(*sample)

        #Its not a bad idea to check shapes again: states.shape, next_states.shape, actions.shape, rewards.shape, dones.shape
        states = torch.from_numpy(np.array(states))
        next_states = torch.from_numpy(np.array(next_states))
        actions = torch.from_numpy(np.array(actions)).long()
        #float32 so the TD target has the same dtype as the network output
        rewards = torch.from_numpy(np.array(rewards, dtype=np.float32))
        dones = torch.from_numpy(np.array(dones, dtype=bool))

        #our actions are a (size, size) array, we want to select all fields that are not -1 since this is where a ship was that took an action
        #if we had multiple ships then batch_idx would contain the same frame multiple times with different row/col coordinates
        batch_idx, rows, cols = torch.nonzero(actions > -1, as_tuple=True)
        if batch_idx.numel() == 0:
            #no ship took a learnable action in this batch; a mean over nothing would be nan
            continue

        taken_actions = actions[batch_idx, rows, cols].unsqueeze(-1)

        #Q(s, a) of the action that was actually taken, shape (n_selected,)
        current_qs = model(states)[batch_idx, :, rows, cols].gather(1, taken_actions).squeeze(1)
        #NOTE(review): the next-state value is read at the cell the ship occupied in state(t) - TODO confirm whether the ship's new position should be used
        with torch.no_grad():
            next_qs = target_model(next_states).max(1)[0][batch_idx, rows, cols]

        # target_q = reward + 0.99 * next_state_max_q * (1 - done)
        not_done = (~dones[batch_idx]).float()
        target_qs = rewards[batch_idx] + GAMMA * next_qs * not_done
        loss = F.smooth_l1_loss(current_qs, target_qs)
        #if we turn this on we will see that the loss is actually not decreasing very much from epoch to epoch
        #print(target_qs.shape, current_qs.shape, loss, end='\n')
        optimizer.zero_grad()
        loss.backward()
        #clip every gradient element to [-1, 1]
        for param in model.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        optimizer.step()

    #refresh the target network every REPLACE_TARGET_INTERVAL episodes
    #(without the == 0 the condition is true on every episode that is NOT a multiple of the interval)
    if episode and episode % REPLACE_TARGET_INTERVAL == 0:
        target_model.load_state_dict(model.state_dict())

    EPSILON *= EPSILON_DECAY

In [None]:
#running_avg_rew=0.221 is random play, so we learned something!
#Let's see how our bot fares on some sample games:

#Fresh environment with the same settings as during training
env = make('halite', configuration={"randomSeed": 1, "episodeSteps": TURNS, "size": BOARD_SIZE}, debug=True)
_ = env.reset(num_agents=NUM_AGENTS)

#Play one game in which every agent follows the trained model greedily (EPSILON = 0.0)
while not env.done:
    observation = env.state[0].observation
    player_mapped_actions = []
    for active_id in range(NUM_AGENTS):
        if env.state[active_id].status != 'ACTIVE':
            #agents that are not active submit no commands
            player_mapped_actions.append({})
            continue

        observation['player'] = active_id
        #only the engine commands are needed here, the returned state and action map are dropped
        engine_commands, _, _ = make_move(model, observation, env.configuration, 0.0)
        player_mapped_actions.append(engine_commands)
    env.step(player_mapped_actions)


env.render(mode='ipython', width=800, height=600)

We can see that the yellow agent found the best move while the red agent did not.
This is a little surprising given that we played ~2300 games, but we can see that the agent is learning something.
From here on out we can maybe add two more turns to see if the agents can find the big halite chunk.
After this I would add DQN improvements such as Rainbow DQN, still on the small problem, and see if they work fine.
Then increase the size of the board a little, add multi-ship support, make sure this works, and then scale out to the full game.