# Install kaggle-environments

In [None]:
# 1. Enable Internet in the Kernel (Settings side pane)

# 2. The curl cache may need to be purged if v0.1.6 cannot be found (uncomment if needed).
# !curl -X PURGE https://pypi.org/simple/kaggle-environments

# ConnectX environment was defined in v0.1.6
!pip install 'kaggle-environments>=0.1.6'

In [None]:
from kaggle_environments import evaluate, make, utils

import random
from collections import namedtuple
from itertools import count

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Create the ConnectX environment in debug mode and show its initial board.
env = make("connectx", debug=True)
env.render()

In [None]:
# One step of experience: the state before the move, the action taken, the
# state after the move, and the reward that was stored with it.
Transition = namedtuple(
    'Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Fixed-capacity ring buffer of ``Transition`` records.

    The buffer grows until it holds ``capacity`` entries; after that each new
    entry overwrites the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` entries."""
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store one transition; ``args`` are the ``Transition`` fields in order."""
        still_growing = len(self.memory) < self.capacity
        if still_growing:
            # Reserve the slot first so the indexed write below is valid.
            self.memory.append(None)
        slot = self.position
        self.memory[slot] = Transition(*args)
        # Advance the write cursor, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network that scores every action for a board position.

    The board is treated as a single-channel ``rows x columns`` image. It is
    zero-padded on the right by ``inarow - 1`` columns, passed through three
    convolution + batch-norm + ReLU stages, flattened, and mapped by a linear
    head to ``outputs`` values (one per action).

    Args:
        rows: Number of board rows.
        columns: Number of board columns.
        inarow: Number of pieces in a line needed to win; also used as the
            kernel size of the first convolution.
        outputs: Number of values produced per board (the number of actions).
    """

    def __init__(self, rows, columns, inarow, outputs):
        super().__init__()

        self.rows = rows
        self.columns = columns
        self.inarow = inarow

        # Set kernel size to minimum match length
        self.conv1 = nn.Conv2d(1, 16, kernel_size=inarow, stride=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=1, stride=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input array size, so compute it.
        def conv2d_size_out(size, kernel_size=inarow, stride=1):
            return (size - (kernel_size - 1) - 1) // stride + 1

        # Only conv1 shrinks the feature map; conv2/conv3 use 1x1 kernels.
        convh = conv2d_size_out(rows)
        # The width seen by conv1 includes the padding added in forward().
        convw = conv2d_size_out(columns + inarow - 1)
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    def forward(self, x):
        """Return the action values for one board or a batch of boards.

        Args:
            x: Tensor holding ``rows * columns`` values per board; any leading
                layout is accepted because the input is reshaped to
                ``(-1, 1, rows, columns)``.

        Returns:
            Tensor of shape ``(batch, outputs)``.
        """
        x = torch.reshape(x, (-1, 1, self.rows, self.columns))
        # Zero-pad the right edge of the width dimension. F.pad allocates the
        # padding on the same device and with the same dtype as ``x``, so the
        # network works unchanged after ``.to(device)``.
        x = F.pad(x, (0, self.inarow - 1))
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

In [None]:
# --- Training hyper-parameters ---
BATCH_SIZE = 128      # transitions drawn from replay memory per optimisation step
GAMMA = 0.999         # discount applied to the next state's value
EPS_START = 0.9       # exploration threshold before any action has been selected
EPS_END = 0.05        # value the exploration threshold decays towards
EPS_DECAY = 200       # decay constant, measured in select_action calls
TARGET_UPDATE = 10    # episodes between target-network refreshes

# --- Board geometry, read from the environment ---
rows = env.configuration['rows']
columns = env.configuration['columns']
inarow = env.configuration['inarow']

# Number of actions is equal to number of columns
n_actions = columns

# --- Networks ---
# All three share one architecture; they are built in this order on purpose,
# since each construction draws from the random number generator.
policy_net = DQN(rows, columns, inarow, n_actions).to(device)
adversary_net = DQN(rows, columns, inarow, n_actions).to(device)
target_net = DQN(rows, columns, inarow, n_actions).to(device)

# The target and adversary networks start as copies of the policy network and
# are kept in evaluation mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
adversary_net.load_state_dict(policy_net.state_dict())
adversary_net.eval()

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

# Counter advanced by select_action; drives the exploration decay.
steps_done = 0

episode_durations = []

In [None]:
def select_action(observation, configuration):
    """Pick the next column with an epsilon-greedy policy.

    The exploration threshold decays exponentially from ``EPS_START`` towards
    ``EPS_END`` as the global ``steps_done`` counter grows. Above the
    threshold the column with the highest ``policy_net`` value is chosen;
    otherwise a column is drawn at random. In both cases only columns whose
    first-row cell is empty are considered (the same ``board[c] == 0`` test
    the other agents in this file use), so the returned move is never aimed
    at a full column.

    Args:
        observation: Environment observation; its ``board`` attribute is read.
        configuration: Environment configuration (unused; kept for the
            agent-style signature).

    Returns:
        A ``(1, 1)`` ``torch.long`` tensor on ``device`` holding the column.
    """
    global steps_done
    # Build the state on the same device as the network.
    state = torch.tensor(observation.board, dtype=torch.float, device=device)
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        np.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    legal_columns = [c for c in range(n_actions) if observation.board[c] == 0]
    if not legal_columns:
        # Nothing is playable; fall back to the unrestricted choice rather
        # than failing on an empty list.
        legal_columns = list(range(n_actions))

    if sample > eps_threshold:
        with torch.no_grad():
            q_values = policy_net(state)[0]
        column = max(legal_columns, key=lambda c: q_values[c].item())
    else:
        column = random.choice(legal_columns)

    # Both branches return the same dtype, so stored actions can be
    # concatenated and used as gather indices without per-item casts.
    return torch.tensor([[column]], device=device, dtype=torch.long)

In [None]:
def adversary_agent(observation, configuration):
    """Agent callable that plays greedily with ``adversary_net``.

    Args:
        observation: Environment observation; its ``board`` attribute is read.
        configuration: Environment configuration; ``columns`` is read when a
            fallback move is needed.

    Returns:
        The chosen column as a plain ``int``. If the network's preferred
        column is not empty in the first row, a random column that is empty
        there is returned instead.
    """
    # Build the state on the same device as the network.
    state = torch.tensor(observation.board, dtype=torch.float, device=device)

    with torch.no_grad():
        # Convert to a Python int straight away so it can index the board list.
        column = int(adversary_net(state).max(1)[1].item())

    if observation.board[column] != 0:
        open_columns = [c for c in range(configuration.columns)
                        if observation.board[c] == 0]
        if open_columns:
            return random.choice(open_columns)

    return column

In [None]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a sampled replay batch.

    Does nothing until the replay memory holds at least ``BATCH_SIZE``
    transitions. Otherwise it samples a batch, computes the Huber loss between
    the policy network's value for each taken action and
    ``reward + GAMMA * max(target_net(next_state))`` (the next-state term is
    zero for transitions whose ``next_state`` is ``None``), clamps each
    gradient element to ``[-1, 1]`` and applies one optimizer step.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transitions into one Transition of tuples.
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    # Move every batch tensor to the networks' device; stored transitions may
    # have been created on the CPU.
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(
        [a.to(device=device, dtype=torch.long) for a in batch.action])
    reward_batch = torch.tensor(batch.reward, device=device, dtype=torch.float)

    # Value predicted by the policy network for the action actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat rejects an empty list, so skip the target pass when every
    # sampled transition is terminal.
    if non_final_next_states:
        with torch.no_grad():
            next_batch = torch.cat(non_final_next_states).to(device)
            next_state_values[non_final_mask] = target_net(next_batch).max(1)[0]

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1] before the update.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [None]:
# Play as first position against negamax agent.
trainer = env.train([None, "negamax"])

num_episodes = 1000
for i_episode in range(num_episodes):
    observation = trainer.reset()
    for t in count():
        # Board before the move; this is the "state" stored for the transition.
        state = torch.tensor(observation.board, dtype=torch.float, device=device)
        action = select_action(observation, env.configuration)
        chosen_column = int(action[0][0].item())

        observation, reward, done, info = trainer.step(chosen_column)

        # A finished episode has no successor state.
        if done:
            next_state = None
        else:
            next_state = torch.tensor(observation.board, dtype=torch.float,
                                      device=device)

        # Reward shaping.
        # NOTE(review): the mapping below assumes the environment reports
        # 1 for a win and 0 for a loss, with any other number meaning a draw;
        # confirm against the installed kaggle-environments version.
        if reward is None:
            # Mid-episode, None just means "no reward yet". At the end of an
            # episode it presumably means the move was rejected (verify
            # against the environment); it must not be scored as a draw, so
            # it gets the same penalty as a loss.
            reward = -20 if done else 0
        elif done:
            if reward == 1:  # Won
                reward = 20
            elif reward == 0:  # Lost
                reward = -20
            else:  # Draw
                reward = 10

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            break

#     # If the policy lost, switch to adversary
#     if reward == 0:
#         policy_net.load_state_dict(adversary_net.state_dict())
#         policy_net.eval()

#     # Otherwise, update adversary
#     else:
#         adversary_net.load_state_dict(policy_net.state_dict())
#         adversary_net.eval()

#     trainer = env.train([None, adversary_agent])

    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Done')

In [None]:
# Build the source code of a self-contained submission agent.
#
# The trained weights are embedded as text: the state dict's printed form is
# interpolated into the f-string below, and the generated module imports
# ``tensor`` and ``OrderedDict`` so that text evaluates back into a state dict.
from collections import OrderedDict

#uncap state_dict size
torch.set_printoptions(profile="full")

# Print the weights from CPU copies. A tensor that lives on another device
# prints with a ``device=...`` argument, which would make the generated file
# depend on that device being present wherever the submission runs.
cpu_state_dict = OrderedDict(
    (name, weights.cpu()) for name, weights in policy_net.state_dict().items()
)

agent = f"""
import torch
import numpy as np
import random
from torch import nn, tensor
import torch.nn.functional as F
from collections import OrderedDict
    
def my_agent(observation, configuration):
    class DQN(nn.Module):
        def __init__(self, rows, columns, inarow, outputs):
            super(DQN, self).__init__()

            self.rows = rows
            self.columns = columns
            self.inarow = inarow

            self.conv1 = nn.Conv2d(1, 16, kernel_size=inarow, stride = 1)
            self.bn1 = nn.BatchNorm2d(16)
            self.conv2 = nn.Conv2d(16, 32, kernel_size=1, stride = 1)
            self.bn2 = nn.BatchNorm2d(32)
            self.conv3 = nn.Conv2d(32, 32, kernel_size=1, stride = 1)
            self.bn3 = nn.BatchNorm2d(32)

            def conv2d_size_out(size, kernel_size=inarow, stride = 1):
                return (size - (kernel_size - 1) - 1) // stride  + 1
            convh = conv2d_size_out(rows)
            convw = conv2d_size_out(columns + inarow - 1)
            linear_input_size = convw * convh * 32
            self.head = nn.Linear(linear_input_size, outputs)

        def forward(self, x):
            x = torch.reshape(x, (-1, 1, self.rows, self.columns))
            x = torch.cat((x, torch.zeros(x.shape[0], 1, self.rows, self.inarow-1)), dim=3)
            x = F.relu(self.bn1(self.conv1(x)))
            x = F.relu(self.bn2(self.conv2(x)))
            x = F.relu(self.bn3(self.conv3(x)))
            return self.head(x.view(x.size(0), -1))
            
    columns = configuration['columns']
    rows = configuration['rows']

    n_actions = columns
    inarow = configuration['inarow']
    
    policy_net = DQN(rows, columns, inarow, n_actions)
    
    policy_net.load_state_dict({str(cpu_state_dict)})
    policy_net.eval()
        
    state = torch.tensor(observation.board, dtype=torch.float)
    
    with torch.no_grad():
        action = policy_net(state).max(1)[1].view(1, 1)
        
    if observation.board[action] != 0:
        return random.choice([c for c in range(configuration.columns) if observation.board[c] == 0])
        
    return int(action[0][0].item())
"""

In [None]:
# Save the generated agent source so it can be imported and submitted.
with open('submission.py', 'w') as submission_file:
    submission_file.write(agent)

In [None]:
from submission import my_agent

In [None]:
# "None" represents which agent you'll manually play as (first or second player).
# Here the manual player takes the first seat and the trained agent moves second.
env.play([None, my_agent], width=500, height=450)

In [None]:
def mean_reward(rewards):
    """Return the first agent's average reward per episode.

    Args:
        rewards: Sequence of per-episode reward pairs, where ``r[0]`` is the
            first agent's reward. A ``None`` first reward contributes nothing
            to the total but the episode is still counted.

    Returns:
        ``sum(r[0]) / number of episodes`` as a float, or ``0.0`` when
        ``rewards`` is empty.

    Dividing by the episode count rather than by the summed rewards of both
    agents gives the same answer whenever each pair sums to 1, and stays
    meaningful when the two rewards cancel out (for example +1 / -1), where
    the summed-reward denominator would collapse to its 0.001 floor.
    """
    if not rewards:
        return 0.0
    total = sum(r[0] for r in rewards if r[0] is not None)
    return total / len(rewards)

# Run multiple episodes to estimate agent's performance.
# Each entry pairs the printed label with the agent line-up for that match-up;
# they are evaluated in the order listed.
matchups = (
    ("My Agent vs. Random Agent:", [my_agent, "random"]),
    ("My Agent vs. Negamax Agent:", [my_agent, "negamax"]),
    ("Random Agent vs. My Agent:", ["random", my_agent]),
    ("Negamax Agent vs. My Agent:", ["negamax", my_agent]),
)
for label, lineup in matchups:
    episode_rewards = evaluate("connectx", lineup, num_episodes=10)
    print(label, mean_reward(episode_rewards))

In [None]:
# Validate the written submission: load it back the way a file-based agent
# would be loaded, then let it play a full game against itself.

# Note: Stdout replacement is a temporary workaround.
import sys
out = sys.stdout
try:
    submission = utils.read_file("/kaggle/working/submission.py")
    agent = utils.get_last_callable(submission)
finally:
    # Put stdout back even when reading or loading the submission fails, so a
    # broken submission file cannot leave the notebook without output.
    sys.stdout = out

env = make("connectx", debug=True)
env.run([agent, agent])
# Both seats must finish with status "DONE" for the run to count as a success.
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")

# Credits

Adapted from [PyTorch: Reinforcement Learning (DQN) Tutorial](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#input-extraction)