# Install kaggle-environments

In [None]:
# 1. Enable Internet in the Kernel (Settings side pane)

# 2. Curl cache may need to be purged if v0.1.6 cannot be found (uncomment if needed). 
# !curl -X PURGE https://pypi.org/simple/kaggle-environments

# ConnectX environment was defined in v0.1.6
!pip install 'kaggle-environments>=0.1.6'

# Create ConnectX Environment

In [None]:
from kaggle_environments import evaluate, make, utils
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as scheduler
import numpy as np
import random
import matplotlib.pyplot as plt
from random import choice
from collections import namedtuple, deque

# Create the shared "connectx" environment (debug=False) that the training and
# evaluation cells below use, and render it right after creation.
env = make("connectx", debug=False)
env.render()

In [None]:
class config:
    """Hyperparameters and runtime settings read by the model, agent and replay buffer."""

    # Run on the GPU when torch reports one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Adam optimizer settings.
    lr = 0.007
    weight_decay = 0.00001

    # The target network is re-synced whenever the agent's step counter is a
    # multiple of this value.
    replace_cntr = 500

    # Epsilon-greedy exploration: start value, floor and multiplicative decay.
    # (An earlier start value, 0.99985 ** 20_000, was left commented out.)
    epsilon = 1
    eps_min = 0.01
    eps_decay = 0.99985

    # Discount factor applied to the bootstrapped next-state value.
    gamma = 0.7

    # Maximum number of transitions kept in the replay buffer.
    mem_len = 1000

    # Number of selectable actions (columns).
    num_actions = 7

    # Nominal number of training games.
    epochs = 20000

In [None]:
class Callback():
    """Training hooks.

    Only ``on_game_end`` and ``on_train_end`` do any work; every other hook is
    an empty placeholder. ``on_game_end`` operates on the module-level
    ``agent`` object rather than on an argument.
    """

    def __init__(self):
        # One entry per finished game, appended by on_game_end.
        self.reward = []
        # Initialised here but not updated by any hook in this class.
        self.total_reward = 0

    def on_train_begin(self):
        """Placeholder hook; does nothing."""

    def on_train_end(self):
        """Plot the rewards collected so far with matplotlib."""
        plt.plot(self.reward)

    def on_epoch_begin(self):
        """Placeholder hook; does nothing."""

    def on_epoch_end(self):
        """Placeholder hook; does nothing."""

    def on_game_begin(self):
        """Placeholder hook; does nothing."""

    def on_game_end(self, reward):
        """Advance per-game training state and record ``reward``.

        Steps the global agent's LR scheduler, stores the reward, bumps the
        agent's step counter and decays its epsilon, in that order.
        """
        agent.scheduler.step()

        self.reward.append(reward)

        agent.step_counter += 1
        agent.decrement_epsilon()

    def on_loss_begin(self):
        """Placeholder hook; does nothing."""

    def on_loss_end(self):
        """Placeholder hook; does nothing."""

    def on_step_begin(self):
        """Placeholder hook; does nothing."""

    def on_step_end(self):
        """Placeholder hook; does nothing."""

In [None]:
class RLModel(nn.Module):
    """Embedding-based board model producing 7 softmax-normalised scores.

    Note: later cells in this file define ``RLModel`` again, so this version
    is only in effect until those definitions run.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3, 1)
        self.fc2 = nn.Linear(42, 128)
        self.fc3 = nn.Linear(128, 7)
        self.type_emb = nn.Embedding(3, 3)
        self.relu = nn.ReLU()

        # Place all parameters on the configured device.
        self.to(config.device)

    def forward(self, board):
        """Score the seven actions for ``board``.

        Assumes ``board`` is a 1-D integer tensor of 42 cells with values in
        {0, 1, 2} (required by the 3-entry embedding and the 42-input layer)
        -- TODO confirm against callers.
        """
        # Embed each cell, then collapse every 3-vector to a single activation.
        embedded = self.type_emb(board)
        per_cell = self.relu(self.fc1(embedded))
        # Swap the two axes so the cells become the feature dimension for fc2.
        as_row = per_cell.permute(1, 0)
        hidden = self.relu(self.fc2(as_row))
        scores = self.fc3(hidden).squeeze(0)
        return torch.softmax(scores, dim=0)

In [None]:
class RLModel(nn.Module):
    """Convolutional variant of the board model.

    A later cell in this file defines ``RLModel`` again, so this version is
    only in effect until that definition runs.

    NOTE(review): this variant looks unfinished -- see the notes on ``conv2``
    and on the final softmax below before reviving it.
    """
    def __init__(self):
        super(RLModel,self).__init__()
        self.conv1 = nn.Conv2d(1,1,kernel_size=3)
        # NOTE(review): ``conv2`` is assigned three times in this method; only
        # the last assignment (7 -> 7 channels, 3x4 kernel) survives.
        self.conv2 = nn.Conv2d(1,1,kernel_size=3)
        self.conv2 = nn.Conv2d(1,1,kernel_size=3)
        # ``conv3`` is created but never referenced in ``forward``.
        self.conv3 = nn.Conv2d(1,1,kernel_size=3)
        self.conv2 = nn.Conv2d(7,7,kernel_size=(3,4))
        self.fc1 = nn.Linear(7,1)
        self.relu = nn.ReLU()
        
        # Place all parameters on the configured device.
        self.to(config.device)
        
    def forward(self, board):
        # View the board as a 6x7 grid with two leading singleton dims: (1, 1, 6, 7).
        inp = board.reshape(6,7).unsqueeze(0).unsqueeze(0).float()
        conv_out = self.relu(self.conv1(inp))
        # NOTE(review): conv1 emits a single channel while the surviving conv2
        # expects 7 input channels -- this presumably fails at runtime; confirm.
        conv_out = self.relu(self.conv2(conv_out))
        conv_out = conv_out.squeeze(-1).squeeze(-1)
        # Second copy of the raw board as a 6x7 float grid, concatenated with
        # the convolution output along dim 0 and then transposed.
        inp2 = board.reshape(6,7).float()
        cat_board =  torch.transpose(torch.cat((conv_out,inp2)),0,1)
        out = self.fc1(cat_board).squeeze(1)
        # NOTE(review): after ``squeeze(1)`` the tensor is presumably 1-D, in
        # which case ``dim=1`` is out of range -- confirm the intended axis.
        out = out.softmax(dim=1)
        return out

In [None]:
class RLModel(nn.Module):
    """Fully connected network: 42 inputs -> eight 50-unit ReLU layers -> 7 softmax scores."""

    def __init__(self):
        super().__init__()

        width = 50
        # Input layer, then seven identical hidden blocks, then the 7-way head.
        # Layers are created in the same order as a literal Sequential(...)
        # listing, so parameter names and initialisation order are unchanged.
        layers = [nn.Linear(42, width), nn.ReLU()]
        for _ in range(7):
            layers.append(nn.Linear(width, width))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(width, 7))
        self.model = nn.Sequential(*layers)

        # Place all parameters on the configured device.
        self.to(config.device)

    def forward(self, board):
        """Return the softmax (over dim 0) of the network output for ``board``.

        Assumes ``board`` is a 1-D float tensor of 42 values, so dim 0 is the
        action axis -- TODO confirm if batched input is ever passed.
        """
        scores = self.model(board)
        return torch.softmax(scores, dim=0)

In [None]:
# One stored experience: state, action taken, reward received, next state, done flag.
Transitions = namedtuple(
    'transitions',
    ['states', 'actions', 'rewards', 'states_', 'dones'],
)


class Replay():
    """Bounded replay buffer of ``Transitions``; the oldest entries are dropped first."""

    def __init__(self):
        # deque(maxlen=...) discards from the left once the capacity is reached.
        self.memory = deque(maxlen=config.mem_len)

    def store_transition(self, states, actions, rewards, states_, dones):
        """Append one transition to the buffer."""
        entry = Transitions(
            states=states,
            actions=actions,
            rewards=rewards,
            states_=states_,
            dones=dones,
        )
        self.memory.append(entry)

    def sample_memory(self):
        """Return one uniformly sampled transition as a plain 5-tuple.

        The tuple order is (states, actions, rewards, states_, dones).
        """
        (picked,) = random.sample(self.memory, 1)
        return (
            picked.states,
            picked.actions,
            picked.rewards,
            picked.states_,
            picked.dones,
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Create an Agent

To create the submission, an agent function should be fully encapsulated (no external dependencies).  

When your agent is being evaluated against others, it will not have access to the Kaggle docker image. Only the following can be imported: Python standard library modules, gym, numpy, scipy, and pytorch (1.3.1, CPU only). More may be added later.



In [None]:
class RLAgent:
    """Epsilon-greedy Q-learning agent with an online and a target network.

    ``qeval`` is the network being optimised; ``qtarg`` is a periodically
    synchronised copy used to compute bootstrap targets. Transitions are kept
    in a ``Replay`` buffer and ``learn`` trains on one sampled transition per
    call.
    """

    def __init__(self):
        self.qeval = RLModel()
        self.qtarg = RLModel()
        self.memory = Replay()
        self.epsilon = config.epsilon
        self.gamma = config.gamma
        # Incremented outside this class (by Callback.on_game_end, once per game).
        self.step_counter = 0
        self.optimizer = optim.Adam(
            self.qeval.parameters(),
            lr=config.lr,
            weight_decay=config.weight_decay,
        )
        self.scheduler = scheduler.CosineAnnealingWarmRestarts(self.optimizer, T_0=300)

    def save_model(self):
        """Write both networks' state dicts to the current directory."""
        torch.save({'state_dict': self.qeval.state_dict()}, './qeval.pth.tar')
        torch.save({'state_dict': self.qtarg.state_dict()}, './qtarg.pth.tar')

    def load_model(self):
        """Load both networks from the ``first-20k-games`` input dataset.

        ``map_location`` remaps the stored tensors onto the configured device,
        so a checkpoint saved on a GPU can still be loaded on a CPU-only host.
        """
        checkpoint_eval = torch.load(
            '../input/first-20k-games/qeval.pth.tar', map_location=config.device
        )
        checkpoint_targ = torch.load(
            '../input/first-20k-games/qtarg.pth.tar', map_location=config.device
        )
        self.qeval.load_state_dict(checkpoint_eval['state_dict'])
        self.qtarg.load_state_dict(checkpoint_targ['state_dict'])

    def store_memory(self, states, actions, rewards, states_, dones):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(states, actions, rewards, states_, dones)

    def get_from_memory(self):
        """Return one sampled transition as (state, action, reward, next_state, done)."""
        return self.memory.sample_memory()

    def replace_target_network(self):
        """Copy the online weights into the target network when due.

        The copy happens whenever ``step_counter`` is a multiple of
        ``config.replace_cntr`` (including 0). Because the counter only moves
        once per game, every ``learn`` call made while it sits on such a
        multiple re-synchronises the target.
        """
        if self.step_counter % config.replace_cntr == 0:
            self.qtarg.load_state_dict(self.qeval.state_dict())

    def decrement_epsilon(self):
        """Decay epsilon multiplicatively, never going below ``config.eps_min``."""
        # max() clamps the decayed value so it cannot undershoot the floor.
        self.epsilon = max(self.epsilon * config.eps_decay, config.eps_min)

    def choose_action(self, board):
        """Pick an action epsilon-greedily for ``board``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the online network's output.
        Returns a Python int.
        """
        if np.random.random_sample() < self.epsilon:
            action = int(np.random.choice(range(config.num_actions)))
        else:
            # Inference only: no autograd graph is needed to take an argmax.
            with torch.no_grad():
                action = torch.argmax(self.predict(board), dim=0).item()
        return action

    def predict(self, board):
        """Run the online network on ``board`` (moved to the configured device)."""
        return self.qeval.forward(board.to(config.device))

    def predict_(self, board):
        """Run the target network on ``board`` (moved to the configured device)."""
        return self.qtarg.forward(board.to(config.device))

    def learn(self):
        """Perform one optimisation step on a single sampled transition.

        Does nothing while the replay buffer is empty. The target is
        ``r + (1 - done) * gamma * max_a Q_target(s', a)`` and the loss is the
        squared error between it and ``Q_eval(s, a)``.
        """
        if len(self.memory) < 1:
            return

        self.replace_target_network()

        s, a, r, s_, d = self.get_from_memory()

        # Value the online network assigns to the action that was taken.
        q = self.predict(s).gather(0, torch.tensor(a).to(config.device))

        # The bootstrap target is a constant w.r.t. the optimised parameters
        # (the optimizer only holds qeval's), so skip building its graph.
        with torch.no_grad():
            q_next = self.predict_(s_)
            best_next = torch.argmax(q_next, dim=0).to(config.device)
            q_next = q_next.gather(0, best_next)

        # (1 - d) zeroes the bootstrap term on terminal transitions.
        q_star = r + (1 - d) * self.gamma * q_next

        criterion = nn.MSELoss()
        loss = criterion(q, q_star)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
def model_agent(obs, config):
    """Return the column chosen by the module-level ``agent`` for this observation.

    ``obs['board']`` is converted to a float tensor and handed to
    ``agent.choose_action``. The ``config`` argument is not used here.
    """
    return agent.choose_action(torch.tensor(obs['board'], dtype=torch.float))

# Debug/Train your Agent

In [None]:
def change_players(new_players, game_counter):
    """Update the opponent line-up for game number ``game_counter``.

    Every truthy entry of ``new_players`` (i.e. every built-in opponent; the
    trained agent's slot is ``None``) is replaced by "random" for the first 5
    of each 50 games (10%) and by "negamax" otherwise (90%). On every 100th
    game except game 0 the seat order is reversed. The list is modified in
    place and also returned.
    """
    opponent = "random" if game_counter % 50 < 5 else "negamax"

    # Slice assignment keeps the caller's list object while swapping opponents.
    new_players[:] = [opponent if seat else seat for seat in new_players]

    if game_counter != 0 and game_counter % 100 == 0:
        new_players.reverse()

    return new_players

def reward(observation, won, done):
    """Translate the environment's step result into a training reward.

    Args:
        observation: The observation returned by the step (unused).
        won: The environment's reward for the agent: 1 (win), -1 (loss),
            0 (draw) or None (the game ended on an invalid action).
        done: Whether the game has ended.

    Returns:
        1/42 for a non-terminal step; otherwise 1 for a win, -1 for a loss,
        0.5 for a draw and -10 for an invalid action.

    Raises:
        ValueError: If the game is done but ``won`` is none of the values above.

    Side effects:
        On terminal steps, increments the matching counter in the
        module-level ``record`` dict.
    """
    if not done:
        # Small per-move reward for every non-terminal step.
        return 1/42

    # Identity check for None first, then the numeric outcomes.
    if won is None:
        outcome, state_reward = 'invalid_actions', -10
    elif won == 1:
        outcome, state_reward = 'wins', 1
    elif won == -1:
        outcome, state_reward = 'losses', -1
    elif won == 0:
        outcome, state_reward = 'draws', .5
    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(f"unexpected terminal result from environment: {won!r}")

    record[outcome] += 1
    return state_reward


def train(games=100):
    """Play ``games`` training games with the module-level ``agent``.

    Uses the module-level ``env`` and ``callback``. Before each game the
    opponent line-up is refreshed via ``change_players`` and a new trainer is
    created only when the line-up actually changed. Every move is stored in
    the agent's replay memory and followed by one ``agent.learn()`` call.
    ``callback.on_game_end`` receives the reward of the last move of each
    game, and ``callback.on_train_end`` runs once at the end.
    """
    env_config = env.configuration
    players = [None, "random"]
    trainer = env.train(players)

    for game in range(games):
        lineup = change_players(list(players), game)
        if lineup != players:
            # Line-up changed: rebuild the trainer for the new opponents/seats.
            players = list(lineup)
            trainer = env.train(players)

        obs = trainer.reset()

        if game % 1000 == 0:
            print(f'Game {game}')

        while not env.done:
            # Snapshot the board before the move; obs is rebound by the step below.
            state = torch.tensor(obs['board'], dtype=torch.float)
            action = model_agent(obs, env_config)

            obs, won, done, _ = trainer.step(action)
            step_reward = reward(obs, won, done)
            next_state = torch.tensor(obs['board'], dtype=torch.float)

            agent.store_memory(state, action, step_reward, next_state, done)
            agent.learn()

        callback.on_game_end(step_reward)

    callback.on_train_end()

In [None]:
# Running tally of game outcomes; reward() increments these counters on
# terminal steps.
record = {'wins':0, 'losses':0,'draws':0,'invalid_actions':0}
# agent.epsilon = 0.
# Module-level objects that model_agent(), train() and Callback refer to by name.
agent = RLAgent()
callback = Callback()

# agent.load_model()
# Train for 1000 games (config.epochs is the commented-out alternative), then
# save both networks to disk.
train(games=1000)#config.epochs)
agent.save_model()

def mean_reward(all_rewards):
    """Return the mean of the first reward in each episode's reward pair.

    Args:
        all_rewards: A sequence of two-element reward pairs, one per episode.
            A first element of None is counted as -1.

    Returns:
        The average of the (adjusted) first elements as a float.

    Raises:
        ZeroDivisionError: If ``all_rewards`` is empty.
    """
    # Only the first entry of each pair contributes; None maps to -1.
    firsts = [-1 if own is None else own for own, _ in all_rewards]
    return sum(firsts) / float(len(firsts))

# Run multiple episodes to estimate its performance.
# With epsilon at 0 the random branch in choose_action can never trigger, so
# the agent always plays its greedy (argmax) move during evaluation.
agent.epsilon = 0.
# Outcome counters accumulated by reward() during training.
print(record)
# 100 episodes each against the built-in "random" and "negamax" opponents.
print(f"My Agent vs Random Agent: {mean_reward(evaluate('connectx', [model_agent, 'random'], num_episodes=100))}")
print(f"My Agent vs Negamax Agent: {mean_reward(evaluate('connectx', [model_agent, 'negamax'], num_episodes=100))}")

# Run the Agent

In [None]:
# Run model_agent against the built-in "negamax" opponent in the shared
# environment, then render the result in the notebook (ipython mode, 500x450).
env.run([model_agent, "negamax"])
env.render(mode="ipython", width=500, height=450)