# Setup

In [None]:
!pip install keras-rl
!pip install tensorflow==1.14

In [None]:
from kaggle_environments import evaluate, make, utils

# Create the ConnectX environment that the trainer wrapper below is built around.
env = make("connectx", debug=True)
# Optional rendering call, left disabled:
#env.render()

# Custom classes

In [None]:
import numpy as np

from rl.core import Processor
class CustomProcessor(Processor):
    """Converts raw board observations into the two inputs of the Q-network."""

    def process_state_batch(self, obs_batch):
        """Turn a batch of boards into ``[planes, marks]``.

        Every observation is viewed as a 6x7 grid. ``planes`` is a list with
        one (6, 7, 2) array per board: channel 0 holds a 1 wherever the board
        holds a 2 (and 0 where it holds a 1), channel 1 is the board with
        every 2 zeroed out. ``marks`` is a list with one integer (1 or 2) per
        board, derived from the counts of 1s and 2s.
        """
        boards = np.array(obs_batch).reshape(-1, 6, 7)

        planes, marks = [], []
        for board in boards:
            is_two = board == 2
            is_one = board == 1

            # Channel 0: 2 -> 1 and 1 -> 0; any other value is kept as-is.
            twos_plane = np.where(is_two, 1, np.where(is_one, 0, board))
            # Channel 1: 2 -> 0; everything else is kept as-is.
            ones_plane = np.where(is_two, 0, board)
            planes.append(np.stack((twos_plane, ones_plane), axis=2))

            # NOTE(review): if the player using 1 always moves first, a legal
            # position never has more 2s than 1s, so this would always yield 2.
            # Confirm whether that is intended; my_agent carries an identical
            # copy of this rule, so the two must be changed together.
            more_twos = np.count_nonzero(is_two) > np.count_nonzero(is_one)
            marks.append(1 if more_twos else 2)

        return [planes, marks]

    def process_action(self, action):
        """Cast the chosen action to a plain Python ``int``."""
        return int(action)

In [None]:
from sklearn.utils import shuffle

def make_idiot():
    """Build a naive opponent with a fixed, randomly drawn column preference.

    Every agent returned by this factory keeps the same preference order for
    its whole lifetime and always plays its most preferred column whose top
    cell is still empty.

    Returns:
        A callable ``idiot(observation, configuration)`` that returns a column
        index.
    """
    # Function-scope import keeps this cell runnable on its own. Note that the
    # ordering is drawn from the standard-library RNG, so seed ``random`` (not
    # numpy) if a reproducible opponent is needed.
    import random

    # One permutation per board width, drawn on first use, so the ordering
    # covers every column of the configured board instead of a hard-coded 7.
    preference_by_width = {}

    def idiot(observation, configuration):
        """Return the most preferred column that is still open.

        Raises:
            IndexError: if no column has an empty top cell.
        """
        board = observation.board
        columns = configuration.columns
        if columns not in preference_by_width:
            preference_by_width[columns] = random.sample(range(columns), columns)
        # The first ``columns`` cells are checked; a 0 there means the column is open.
        options = {c for c in range(columns) if board[c] == 0}
        return [c for c in preference_by_width[columns] if c in options][0]
    return idiot

In [None]:
import random
class ConnectTrainer():
    """Wraps ``env.train`` behind ``reset()`` / ``step()`` and reshapes rewards.

    ``reset`` returns the board, ``step`` returns
    ``(board, reward, done, info)``. The opponent line-up is produced by
    ``self.get_players``, which ``set_difficulty`` swaps out.
    """

    def __init__(self, env, configuration):
        self.configuration = configuration
        self.env = env
        # Invoked as ``self.get_players(self)``; returns the agent list given to env.train.
        self.get_players = lambda self: [None, 'random']

    def reset(self):
        """Start a new training session and return the initial board."""
        line_up = self.get_players(self)
        self.trainer = self.env.train(line_up)
        return self.trainer.reset().board

    def step(self, action):
        """Play ``action`` and return ``(board, shaped_reward, done, info)``.

        Reward shaping:
          * ``None`` from the environment (invalid action) becomes -100, unscaled;
          * an unfinished game yields 0;
          * a finished game with reward 0 (treated as a loss) yields -10;
          * any other final reward is kept;
        and all non-``None`` cases are multiplied by ``1 + 5 / pieces`` so that
        longer games shrink both the penalty and the reward.
        """
        state, raw_reward, done, info = self.trainer.step(action)
        board = state.board

        if raw_reward is None:
            return board, -100, done, info

        if not done:
            shaped = 0
        elif raw_reward == 0:
            shaped = -10
        else:
            shaped = raw_reward

        pieces_on_board = np.count_nonzero(board)
        shaped *= 1 + 5 / pieces_on_board
        return board, shaped, done, info

    def set_difficulty(self, difficulty):
        """Select the opponent pool for subsequent ``reset`` calls.

        Below 1 the opponent is "random" or a ``make_idiot`` agent; from 1 it
        is "negamax" or a ``make_idiot`` agent. When a global ``my_agent``
        exists, level 2 mixes "negamax" with ``my_agent`` and level 3 and up
        uses ``my_agent`` only. The seat order is shuffled on every draw.
        """
        if difficulty < 1:
            self.get_players = lambda self: shuffle([None, random.choice(["random", make_idiot()])])
        elif difficulty >= 1:
            self.get_players = lambda self: shuffle([None, random.choice(["negamax", make_idiot()])])

        if 'my_agent' not in globals():
            print('Self play not available')
        elif difficulty >= 3:
            self.get_players = lambda self: shuffle([None, my_agent])
        elif difficulty >= 2:
            self.get_players = lambda self: shuffle([None, random.choice(["negamax", my_agent])])

        print(f'Difficulty set to: {difficulty}.')

In [None]:
from rl.callbacks import Callback

class DifficultyChangeCallback(Callback):
    """Raises the environment's difficulty by one level at a fixed episode interval.

    Args:
        initial_difficulty: starting level. It is not applied to the
            environment by this callback; only the incremented values are.
        difficulty_steepness: number of episodes between two increments.
    """
    def __init__(self, initial_difficulty, difficulty_steepness=150):
        # Run the base initializer so that whatever attributes the Callback
        # base class sets up exist before the callback is attached to an agent.
        super().__init__()
        self.difficulty_steepness = difficulty_steepness
        self.difficulty = initial_difficulty

    def on_episode_begin(self, episode, logs):
        """Bump the difficulty on every ``difficulty_steepness``-th episode.

        Episode 0 is skipped, so the first increment happens at episode
        ``difficulty_steepness``.
        """
        if episode and episode % self.difficulty_steepness == 0:
            self.difficulty += 1
            # ``self.env`` is not assigned in this class; presumably the
            # training loop attaches it to the callback. TODO confirm.
            self.env.set_difficulty(self.difficulty)

# Model definition

In [None]:
import tensorflow as tf
import keras

from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.agents import DQNAgent


# Environment setup
# One action per board column; must match the size of the network's output layer.
nb_actions = 7
env_wrapper = ConnectTrainer(env, env.configuration)

# Model setup
# Two inputs, matching what CustomProcessor.process_state_batch returns:
# a (6, 7, 2) stack of board planes and a single "mark" value per board.
# NOTE: the layers below are created in a fixed order on purpose; do not
# reorder them, since creation order presumably drives the auto-generated
# layer names and the ordering of the exported weights.
input_shape_1 = keras.Input(shape=(6, 7, 2))
input_shape_2 = keras.Input(shape=(1,)) #mark

# Tower 1: 2x2 convolution (100 filters) over the board planes, then a dense layer.
tower_1 = keras.layers.Conv2D(100, kernel_size=(2, 2), padding='same', activation='relu')(input_shape_1)
tower_1 = keras.layers.Dropout(0.02)(tower_1)
tower_1 = keras.layers.Flatten()(tower_1)
tower_1 = keras.layers.Dense(150, activation='relu')(tower_1)
tower_1 = keras.layers.Dropout(0.2)(tower_1)

# Tower 2: same layout as tower 1 with a 3x3 kernel and 50 filters.
tower_2 = keras.layers.Conv2D(50, kernel_size=(3, 3), padding='same', activation='relu')(input_shape_1)
tower_2 = keras.layers.Dropout(0.02)(tower_2)
tower_2 = keras.layers.Flatten()(tower_2)
tower_2 = keras.layers.Dense(150, activation='relu')(tower_2)
tower_2 = keras.layers.Dropout(0.2)(tower_2)

# Tower 3: no convolution, a dense layer directly on the flattened planes.
tower_3 = keras.layers.Flatten()(input_shape_1)
tower_3 = keras.layers.Dense(150, activation='relu')(tower_3)
tower_3 = keras.layers.Dropout(0.2)(tower_3)

# Head: concatenate the three towers with the mark input, then three dense
# layers with increasing dropout, ending in 7 linear outputs (one per action).
merged = keras.layers.Concatenate(axis=1)([tower_1, tower_2, tower_3, input_shape_2])
out = keras.layers.Dense(120, activation='relu')(merged)
out = keras.layers.Dropout(0.3)(out)
out = keras.layers.Dense(80, activation='relu')(out)
out = keras.layers.Dropout(0.4)(out)
out = keras.layers.Dense(50, activation='relu')(out)
out = keras.layers.Dropout(0.5)(out)
out = keras.layers.Dense(7, activation='linear')(out)

model = keras.models.Model([input_shape_1, input_shape_2], out)

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
# The agent uses a 50000-entry replay memory, a Boltzmann policy with tau=0.3
# and CustomProcessor to convert boards into the two model inputs.
# NOTE(review): target_model_update=1e-2 is presumably a soft target-network
# update rate in keras-rl -- confirm against the DQNAgent documentation.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy(tau=0.3)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy, processor=CustomProcessor())
dqn.compile(keras.optimizers.Adam(lr=0.0001), metrics=['mae'])

In [None]:
# Create an explicit TF1 session and register it with Keras; my_agent below
# enters this same session and its graph when it queries the live agent.
session = tf.Session()

keras.backend.set_session(session)
# NOTE(review): this runs the initializer for every global variable, which
# presumably also resets the weights of the model built in the previous cell.
# That looks harmless before training; confirm this cell is never re-run after it.
init = tf.global_variables_initializer()
session.run(init)

def my_agent(obs, conf):
    """Choose a column from the Q-network's value estimates.

    When a global ``dqn`` exists (inside the notebook) the live agent is
    queried through the shared TF session. Otherwise the function is running
    as the exported standalone file: the network is rebuilt from the two
    placeholder string literals below, which the export step replaces with the
    architecture JSON and the weight JSON.

    The imports and the board pre-processing live inside the function on
    purpose, because its source text is written out verbatim as a
    self-contained file.

    Args:
        obs: mapping with a ``'board'`` entry (flat sequence of cell values).
        conf: environment configuration; not used.

    Returns:
        int: index of the highest-valued column whose top cell is empty.
    """
    import tensorflow as tf
    import numpy as np
    import json

    if 'dqn' in globals():
        with session.as_default():
            with session.graph.as_default():
                q_values = dqn.compute_q_values(obs['board'])

    else:
        # Rebuilding the network and parsing the weight JSON is expensive, so
        # it is done once and the model is kept in the module globals for all
        # later moves.
        module_state = globals()
        q_net = module_state.get('_cached_q_net')
        if q_net is None:
            q_net = tf.keras.models.model_from_json('#MODEL')
            # set_weights takes a list of arrays. The arrays have different
            # shapes, so wrapping them in a single np.array would be ragged,
            # which recent NumPy releases reject.
            q_net.set_weights([np.array(a) for a in json.loads('#WEIGHTS')])
            module_state['_cached_q_net'] = q_net

        def process_state_batch(obs_batch):
            # Must stay in step with CustomProcessor.process_state_batch, which
            # is what the network saw during training.
            batch = np.array(obs_batch)
            result = batch.reshape(-1, 6, 7)
            def group(asd):
                # Plane a: 1 where the board holds a 2; plane b: board with 2s removed.
                a, b = asd.copy(), asd.copy()
                a[a == 1] = 0
                a[a == 2] = 1
                b[b == 2] = 0
                return np.stack((a,b), axis=2)
            def get_mark(asd):
                return 1 if np.count_nonzero(asd==2) > np.count_nonzero(asd==1) else 2
            res1, res2 = np.array([group(a) for a in result]), np.array([get_mark(a) for a in result])
            return [res1, res2]

        q_values = q_net.predict(process_state_batch(obs['board']))[0]

    def prune_invalid_actions(board, q_values):
        # A non-zero cell at index i means column i cannot take another piece;
        # push its value far down so argmax never selects it. The loop covers
        # exactly the actions the network produced.
        for i in range(len(q_values)):
            if board[i]:
                q_values[i] = -1e7
        return q_values

    def boltzmannize(q_values, tau, clip=(-500., 500.)):
        # Stochastic alternative to argmax (see the commented-out return below):
        # sample an action with probability proportional to exp(q / tau).
        nb_actions = q_values.shape[0]
        exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
        probs = exp_values / np.sum(exp_values)
        return np.random.choice(range(nb_actions), p=probs)

    return int(np.argmax(prune_invalid_actions(obs['board'], q_values)))
    # return int(boltzmannize(prune_invalid_actions(obs['board'], q_values), 0.3))

# Smoke test: query the agent once on an all-zero 42-cell board.
my_agent({'board': [0]*42, 'mark': 1}, {})

In [None]:
# Okay, now it's time to learn something! Visualization is turned off here
# (visualize=False) because rendering slows down training quite a lot.
# You can always safely abort the training prematurely using Ctrl + C.
# The callback starts at difficulty -1 and raises it by one every 24000 episodes.
dqn.fit(env_wrapper, nb_steps=1200000, visualize=False, verbose=1, callbacks=[DifficultyChangeCallback(-1, 24000)])

In [None]:
import json
# Snapshot the trained network as two JSON strings: the architecture, and the
# weight arrays converted to nested lists. write_agent_to_file reads both.
model_json = dqn.model.to_json()
weights_json = json.dumps([weight_array.tolist() for weight_array in dqn.model.get_weights()])

import inspect
import os
import base64

def write_agent_to_file(function, file):
    """Write ``function``'s source to ``file`` with the trained network embedded.

    The source text is obtained with ``inspect.getsource`` and every occurrence
    of the model and weights placeholder markers in it is replaced by the
    module-level ``model_json`` and ``weights_json`` strings.

    Args:
        function: function object whose source is exported.
        file: path of the file to create or overwrite.
    """
    # Build the complete text before touching the file, so that a failure while
    # reading the source or substituting does not leave a truncated file behind.
    source_code = inspect.getsource(function)
    source_code = source_code.replace('#MODEL', model_json)
    source_code = source_code.replace('#WEIGHTS', weights_json)

    with open(file, "w") as f:
        f.write(source_code)
    print(function, "written to", file)
        
    
# Export my_agent, with the serialized network substituted in, as submission.py.
write_agent_to_file(my_agent, "submission.py")

In [None]:
# NOTE(review): the original note here mentioned a stdout-replacement workaround,
# but nothing in this cell replaces stdout and `sys` is not used -- confirm
# whether the import can be dropped.
import sys
from submission import my_agent as agent

# Validate the exported agent by running one game of it against itself and
# checking that both sides end with status "DONE".
env = make("connectx", debug=True)
env.run([agent, agent])
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")

In [None]:
# Run the in-notebook agent for 10 test episodes at difficulty 0, without rendering.
env_wrapper.set_difficulty(0)
dqn.test(env_wrapper,nb_episodes=10,visualize=False)

In [None]:
def mean_reward(rewards):
    """Format the first player's average reward as a percentage string.

    Args:
        rewards: non-empty sequence of per-episode reward sequences; only the
            first entry of each episode is used.

    Returns:
        str: ``(mean + 1) / 2 * 100`` followed by ``%``.
    """
    first_player_total = 0
    for episode_rewards in rewards:
        first_player_total += episode_rewards[0]
    average = first_player_total / float(len(rewards))
    percentage = ((average + 1) / 2) * 100
    return f'{percentage}%'
    

# Run multiple episodes to estimate its performance.
# Each line plays 10 episodes with the exported agent listed first and reports
# the percentage computed by mean_reward.
print("My Agent vs Random Agent:", mean_reward(evaluate("connectx", [agent, "random"], num_episodes=10)))
print("My Agent vs Negamax Agent:", mean_reward(evaluate("connectx", [agent, "negamax"], num_episodes=10)))

In [None]:
# Play one game with 'negamax' listed first and the exported agent second,
# then render the result inline in the notebook.
env.reset()
env.run(['negamax', agent])
env.render(mode="ipython", width=500, height=450)