# Install kaggle-environments

In [None]:
# 1. Enable Internet in the Kernel (Settings side pane)

# 2. The curl cache may need to be purged if v0.1.6 cannot be found (uncomment if needed).
# !curl -X PURGE https://pypi.org/simple/kaggle-environments

# ConnectX environment was defined in v0.1.6
!pip install 'kaggle-environments'

# Create ConnectX Environment

In [None]:
from kaggle_environments import evaluate, make, utils

# Create the ConnectX environment (debug=True is passed straight to `make`).
env = make("connectx", debug=True)
# Render the environment's current state.
env.render()

# Create an Agent

To create the submission, an agent function should be fully encapsulated (no external dependencies).  

When your agent is being evaluated against others, it will not have access to the Kaggle docker image.  Only the following can be imported: Python Standard Library Modules, gym, numpy, scipy, pytorch (1.3.1, cpu only), and more may be added later.



## DQN Agent declaration

In [None]:
import tensorflow as tf

class DqnAgent:
    """
    DQN Agent: the agent that explores the game and
    should eventually learn how to play the game.
    """
    def __init__(self):
        self.q_net = self._build_dqn_model()
        self.target_q_net = self._build_dqn_model()
        self.exploration_decay = 1

    def policy(self, state, mark):
        """
        Takes a state from the game environment and returns
        a action that should be taken given the current game
        environment.
        """
        if np.random.random() < 0.05 + self.exploration_decay:
            return int(np.random.choice([c for c in range(7) if state[:,:,:,0].ravel()[c] == 0 and state[:,:,:,1].ravel()[c] == 0]))
        
        action_q = self.q_net.predict([state, mark])[0]
        for i in range(7):
            if state[:,:,:,0].ravel()[i] != 0 or state[:,:,:,1].ravel()[i] != 0:
                action_q[i] = -1e7
        action = int(np.argmax(action_q))
        return action

    def train(self, batch):
        state_batch, next_state_batch, action_batch, reward_batch, done_batch, mark_batch = batch
        current_q = self.q_net.predict([state_batch, mark_batch])
        target_q = np.copy(current_q)
        next_q = self.target_q_net.predict([next_state_batch, mark_batch])
        max_next_q = np.amax(next_q, axis=1)
        for i in range(state_batch.shape[0]):
            target_q[i][action_batch[i]] = reward_batch[i] if done_batch[i] else reward_batch[i] + 0.95 * max_next_q[i]
        result = self.q_net.fit(x=(state_batch, mark_batch), y=target_q, verbose=0)
        self.exploration_decay *= 0.99
        return result.history['loss']

    @staticmethod
    def _build_dqn_model():
        """
        Builds a deep neural net which predicts the Q values for all possible
        actions given a state. The input should have the shape of the state
        (which is rows x columns + who_starts (6x7=42) in ConnectX), and the output should have the same shape as
        the action space (which is 7 in ConnectX) since we want 1 Q value per
        possible action.

        :return: the Q network
        """
        input_shape_1 = tf.keras.Input(shape=(6, 7, 2))
        input_shape_2 = tf.keras.Input(shape=(1,)) #mark
        tower_1 = tf.keras.layers.Conv2D(80, kernel_size=(2, 2), padding='same', activation='relu')(input_shape_1)
        tower_1 = tf.keras.layers.Dropout(0.1)(tower_1)
        tower_2 = tf.keras.layers.Conv2D(80, kernel_size=(3, 3), padding='same', activation='relu')(input_shape_1)
        tower_2 = tf.keras.layers.Dropout(0.1)(tower_2)
        merged = tf.keras.layers.Concatenate(axis=1)([tower_1, tower_2])
        merged = tf.keras.layers.Flatten()(merged)
        merged = tf.keras.layers.Concatenate(axis=1)([merged, input_shape_2])
        out = tf.keras.layers.Dense(80, activation='relu', kernel_initializer='random_normal')(merged)
        out = tf.keras.layers.Dropout(0.2)(out)
        out = tf.keras.layers.Dense(50, activation='relu', kernel_initializer='random_normal')(out)
        out = tf.keras.layers.Dropout(0.5)(out)
        out = tf.keras.layers.Dense(7, activation='linear', kernel_initializer='random_normal')(out)
        q_net = tf.keras.models.Model([input_shape_1, input_shape_2], out)
        
#         q_net = tf.keras.Sequential()
#         q_net.add(tf.keras.layers.Dense(128, input_dim=42, activation='relu', kernel_initializer='random_normal'))
#         q_net.add(tf.keras.layers.Dense(64, activation='relu', kernel_initializer='random_normal'))
#         q_net.add(tf.keras.layers.Dense(7, activation='linear', kernel_initializer='random_normal'))
        q_net.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
        return q_net
    
    def update_target_network(self):
        self.target_q_net.set_weights(self.q_net.get_weights())

## Replay Buffer declaration

In [None]:
from collections import deque
import numpy as np
import random

class ReplayBuffer:
    """
    Bounded FIFO store of gameplay transitions with uniform random sampling.
    """

    def __init__(self):
        # Once a million transitions are held, the deque drops the oldest ones.
        self.gameplay_experiences = deque(maxlen=1000000)

    def store_gameplay_experience(self, state, next_state, reward, action, done, mark):
        """Append one transition to the buffer."""
        transition = (state, next_state, reward, action, done, mark)
        self.gameplay_experiences.append(transition)

    def sample_gameplay_batch(self):
        """
        Draw a random batch holding one twentieth of the stored transitions.

        Returns ``[[]]`` while fewer than 20 transitions are stored; otherwise
        the tuple ``(states, next_states, actions, rewards, dones, marks)``
        where states, next_states and marks are numpy arrays and the rest are
        plain lists. Note that actions come before rewards in the result,
        unlike the storage order.
        """
        n_samples = len(self.gameplay_experiences) // 20
        if n_samples == 0:
            return [[]]

        drawn = random.sample(self.gameplay_experiences, n_samples)
        states, next_states, actions, rewards, dones, marks = [], [], [], [], [], []
        for state, next_state, reward, action, done, mark in drawn:
            # The star-unpacking strips the leading length-1 axis of each entry.
            states.append(*state)
            next_states.append(*next_state)
            rewards.append(reward)
            actions.append(action)
            dones.append(done)
            marks.append(*mark)
        return (
            np.array(states),
            np.array(next_states),
            actions,
            rewards,
            dones,
            np.array(marks),
        )

# Plotting function

In [None]:
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import numpy as np
import random
from collections import deque


def plot_device(deque_mean_len=10):
    """
    Create a live training plot and return the function that updates it.

    The figure shows two curves against the episode number: the running mean
    of the reward (blue, left axis) and of the loss (red, right axis). Each
    running mean covers the last ``deque_mean_len`` reported values.

    :param deque_mean_len: window length of the running means
    :return: ``plot(episode_number, reward, loss)``, which appends one data
        point and redraws the figure in the notebook output
    """
    fig = plt.figure(figsize=(16, 8))
    reward_ax = fig.add_subplot(1, 1, 1)
    loss_ax = reward_ax.twinx()

    episodes = []
    reward_curve = []
    reward_window = deque(maxlen=deque_mean_len)
    loss_curve = []
    loss_window = deque(maxlen=deque_mean_len)

    def plot(episode_number, reward, loss):
        episodes.append(episode_number)
        reward_window.append(reward)
        reward_curve.append(sum(reward_window) / len(reward_window))
        loss_window.append(loss)
        loss_curve.append(sum(loss_window) / len(loss_window))

        reward_ax.cla()
        loss_ax.cla()
        # Clearing resets axis state; keep the loss scale on the right side.
        loss_ax.yaxis.tick_right()
        reward_line, = reward_ax.plot(episodes, reward_curve, 'b', label='Reward')
        loss_line, = loss_ax.plot(episodes, loss_curve, 'r', label='Loss')

        # Limits must be set after cla(), which would otherwise discard them.
        # A single point would give identical left/right limits, so skip it.
        if len(episodes) > 1:
            reward_ax.set_xlim(episodes[0], episodes[-1])
            loss_ax.set_xlim(episodes[0], episodes[-1])
        reward_ax.legend(handles=[reward_line, loss_line], loc='upper left')

        # display first, then a deferred clear: the figure stays visible until
        # the next output arrives and replaces it.
        display(fig)
        clear_output(wait=True)

    return plot

## Putting it all together

In [None]:
import random
import plotly.graph_objs as go
from IPython.display import display

def get_state_input(state):
    """
    Convert an observation (or a raw board array) into the network input.

    Anything that is not already a numpy array is read through its 'board'
    entry. The board is reshaped to (-1, 6, 7) and split into two channels
    stacked on the last axis: channel 0 holds 1 where the board holds 2,
    channel 1 holds 1 where the board holds 1. The input is not modified.
    """
    if not isinstance(state, np.ndarray):
        state = np.array([state['board']])
    boards = state.reshape(-1, 6, 7)

    is_one = boards == 1
    is_two = boards == 2

    # Channel 0: cells holding 2 become 1, cells holding 1 are erased.
    channel_two = boards.copy()
    channel_two[is_one] = 0
    channel_two[is_two] = 1

    # Channel 1: cells holding 1 are kept, cells holding 2 are erased.
    channel_one = boards.copy()
    channel_one[is_two] = 0

    return np.stack((channel_two, channel_one), axis=3)

def idiot_maker():
    """
    Build a trivial opponent with a fixed, randomly shuffled column
    preference: on every move it plays the most preferred column whose
    top cell is still empty.
    """
    preference = list(range(7))
    random.shuffle(preference)

    def idiot(observation, configuration):
        cells = observation.board
        n_columns = configuration.columns
        # A column is playable while its top-row cell is empty.
        open_columns = {col for col in range(n_columns) if cells[col] == 0}
        ranked = [col for col in preference if col in open_columns]
        return ranked[0]

    return idiot

def collect_gameplay_experience(env, agent, buffer, difficulty):
    """
    Play one full game of "env" with "agent" choosing the moves and store
    every transition in "buffer" for later training.

    The opponent is picked at random from a pool that grows with
    "difficulty": the bot from idiot_maker (always), 'random' (>= 1), the
    global my_agent if it is defined (>= 2) and 'negamax' (>= 3). The seat
    order is shuffled only at difficulty 0; otherwise the None entry handed
    to env.train stays in second position.

    Returns the final reward (after shaping) and a list counting how often
    each of the 7 columns was played.
    """
    opponents = [idiot_maker()]
    if difficulty >= 1:
        opponents.append('random')
    if difficulty >= 2 and 'my_agent' in globals():
        opponents.append(my_agent)
    if difficulty >= 3:
        opponents.append('negamax')

    seats = [random.choice(opponents), None]
    if difficulty == 0:
        random.shuffle(seats)

    trainer = env.train(seats)
    first_observation = trainer.reset()
    mark = np.array([first_observation.mark])
    state = get_state_input(first_observation)

    steps = 0
    action_usage = [0] * 7
    while True:
        steps += 1
        action = agent.policy(state, mark)
        action_usage[action] += 1
        observation, reward, done, _ = trainer.step(action)
        next_state = get_state_input(observation)

        if done:
            # Final reward is scaled up for short games, and negative
            # outcomes are amplified tenfold.
            reward *= 1 + 3 / steps
            if reward < 0:
                reward *= 10

        buffer.store_gameplay_experience(state, next_state, reward, action, done, mark)
        state = next_state
        if done:
            break
    return reward, action_usage


def _difficulty_for(avg_reward):
    """Map the mean reward of a reporting window to an opponent difficulty (0-3)."""
    if avg_reward > 0:
        return 3
    if avg_reward > -5:
        return 2
    if avg_reward > -10:
        return 1
    return 0


def train_model(env, agent, buffer, episodes=50000, report_every=20, min_batch_size=20):
    """
    Trains a DQN agent to play the ConnectX game.

    Each episode plays one game, then samples a batch from the buffer and
    trains on it. Episodes whose sampled batch is smaller than
    ``min_batch_size`` are skipped (and the reward accumulator is reset).
    On every ``report_every``-th episode that trained, progress is printed
    and plotted, the target network is synchronised and the opponent
    difficulty is re-derived from the mean reward of the window.

    :param env: the ConnectX environment
    :param agent: agent exposing policy, train, update_target_network and
        exploration_decay
    :param buffer: replay buffer exposing store/sample methods
    :param episodes: number of games to play
    :param report_every: reporting / target-sync interval in episodes; also
        the divisor used for the reported means
    :param min_batch_size: smallest sampled batch that is trained on
    """
    pld = plot_device()
    cum_reward = 0
    cum_loss = 0
    cum_action_usage = np.zeros(7)
    difficulty = 0
    for episode_cnt in range(episodes):
        reward, action_usage = collect_gameplay_experience(env, agent, buffer, difficulty)
        cum_reward += reward
        cum_action_usage = cum_action_usage + np.array(action_usage)
        gameplay_experience_batch = buffer.sample_gameplay_batch()
        if len(gameplay_experience_batch[0]) < min_batch_size:
            # Not enough data to train on yet.
            cum_reward = 0
            continue
        loss, = agent.train(gameplay_experience_batch)
        cum_loss += loss
        if episode_cnt % report_every != 0:
            continue

        avg_reward = cum_reward / report_every
        avg_loss = cum_loss / report_every
        print(f'Episode No {episode_cnt}. Training with {len(gameplay_experience_batch[0])}. Explor. {0.05 + agent.exploration_decay:.2f}')
        print(f'Last {report_every} episodes: Mean loss - {avg_loss:.4f} Mean reward - {avg_reward:.2f}')

        # NOTE(review): usage counts are divided by 7, not by the window
        # length; kept as-is since it only rescales the bar chart.
        display(go.Figure(go.Bar(x=list(range(7)), y=cum_action_usage/7)))

        agent.update_target_network()
        pld(episode_cnt, avg_reward, avg_loss)

        difficulty = _difficulty_for(avg_reward)
        cum_reward = 0
        cum_loss = 0
        cum_action_usage = np.zeros(7)

    print('Done')

## Run model

In [None]:
# Replay buffer that is filled by, and sampled from, the training loop.
experience_buffer = ReplayBuffer()

In [None]:
# Fresh agent; constructing it builds both the online and the target Q-network.
trained_agent = DqnAgent()

In [None]:
# Run the full training loop (prints and plots progress while it runs).
train_model(env, trained_agent, experience_buffer)

In [None]:
import json
# Serialise the trained online network so it can be embedded in the submission
# file: the weights as nested lists in JSON, the architecture as Keras model JSON.
weights_json = json.dumps([w.tolist() for w in trained_agent.q_net.get_weights()])
model_json = trained_agent.q_net.to_json()

In [None]:
def my_agent(observation, configuration):
    """
    Submission agent: play the column with the highest predicted Q-value
    among the columns whose top cell is empty.

    The function is self-contained on purpose (imports inside the body,
    board preprocessing inlined) because its source text is what gets
    written to the submission file.
    """
    import tensorflow as tf
    import json
    import numpy as np
    
    # Inside the notebook, reuse the live network of the trained agent.
    if 'trained_agent' in globals():
        q_net = trained_agent.q_net
    # The two marker comments below are replaced textually with model-loading
    # code when the submission file is written; keep them exactly as they are.
    # If neither path defines q_net, the predict call below raises NameError.
    #MODEL
    #WEIGHTS
    
    mark = np.array([observation.mark])
    board = np.array(observation.board)
    board = board[None, :].reshape(-1, 6, 7)
    # Split the board into two channels: `a` marks cells holding 2,
    # `b` marks cells holding 1.
    a, b = board.copy(), board.copy()
    a[a == 1] = 0
    a[a == 2] = 1
    b[b == 2] = 0
    result = np.stack((a,b), axis=3)
    action_q = q_net.predict([result, mark])[0]
    # The first 7 board cells are the top row: a non-zero cell means the
    # column is full, so make it unselectable.
    for i in range(7):
        if observation.board[i] != 0:
            action_q[i] = -1e7
    action = int(np.argmax(action_q))
    return action


# Evaluate your Agent

In [None]:
def mean_reward(rewards):
    """
    Summarise evaluation results as a percentage string.

    Averages the first entry of every episode's reward pair and maps the
    mean linearly so that -1 becomes '0.0%' and +1 becomes '100.0%'.
    """
    total = 0
    for episode in rewards:
        total += episode[0]
    mean = total / float(len(rewards))
    percentage = ((mean + 1) / 2) * 100
    return "{}%".format(percentage)
    

# Run multiple episodes to estimate performance: 100 games against the
# "random" agent and 10 against "negamax". my_agent is listed first in both.
print("My Agent vs Random Agent:", mean_reward(evaluate("connectx", [my_agent, "random"], num_episodes=100)))
print("My Agent vs Negamax Agent:", mean_reward(evaluate("connectx", [my_agent, "negamax"], num_episodes=10)))

# Test your Agent

In [None]:
env.reset()
# Play as the second agent against the built-in "negamax" agent.
env.run(["negamax", my_agent])
env.render(mode="ipython", width=500, height=450)

# Write Submission File



In [None]:
import inspect
import os
import base64

def write_agent_to_file(function, file):
    """
    Write the source of ``function`` to ``file`` as a standalone submission.

    The '#MODEL' and '#WEIGHTS' markers in the function source are replaced
    with code that rebuilds the network from the module-level ``model_json``
    and loads the weights from the module-level ``weights_json``.

    :param function: the agent function whose source is exported
    :param file: path of the file to write
    """
    # Build the full text before opening the file, so a failure here does not
    # leave a truncated submission behind.
    source_code = inspect.getsource(function)
    # repr() produces a correctly escaped Python string literal for the JSON.
    source_code = source_code.replace(
        '#MODEL',
        f"q_net = tf.keras.models.model_from_json({model_json!r})",
    )
    # Pass a plain list of arrays: the per-layer weights have different
    # shapes, and wrapping them in one np.array fails on recent NumPy.
    source_code = source_code.replace(
        '#WEIGHTS',
        f"q_net.set_weights([np.array(a) for a in json.loads({weights_json!r})])",
    )

    with open(file, "w") as f:
        f.write(source_code)
    print(function, "written to", file)
        
    
# Export my_agent, with the trained model embedded, as the submission file.
write_agent_to_file(my_agent, "submission.py")

# Validate Submission
Play your submission against itself.  This is the first episode the competition will run to weed out erroneous agents.

Why validate? This roughly verifies that your submission is fully encapsulated and can be run remotely.

In [None]:
# NOTE(review): an earlier note here mentioned a stdout-replacement workaround,
# but this cell contains no such code; `sys` is imported and not used below.
import sys
from submission import my_agent as agent

# Play the exported agent against itself in a fresh environment and check
# that both seats finished with status "DONE".
env = make("connectx", debug=True)
env.run([agent, agent])
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")

# Submit to Competition

1. Commit this kernel.
2. View the committed version.
3. Go to "Data" section and find submission.py file.
4. Click "Submit to Competition"
5. Go to [My Submissions](https://kaggle.com/c/connectx/submissions) to view your score and episodes being played.