# Install kaggle-environments

In [None]:
# 1. Enable Internet in the Kernel (Settings side pane)

# 2. The curl cache may need to be purged if v0.1.4 cannot be found (uncomment if needed).
# !curl -X PURGE https://pypi.org/simple/kaggle-environments

# ConnectX environment was defined in v0.1.4
    !pip install 'kaggle-environments>=0.1.4'

# Create ConnectX Environment

In [None]:
from kaggle_environments import evaluate, make

# Build the "connectx" environment with debug=True passed to make().
env = make("connectx", debug=True)
# Render the environment in its current state.
env.render()

# Create an Agent

To create the submission, an agent function should be fully encapsulated (no external dependencies).  

When your agent is being evaluated against others, it will not have access to the Kaggle docker image.  Only the following can be imported: Python Standard Library Modules, gym, numpy, scipy (more may be added later). 

# Critic Actor Agent

In [None]:
!pip install 'keras'
    
from keras import backend as K
from keras.layers import Activation, Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import numpy as np
    
class Agent_Critic_Actor(object):
    """One-step actor-critic agent built from small dense Keras networks.

    Three models are built on the same two hidden layers:

    * ``actor``  - trained with a log-likelihood loss weighted by ``delta``.
    * ``critic`` - predicts a single state value, trained with MSE.
    * ``policy`` - outputs the action probabilities only (used for acting).

    params:
        alpha: learning rate of the actor optimizer
        beta: learning rate of the critic optimizer
        gamma: discount applied to the value of the next state
        n_actions: number of discrete actions
        layer1_size: units in the first hidden layer
        layer2_size: units in the second hidden layer
        input_dims: length of the flat observation vector
    """

    def __init__(self, alpha, beta, gamma=0.99, n_actions=4,
                 layer1_size=1024, layer2_size=512, input_dims=8):
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta
        self.input_dims = input_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.n_actions = n_actions

        # `choose_action` reads `action_space`; `actionspace` is kept as an
        # alias because that was the attribute name originally assigned here.
        self.action_space = list(range(self.n_actions))
        self.actionspace = self.action_space

        self.actor, self.critic, self.policy = self.build_actor_critic_network()

    def build_actor_critic_network(self):
        """Build and compile the networks.

        return:
            (actor, critic, policy) Keras models sharing the hidden layers
        """
        state_input = Input(shape=(self.input_dims,))
        # Extra actor input, used only inside the actor's loss.
        delta = Input(shape=[1])
        dense1 = Dense(self.fc1_dims, activation='relu')(state_input)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)
        values = Dense(1, activation='linear')(dense2)

        def custom_loss(y_true, y_pred):
            # Clip so that log() never sees exactly 0 or 1.
            out = K.clip(y_pred, 1e-8, 1 - 1e-8)
            # y_true is one-hot, so only the taken action's log-prob survives.
            log_lik = y_true * K.log(out)

            return K.sum(-log_lik * delta)

        actor = Model(inputs=[state_input, delta], outputs=[probs])
        actor.compile(optimizer=Adam(learning_rate=self.alpha), loss=custom_loss)
        critic = Model(inputs=[state_input], outputs=[values])
        critic.compile(optimizer=Adam(learning_rate=self.beta),
                       loss='mean_squared_error')
        # Same layers as the actor, but without the `delta` input.
        policy = Model(inputs=[state_input], outputs=[probs])
        return actor, critic, policy

    def choose_action(self, observation):
        """Sample an action from the policy's probabilities for `observation`.

        params:
            observation: flat 1-D observation (array-like)
        return:
            one element of `self.action_space`
        """
        state = np.asarray(observation)[np.newaxis, :]
        probabilities = self.policy.predict(state)[0]
        return np.random.choice(self.action_space, p=probabilities)

    def learn(self, state, action, reward, state_, done):
        """Run one fit step of the critic and the actor for one transition.

        params:
            state: observation before the action (flat, array-like)
            action: index of the action that was taken
            reward: reward received for the transition
            state_: observation after the action (flat, array-like)
            done: truthy when the episode ended; drops the bootstrap term
        """
        state = np.asarray(state)[np.newaxis, :]
        state_ = np.asarray(state_)[np.newaxis, :]

        critic_value_ = self.critic.predict(state_)
        critic_value = self.critic.predict(state)

        # The next state's value only counts while the episode continues.
        target = reward + self.gamma * critic_value_ * (1 - int(done))
        delta = target - critic_value

        # One-hot encoding of the action taken.
        actions = np.zeros([1, self.n_actions])
        actions[0, action] = 1.0

        self.actor.fit([state, delta], actions, verbose=0)
        self.critic.fit(state, target, verbose=0)

# Q Learning

In [None]:
from random import choice
class QLearn:
    """Tabular Q-learning agent.

    Board states are encoded as strings and used as keys of `Qtable`; every
    entry holds one value per action (column).

    params:
        max_num_actions: number of actions (columns) available on the board
        gamma: importance of the value of the next state. [0,1]
    """

    # Characters used to encode cell values; anything else becomes 'd'.
    _CELL_CODES = {0: 'a', 1: 'b', 2: 'c'}

    def __init__(self, max_num_actions, gamma=0.9):
        self.max_num_actions = max_num_actions
        self.gamma = gamma
        self.Qtable = {}
        self.batch = []

    def state_to_str(self, state):
        """Encode an iterable of cell values as a string usable as a dict key."""
        return ''.join(self._CELL_CODES.get(cell, 'd') for cell in state)

    def choose_action(self, state_as_list):
        """Choose an action for the given state.

        An unseen state gets a zero-initialised Q-table entry and a random
        playable action. A known state returns the playable action with the
        highest stored value (lowest index wins ties).

        params:
            state_as_list: the board as a flat list; an object exposing the
                list as a ``board`` attribute is accepted as well
        return:
            the chosen action index
        """
        # Agent callbacks receive an observation object, not the bare board.
        board = getattr(state_as_list, "board", state_as_list)
        state = self.state_to_str(board)
        actions = self.get_actions(board)

        # If it is the first time create a space for this state
        # and choose a random action
        if state not in self.Qtable:
            self.Qtable[state] = [0] * self.max_num_actions
            if actions:
                return choice(actions)
            return 1  # Game over

        q_values = self.Qtable[state]
        if not actions:
            # Nothing is playable: fall back to the plain argmax.
            return q_values.index(max(q_values))
        # Restrict the argmax to playable actions so a full column is never picked.
        return max(actions, key=lambda column: q_values[column])

    def store(self, reward, state, action, next_state):
        """Store the necessary parameters to update the q table later."""
        self.batch.append({"reward": reward, "state": state,
                           "action": action, "next_state": next_state})

    def get_actions(self, state):
        """Return the possible actions given a state.

        param:
            state: a list that represents the board
        return:
            the action indices ``c`` in ``range(max_num_actions)`` for which
            ``state[c] == 0``
        """
        return [c for c in range(self.max_num_actions) if state[c] == 0]

    def update_qtable(self, state_reward, state_as_list, action, next_state_as_list):
        """Update the q table for one stored transition.

        The value of state-action is the reward plus a portion (`gamma`) of
        the value of the next state. The value of the next state is the max
        value stored for it, or 0 when it has never been seen. A reward of
        None counts as 0.
        """
        state = self.state_to_str(state_as_list)
        next_state = self.state_to_str(next_state_as_list)
        next_state_value = 0
        if next_state in self.Qtable:
            next_state_value = max(self.Qtable[next_state])
        reward = 0 if state_reward is None else state_reward

        # Create the entry on demand instead of raising KeyError for a state
        # that never went through choose_action.
        q_values = self.Qtable.setdefault(state, [0] * self.max_num_actions)
        q_values[action] = reward + self.gamma * next_state_value

    def learn(self):
        """Update the q table for every stored transition, then clear the batch.

        This function is used when an episode terminates.
        """
        for x in self.batch:
            self.update_qtable(x["reward"], x["state"], x["action"], x["next_state"])
        # Restart batch
        self.batch = []

    # UTILITIES
    def get_qtable_len(self):
        """Return the number of states stored in the q table."""
        return len(self.Qtable)
            
 
        

# Test Q Learning Agent

In [None]:
qLearner = QLearn(env.configuration.columns)

def qlearner_choose_action(observation, configuration):
    """Agent callback for env.run: let the Q-learner pick the action."""
    # The Q-learner works on the flat board list, not on the observation
    # wrapper (the training loop also passes `state.board`).
    return qLearner.choose_action(observation.board)

env.reset()
# Play as the first agent against default "random" agent.
env.run([qlearner_choose_action, "random"])
env.render(mode="ipython", width=500, height=450)

In [None]:
import matplotlib.pyplot as plt
# Per-episode total reward, Q-table size after each episode, every action played.
rewards, qtablelens, actions = [], [], []
qLearner = QLearn(env.configuration.columns)
# Train in first position against the random agent.
MAX_EPISODES = 10000
for episode in range(MAX_EPISODES):
    trainer = env.train([None, "random"])
    state = trainer.reset()
    total_reward = 0
    while not env.done:
        action = qLearner.choose_action(state.board)
        actions.append(action)
        next_state, reward, done, info = trainer.step(action)
        qLearner.store(reward, state.board, action, next_state.board)
        state = next_state

        # A missing reward counts as -5 in the episode total.
        r = reward if reward is not None else -5
        total_reward += r
    # The Q-table is only updated once the episode has finished.
    qLearner.learn()
    qtablelens.append(qLearner.get_qtable_len())
    rewards.append(total_reward)
env.render()

In [None]:
import numpy as np
# One unit-wide bin centred on every action index, so each column gets its
# own bar (bins of width 5 would merge columns 0-4 into a single bar).
bins = np.arange(min(actions), max(actions) + 2) - 0.5
plt.xlim([min(actions) - 1, max(actions) + 1])
plt.hist(actions, bins=bins, alpha=0.5)
plt.title('Actions distribution')
plt.xlabel('actions')
plt.ylabel('count')
plt.show()

In [None]:
# Plot the values collected in `rewards`, in order.
plt.plot(rewards)

In [None]:
# Running mean: entry i is the average of the first i + 1 values in `rewards`.
mean_rewards = []
total_rewards = 0
for index, reward in enumerate(rewards, start=1):
    total_rewards += reward
    mean_reward = total_rewards / index
    mean_rewards.append(mean_reward)
plt.plot(mean_rewards)

In [None]:
# Plot the values collected in `qtablelens`, in order.
plt.plot(qtablelens)

In [None]:
# Running mean: entry i is the average of the first i + 1 values in `qtablelens`.
mean_qtablelens = []
total_qtablelens = 0
for index, qtablelen in enumerate(qtablelens, start=1):
    total_qtablelens += qtablelen
    mean_qtablelen = total_qtablelens / index
    mean_qtablelens.append(mean_qtablelen)
plt.plot(mean_qtablelens)

In [None]:
def qlearner_choose_action(observation, configuration):
    """Agent callback for env.run: let the Q-learner pick the action."""
    # The Q-learner works on the flat board list, not on the observation wrapper.
    return qLearner.choose_action(observation.board)
env.reset()
# Play as the first agent against the "negamax" agent.
env.run([qlearner_choose_action, "negamax"])
env.render(mode="ipython", width=500, height=450)

# Deep Q Learning

Using Keras (TensorFlow) to facilitate building the DQN

In [None]:
!pip install 'keras'
!pip install 'progressbar'
    
from keras import backend as K
from keras.layers import Activation, Dense, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
import numpy as np

In [None]:
from random import choice
class DQN:
    """Small dense network that predicts one value per action from the board.

    params:
        num_input: number of network inputs (length of the flat board)
        num_output: number of network outputs (one per action / column)
        alpha: learning rate of the Adam optimizer
    """

    def __init__(self, num_input, num_output, alpha=0.0001):
        self.num_input = num_input
        self.num_output = num_output
        self.alpha = alpha
        self.nn = self.build_neural_network()
        self.batch = []

    def build_neural_network(self):
        """Build and compile the network: one hidden ReLU layer, linear output."""
        nn = Sequential()
        nn.add(Dense(100, input_dim=self.num_input, activation='relu'))
        # Linear output: a ReLU here could never produce a negative value,
        # so negative targets would be unreachable.
        nn.add(Dense(self.num_output, activation='linear'))
        # Use the configured learning rate (the string 'adam' ignores `alpha`).
        # No 'accuracy' metric: this is a regression on action values.
        nn.compile(loss='mean_squared_error',
                   optimizer=Adam(learning_rate=self.alpha))

        return nn

    def prepare_data(self, state):
        """Wrap `state.board` into a batch of one sample for the network."""
        return np.array([np.asarray(state.board)])

    def prepare_target(self, target):
        """Wrap a target vector into a batch of one sample."""
        return np.array([target])

    def predict(self, state):
        """Return the network output for `state` as a plain list."""
        return self.nn.predict(self.prepare_data(state))[0].tolist()

    def get_max_valid_action(self, state, prediction):
        """Return the valid action with the highest predicted value.

        An action ``c`` is valid when ``state.board[c] == 0``. The lowest
        index wins ties.

        raises:
            IndexError: when no action is valid
        """
        valid_actions = [c for c in range(self.num_output) if state.board[c] == 0]
        if not valid_actions:
            raise IndexError("no valid action: no column has state.board[c] == 0")
        return max(valid_actions, key=lambda c: prediction[c])

    def choose_action(self, state):
        """Choose the valid action that maximizes the predicted value."""
        # Go through the validity filter so a full column is never returned.
        return self.get_max_valid_action(state, self.predict(state))

    def store(self, reward, state, prediction, action, next_state):
        """Store the necessary parameters to train the network later."""
        self.batch.append({"reward": reward, "state": state,
                           "prediction": prediction, "action": action,
                           "next_state": next_state})

    def backprop(self, reward, state, prediction, action, next_state):
        """Fit the network on one stored transition.

        The target is the stored prediction with the taken action's value
        replaced by the reward. `next_state` is accepted but not used.
        """
        data = self.prepare_data(state)
        # Copy so the stored prediction list is left untouched.
        target = np.array(prediction, dtype=float)
        # A None reward would become NaN in the float target and poison the
        # weights; count it as 0 instead.
        target[action] = 0 if reward is None else reward
        self.nn.fit(data, self.prepare_target(target), verbose=0)

    def learn(self):
        """Train on every stored transition, then clear the batch.

        This function is used when an episode terminates.
        """
        for x in self.batch:
            self.backprop(x["reward"], x["state"], x["prediction"],
                          x["action"], x["next_state"])
        # Restart batch
        self.batch = []


In [None]:
cols = env.configuration.columns
rows = env.configuration.rows
# Fresh network: cols * rows inputs and cols outputs.
dqn = DQN(cols * rows, cols)

def dqn_choose_action(observation, configuration):
    """Agent callback for env.run: let the DQN pick the action for `observation`."""
    return dqn.choose_action(observation)

env.reset()
# Play as the first agent against default "random" agent.
env.run([dqn_choose_action, "random"])
env.render(mode="ipython", width=500, height=450)

In [None]:
import matplotlib.pyplot as plt
from progressbar import ProgressBar
pbar = ProgressBar()

cols, rows = env.configuration.columns, env.configuration.rows
# Per-episode total reward and every action played during training.
rewards, actions = [], []
dqn = DQN(cols * rows, cols)
# Train in first position against the random agent.
MAX_EPISODES = 100000
for episode in pbar(range(MAX_EPISODES)):
    trainer = env.train([None, "random"])
    state = trainer.reset()
    total_reward = 0
    while not env.done:
        prediction = dqn.predict(state)
        action = dqn.get_max_valid_action(state, prediction)
        actions.append(action)
        next_state, reward, done, info = trainer.step(action)
        dqn.store(reward, state, prediction, action, next_state)
        state = next_state

        # A missing reward counts as -5 in the episode total.
        r = reward if reward is not None else -5
        total_reward += r
    # The network is only trained once the episode has finished.
    dqn.learn()
    rewards.append(total_reward)
env.render()

In [None]:
import numpy as np
# One unit-wide bin centred on every action index, so each column gets its
# own bar (bins of width 5 would merge columns 0-4 into a single bar).
bins = np.arange(min(actions), max(actions) + 2) - 0.5
plt.xlim([min(actions) - 1, max(actions) + 1])
plt.hist(actions, bins=bins, alpha=0.5)
plt.title('Actions distribution')
plt.xlabel('actions')
plt.ylabel('count')
plt.show()

In [None]:
# Plot the values collected in `rewards`, in order.
plt.plot(rewards)

In [None]:
# Running mean: entry i is the average of the first i + 1 values in `rewards`.
mean_rewards = []
total_rewards = 0
for index, reward in enumerate(rewards, start=1):
    total_rewards += reward
    mean_reward = total_rewards / index
    mean_rewards.append(mean_reward)
plt.plot(mean_rewards)

In [None]:
# This agent randomly chooses one of the columns c with observation.board[c] == 0
# (presumably the columns that are not full yet).
def my_agent(observation, configuration):
    """Return a random column index among those whose board cell is 0."""
    # Imported locally so the function stays self-contained when its source
    # is copied on its own.
    from random import choice
    open_columns = [
        column
        for column in range(configuration.columns)
        if observation.board[column] == 0
    ]
    return choice(open_columns)

# Test your Agent

In [None]:
env.reset()
# Play as the first agent against the "random" agent, then render the result.
env.run([my_agent, "random"])
env.render(mode="ipython", width=500, height=450)

# Debug/Train your Agent

In [None]:
# Play as first position against random agent.
trainer = env.train([None, "random"])
observation = trainer.reset()
# Step manually until the environment reports it is done, printing every move.
while not env.done:
    my_action = my_agent(observation, env.configuration)
    print("Config:     ", str(env.configuration))
    print("My Action", my_action)
    # reward, done and info are unpacked but not used in this loop.
    observation, reward, done, info = trainer.step(my_action)
    
    # env.render(mode="ipython", width=100, height=90, header=False, controls=False)
env.render()

# Evaluate your Agent

In [None]:
def qlearner_choose_action(observation, configuration):
    """Agent callback for evaluate: let the Q-learner pick the action."""
    # The Q-learner works on the flat board list, not on the observation wrapper.
    return qLearner.choose_action(observation.board)

def mean_reward(rewards):
    """Return the first agent's mean reward over the scored episodes.

    params:
        rewards: sequence of per-episode reward pairs; index 0 is the first
            agent's reward
    return:
        the mean of the first agent's rewards, skipping episodes where it is
        None; 0.0 when there is no scored episode
    """
    # Dividing by the episode count (instead of by the sum of both agents'
    # rewards) avoids a ZeroDivisionError when the two rewards cancel out and
    # a TypeError when the opponent's reward is None.
    scores = [r[0] for r in rewards if r[0] is not None]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# Run multiple episodes to estimate its performance.
print("My Agent vs Random Agent:", mean_reward(evaluate("connectx", [qlearner_choose_action, "random"], num_episodes=10)))
print("My Agent vs Negamax Agent:", mean_reward(evaluate("connectx", [qlearner_choose_action, "negamax"], num_episodes=10)))

# Write Submission File



In [None]:
import inspect
import os

def write_agent_to_file(function, file):
    """Write the source code of `function` to `file`.

    An existing file is appended to; a missing one is created. A confirmation
    line is printed after the write.
    """
    file_already_there = os.path.exists(file)
    open_mode = "a" if file_already_there else "w"
    with open(file, open_mode) as out:
        source_code = inspect.getsource(function)
        out.write(source_code)
        print(function, "written to", file)

# Write my_agent's source to submission.py. The helper appends when the file
# already exists, so re-running this cell adds another copy.
write_agent_to_file(my_agent, "submission.py")

# Submit to Competition

1. Commit this kernel.
2. View the committed version.
3. Go to "Data" section and find submission.py file.
4. Click "Submit to Competition"
5. Go to [My Submissions](https://kaggle.com/c/connectx/submissions) to view your score and episodes being played.