# Install kaggle-environments

In [None]:
# 1. Enable Internet in the Kernel (Settings side pane)

# 2. The package index cache may need to be purged with curl if v0.1.6 cannot be found (uncomment the line below if needed).
# !curl -X PURGE https://pypi.org/simple/kaggle-environments

# ConnectX environment was defined in v0.1.6
!pip install 'kaggle-environments>=0.1.6'

# Import necessary packages

In [None]:
from learntools.core import binder
binder.bind(globals())
from learntools.game_ai.ex4 import *

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

!pip install 'tensorflow==1.15.0'

import tensorflow as tf
from kaggle_environments import make, evaluate, utils
from gym import spaces

!apt-get update
!apt-get install -y cmake libopenmpi-dev python3-dev zlib1g-dev
!pip install "stable-baselines[mpi]==2.9.0"

from stable_baselines.bench import Monitor 
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1, A2C, ACER, ACKTR, TRPO
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.common.policies import CnnPolicy

# Create ConnectX Environment

There's a bit of extra work that we need to do to make the environment compatible with Stable Baselines.  For this, we define the `ConnectFourGym` class below.  This class implements ConnectX as an [OpenAI Gym environment](http://gym.openai.com/docs/) and uses several methods:
- `reset()` will be called at the beginning of every game.  It returns the starting game board as a 2D numpy array with 6 rows and 7 columns.
- `change_reward()` customizes the rewards that the agent receives.  (_The competition already has its own system for rewards that are used to rank the agents, and this method changes the values to match the rewards system we designed._) 
- `step()` is used to play the agent's choice of action (supplied as `action`), along with the opponent's response.  It returns:
  - the resulting game board (as a numpy array), 
  - the agent's reward (from the most recent move only: one of `+1`, `-10`, `-1`, or `1/42`), and
  - whether or not the game has ended (if the game has ended, `done=True`; otherwise, `done=False`).

In [None]:
class ConnectFourGym:
    """Gym-style wrapper around the Kaggle ConnectX environment.

    The wrapped agent always plays first; ``agent2`` is the opponent passed to
    the Kaggle trainer. Observations are the board reshaped to
    ``(rows, columns, 1)``.

    Rewards handed to the learner:
      * ``1``                 - the underlying environment reported a reward of 1 (win)
      * ``-1``                - the game ended in any other way (loss or draw)
      * ``1/(rows*columns)``  - the game is still running
      * ``-10``               - the agent picked a column whose first-row cell is occupied
    """

    def __init__(self, agent2="random"):
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.columns)
        # `np.int` was only an alias of the builtin `int` and has been removed
        # from recent NumPy releases; the builtin is the exact equivalent.
        self.observation_space = spaces.Box(low=0, high=2,
                                            shape=(self.rows, self.columns, 1), dtype=int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _board_as_array(self):
        """Return the current flat board as a ``(rows, columns, 1)`` numpy array."""
        return np.array(self.obs['board']).reshape(self.rows, self.columns, 1)

    def reset(self):
        """Start a new game and return the initial board observation."""
        self.obs = self.env.reset()
        return self._board_as_array()

    def change_reward(self, old_reward, done):
        """Map the environment's reward onto the training reward scheme.

        Args:
            old_reward: reward reported by the Kaggle trainer for the last step.
            done: whether the game has ended.

        Returns:
            ``1`` for a win, ``-1`` when the game ended without a win,
            otherwise ``1/(rows*columns)``.
        """
        if old_reward == 1:  # The agent won the game
            return 1
        if done:  # The game ended without a win for the agent
            return -1
        # Small positive reward for every move that keeps the game going
        return 1/(self.rows*self.columns)

    def step(self, action):
        """Play ``action`` (a column index) followed by the opponent's reply.

        Returns:
            ``(board, reward, done, info)``. An invalid move ends the episode
            immediately with reward ``-10``, the unchanged board and an empty
            info dict.
        """
        column = int(action)
        # The board is a flat row-major list, so index `column` is the
        # first-row cell of that column; non-zero means the column is full.
        if self.obs['board'][column] == 0:
            self.obs, old_reward, done, info = self.env.step(column)
            reward = self.change_reward(old_reward, done)
        else:  # End the game and penalize agent
            reward, done, info = -10, True, {}
        return self._board_as_array(), reward, done, info

Stable Baselines requires us to work with ["vectorized" environments](https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html).  For this, we can use the `DummyVecEnv` class.  

The `Monitor` class lets us watch how the agent's performance gradually improves, as it plays more and more games.

In [None]:
# Create ConnectFour environment (the learning agent plays against the "negamax" opponent)
env = ConnectFourGym(agent2="negamax")

# Create directory for logging training information
log_dir = "log/"
os.makedirs(log_dir, exist_ok=True)

# Logging progress: wrap the environment in a Monitor pointed at log_dir
# (the training cell later reads log_dir/monitor.csv to plot rewards)
monitor_env = Monitor(env, log_dir, allow_early_resets=True)

# Create a vectorized environment. DummyVecEnv is given a list of zero-argument
# callables; here a single lambda that returns the already-built monitored env.
vec_env = DummyVecEnv([lambda: monitor_env])

# Neural network for predicting action values
def modified_cnn(scaled_images, **kwargs):
    """Feature extractor used by the custom policy.

    Two 3x3 convolutions (32 then 64 filters, stride 1) followed by a
    512-unit dense layer, each passed through ReLU. Extra keyword arguments
    are forwarded to both convolution layers.
    """
    gain = np.sqrt(2)
    conv_1 = tf.nn.relu(conv(scaled_images, 'c1', n_filters=32, filter_size=3, stride=1,
                             init_scale=gain, **kwargs))
    conv_2 = tf.nn.relu(conv(conv_1, 'c2', n_filters=64, filter_size=3, stride=1,
                             init_scale=gain, **kwargs))
    # Flatten the convolutional output before the dense layer
    flat = conv_to_fc(conv_2)
    return tf.nn.relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))

class CustomCnnPolicy(CnnPolicy):
    """CnnPolicy that uses `modified_cnn` as its feature extractor."""

    def __init__(self, *args, **kwargs):
        # Forward all arguments untouched; only the extractor is fixed here.
        super().__init__(*args, cnn_extractor=modified_cnn, **kwargs)

# Define Model Algorithm and Train

In [None]:
# Initialize agent: PPO1 with the custom CNN policy on the vectorized, monitored environment
model = PPO1(CustomCnnPolicy, vec_env, verbose=0, 
             timesteps_per_actorbatch=256,
             clip_param=0.1,
             optim_stepsize=0.01,
             optim_epochs=3,
#              optim_batchsize=256,
             adam_epsilon=2.5e-04, 
             n_cpu_tf_sess=None)
# Note kept from an earlier experiment (presumably win rates with these settings):
# optim_stepsize=0.001, adam_epsilon=1e-04, timesteps_per_actorbatch=256 => Agent1 0.11, Agent2 0.89

# Measure the wall-clock duration of training
import time
start_time = time.time()

# Train agent
model.learn(total_timesteps=150000)

print("--- %s minutes ---" % ((time.time() - start_time)/60))

# Plot cumulative reward
# monitor.csv in log_dir starts with a '#'-prefixed header line, which is
# consumed (and checked) before pandas parses the remaining CSV; only the
# 'r' column is kept.
with open(os.path.join(log_dir, "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df = pd.read_csv(fh, index_col=None)['r']
# Smooth with a rolling mean over 1000 rows so the trend is visible
df.rolling(window=1000).mean().plot()
plt.show()

In [None]:
#os.makedirs("trained_1", exist_ok=True)

model.save("trained_1")

In [None]:
del model # remove to demonstrate saving and loading

# Create an Agent

To create the submission, an agent function should be fully encapsulated (no external dependencies).  

When your agent is being evaluated against others, it will not have access to the Kaggle docker image.  Only the following can be imported: Python Standard Library Modules, gym, numpy, scipy, pytorch (1.3.1, cpu only), and more may be added later.



In [None]:
def agent1(obs, config):
    """Pick a column with the saved PPO1 model, falling back to a random open column.

    The model is loaded from 'trained_1' on every call. The board is fed to
    the model as a (6, 7, 1) array. If the predicted column's first-row cell
    is occupied, a random column whose first-row cell is empty is returned.
    """
    policy = PPO1.load('trained_1')
    board_input = np.array(obs['board']).reshape(6,7,1)
    predicted, _ = policy.predict(board_input)
    choice = int(predicted)
    # Guard clause: the model's pick is playable, so use it directly
    if obs['board'][choice] == 0:
        return choice
    open_columns = [c for c in range(config.columns) if obs.board[c] == 0]
    return random.choice(open_columns)

In [None]:
# Create the game environment
env = make("connectx")

# agent1 playing one game round vs another agent (try "random" or "negamax")
env.run([agent1, "negamax"])

# Show the game
env.render(mode="ipython", width=500, height=450)

# Test your Agent

In [None]:
# Play as the first agent against default "random" agent.
env.run([agent1, "random"])
env.render(mode="ipython", width=500, height=450)

# Debug/Train your Agent

In [None]:
# Play as first position against random agent.
# `env` is the Kaggle ConnectX environment created with make("connectx") in an earlier cell.
trainer = env.train([None, "random"])

observation = trainer.reset()

# Step through one game move by move. The loop is driven by env.done;
# the `reward`, `done` and `info` values returned by trainer.step are not used.
while not env.done:
    my_action = agent1(observation, env.configuration)
    print("My Action", my_action)
    observation, reward, done, info = trainer.step(my_action)
# Render the finished game as a small board without header or controls
env.render(mode="ipython", width=200, height=180, header=False, controls=False)
#env.render()

# Evaluate your Agent

In [None]:
def get_win_percentages(agent1, agent2="random", n_rounds=10):
    """Play `n_rounds` ConnectX games between two agents and print a summary.

    Roughly half of the games are played with `agent1` moving first and the
    rest with `agent2` moving first; results of the second batch are swapped
    so every outcome reads as [agent1 result, agent2 result]. Prints the win
    fraction of each agent and the count of outcomes recorded as
    [None, 0] / [0, None] (reported as invalid plays).
    """
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    games_agent1_first = n_rounds//2
    games_agent2_first = n_rounds-games_agent1_first

    outcomes = evaluate("connectx", [agent1, agent2], config, [], games_agent1_first)
    swapped = evaluate("connectx", [agent2, agent1], config, [], games_agent2_first)
    # Flip each pair back into [agent1, agent2] order
    outcomes += [[b,a] for [a,b] in swapped]

    total = len(outcomes)
    agent1_wins = outcomes.count([1,-1])
    agent2_wins = outcomes.count([-1,1])
    print("Agent 1 Win Percentage:", np.round(agent1_wins/total, 2))
    print("Agent 2 Win Percentage:", np.round(agent2_wins/total, 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

In [None]:
get_win_percentages(agent1=agent1, agent2="random")

# Play your Agent
Click on any column to place a checker there ("manually select action").

In [None]:
# "None" represents which agent you'll manually play as (first or second player).
env.play([None, "negamax"], width=500, height=450)

# Write Submission File



The submission file will look similar to the agent, with the addition of any packages that might be needed and a modified load path. In this case, the load path points to my working directory. I have not yet figured out how to have the submission file load the trained_1.zip file once submitted.

In [None]:
def agent1(obs, config):
    """Submission agent: pick a column with the saved PPO1 model.

    Every dependency is imported inside the function because its source is
    written to the submission file with ``inspect.getsource`` and must run
    without the notebook's global imports.

    Args:
        obs: observation exposing the flat board under the ``'board'`` key.
        config: configuration exposing ``columns``.

    Returns:
        The model's column if its first-row cell is empty, otherwise a random
        column whose first-row cell is empty.
    """
    # `random` is needed by the fallback below; without this import the
    # standalone submission raises NameError on the first invalid prediction.
    import random

    import numpy as np
    from stable_baselines import PPO1

    model = PPO1.load('trained_1')
    board = obs['board']
    # Use the best model to select a column
    col, _ = model.predict(np.array(board).reshape(6,7,1))
    col = int(col)
    # Check if selected column is valid
    if board[col] == 0:
        return col
    # If not valid, select random move.
    return random.choice([c for c in range(config.columns) if board[c] == 0])

In [None]:
import inspect

def write_agent_to_file(function, file):
    """Append the source code of `function` to `file` and report it on stdout.

    The file is created when missing; when it already exists the source is
    added at the end, so repeated calls accumulate definitions.
    """
    # Append mode both creates a missing file and extends an existing one.
    with open(file, "a") as handle:
        handle.write(inspect.getsource(function))
        print(function, "written to", file)

write_agent_to_file(agent1, "submission.py")

# Validate Submission
Play your submission against itself.  This is the first episode the competition will run to weed out erroneous agents.

Why validate? This roughly verifies that your submission is fully encapsulated and can be run remotely.

In [None]:
from kaggle_environments import utils, agent

# Note: Stdout replacement is a temporary workaround.
# sys.stdout is saved before the submission is loaded and put back afterwards.
import sys
out = sys.stdout
submission = utils.read_file("/kaggle/working/submission.py")
# NOTE(review): this rebinds `agent` from the imported module to the loaded
# callable; the import at the top of this cell restores the module on re-run.
agent = agent.get_last_callable(submission)
sys.stdout = out

# Let the submission play against itself in a fresh debug environment
env = make("connectx", debug=True)
env.run([agent, agent])

# Both players must have finished with status "DONE" for the run to count as valid
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")

# Self contained submission

In [None]:
with open("./trained_1.zip", 'rb') as f:
     trained_1 = f.read()

trained_1[:1000]

In [None]:
bin_data = trained_1 #Whatever binary data you have store in a variable
binary_file_path = 'try1.zip' #Name for new zip file you want to regenerate
with open(binary_file_path, 'wb') as f:
    f.write(bin_data)

In [None]:
# Create the agent
my_agent = '''def agent1(obs, config):  
        
    from learntools.core import binder
    binder.bind(globals())
    
    import os
    import random
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    import tensorflow as tf
    from kaggle_environments import make, evaluate, utils
    from gym import spaces

    from stable_baselines.bench import Monitor 
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import PPO1, A2C, ACER, ACKTR, TRPO
    from stable_baselines.a2c.utils import conv, linear, conv_to_fc
    from stable_baselines.common.policies import CnnPolicy
    
    class ConnectFourGym:
        def __init__(self, agent2="random"):
            ks_env = make("connectx", debug=True)
            self.env = ks_env.train([None, agent2])
            self.rows = ks_env.configuration.rows
            self.columns = ks_env.configuration.columns
            # Learn about spaces here: http://gym.openai.com/docs/#spaces
            self.action_space = spaces.Discrete(self.columns)
            self.observation_space = spaces.Box(low=0, high=2, 
                                                shape=(self.rows,self.columns,1), dtype=np.int)
            # Tuple corresponding to the min and max possible rewards
            self.reward_range = (-10, 1)
            # StableBaselines throws error if these are not defined
            self.spec = None
            self.metadata = None
        def reset(self):
            self.obs = self.env.reset()
            return np.array(self.obs['board']).reshape(self.rows,self.columns,1)
        def change_reward(self, old_reward, done):
            if old_reward == 1: # The agent won the game
                return 1
            elif done: # The opponent won the game
                return -1
            else: # Reward 1/42
                return 1/(self.rows*self.columns)
        def step(self, action):
            # Check if agent's move is valid
            is_valid = (self.obs['board'][int(action)] == 0)
            if is_valid: # Play the move
                self.obs, old_reward, done, _ = self.env.step(int(action))
                reward = self.change_reward(old_reward, done)
            else: # End the game and penalize agent
                reward, done, _ = -10, True, {}
            return np.array(self.obs['board']).reshape(self.rows,self.columns,1), reward, done, _

    # Create ConnectFour environment
    env = ConnectFourGym(agent2="random")

    # Create directory for logging training information
    log_dir = "log/"
    os.makedirs(log_dir, exist_ok=True)

    # Logging progress
    monitor_env = Monitor(env, log_dir, allow_early_resets=True)

    # Create a vectorized environment
    vec_env = DummyVecEnv([lambda: monitor_env])

    # Neural network for predicting action values
    def modified_cnn(scaled_images, **kwargs):
        activ = tf.nn.relu
        layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=3, stride=1, 
                             init_scale=np.sqrt(2), **kwargs))
        layer_2 = activ(conv(layer_1, 'c2', n_filters=64, filter_size=3, stride=1, 
                             init_scale=np.sqrt(2), **kwargs))
        layer_2 = conv_to_fc(layer_2)
        return activ(linear(layer_2, 'fc1', n_hidden=512, init_scale=np.sqrt(2)))  

    class CustomCnnPolicy(CnnPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomCnnPolicy, self).__init__(*args, **kwargs, cnn_extractor=modified_cnn)

'''

my_agent += '''    trained_1 = %s

''' %trained_1

my_agent += '''    binary_file_path = 'trained_1.zip'
    with open(binary_file_path, 'wb') as f:
        f.write(trained_1)
'''

my_agent += '''    model = PPO1.load('trained_1')
    # Use the best model to select a column
    col, _ = model.predict(np.array(obs['board']).reshape(6,7,1))
    # Check if selected column is valid
    is_valid = (obs['board'][int(col)] == 0)
    # If not valid, select random move. 
    if is_valid:
        return int(col)
    else:
        return random.choice([col for col in range(config.columns) if obs.board[int(col)] == 0])'''

In [None]:
with open('submission.py', 'w') as f:
    f.write(my_agent)

# Submit to Competition with stable-baselines

### After several attempts, I was unable to successfully submit this to competition.  Once the file is submitted, the stable-baselines module cannot be installed/imported. 

### ModuleNotFoundError: No module named 'stable_baselines'

# Submit to Competition - Alternative

Below are two notebooks I used as reference.

https://colab.research.google.com/drive/1XwCWeZPnogjz7SLW2kLFXEJGmynQPI-4#scrollTo=FIRaYdm9tCjE

https://www.kaggle.com/nickulus/connectx-with-stable-baselines/notebook

In [None]:
model = PPO1.load('trained_1')

In [None]:
for key, value in model.get_parameters().items():
    print(key, value.shape)

In [None]:
import torch as th
import torch.nn as nn

In [None]:
# https://colab.research.google.com/drive/1XwCWeZPnogjz7SLW2kLFXEJGmynQPI-4#scrollTo=FIRaYdm9tCjE

class PyTorchCnnPolicy(nn.Module):
    """PyTorch network with the same layer layout as the custom CNN policy.

    Two unpadded 3x3 convolutions (1->32->64 channels), a 384->512 dense
    layer and a 512->7 output layer followed by a softmax over the columns.
    For a (N, 1, 6, 7) input the convolutions leave a 2x3 map with 64
    channels, i.e. 384 features.
    """

    def __init__(self):
        super().__init__()
        # Registration order defines the order of named_parameters():
        # conv1, conv2, fc1, fc2 (weight then bias for each).
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=0, bias=True)
        self.fc1 = nn.Linear(384, 512)
        self.fc2 = nn.Linear(512, 7)
        self.relu = nn.ReLU()
        self.out_activ = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-column probabilities of shape (N, 7) for boards of shape (N, 1, 6, 7)."""
        features = self.relu(self.conv2(self.relu(self.conv1(x))))
        # Move channels last (NCHW -> NHWC) before flattening; presumably so the
        # flattened order matches the TensorFlow weights copied into fc1.
        channels_last = features.permute(0, 2, 3, 1).contiguous()
        flat = channels_last.view(channels_last.size(0), -1)
        hidden = self.relu(self.fc1(flat))
        return self.out_activ(self.fc2(hidden))

In [None]:
def copy_cnn_weights(baselines_model):
    """Build a PyTorchCnnPolicy and fill it with weights from a stable-baselines model.

    Parameters whose name contains "pi" or "c" are taken from
    ``baselines_model.get_parameters()`` and paired, in order, with the
    PyTorch model's ``named_parameters()``; pairing stops at the shorter of
    the two sequences. Each array is converted to PyTorch layout before
    being copied in.

    Returns:
        The populated PyTorchCnnPolicy instance.
    """
    torch_cnn = PyTorchCnnPolicy()
    model_params = baselines_model.get_parameters()
    # Get only the policy parameters
    policy_keys = [name for name in model_params if "pi" in name or "c" in name]
    policy_params = [model_params[name] for name in policy_keys]

    for (_, target), name, source in zip(torch_cnn.named_parameters(), policy_keys, policy_params):
        weights = source.copy()

        # Conv kernels:
        # https://gist.github.com/chirag1992m/4c1f2cb27d7c138a4dc76aeddfe940c2
        # TensorFlow stores height * width * in channels * out channels,
        # PyTorch expects out channels * in channels * height * width.
        if weights.ndim == 4:
            weights = np.transpose(weights, (3, 2, 0, 1))

        # Fully connected weight matrices are transposed
        if weights.ndim == 2:
            weights = weights.T

        # Biases: drop singleton dimensions
        if 'b' in name:
            weights = weights.squeeze()

        target.data.copy_(th.from_numpy(weights).data.clone())

    return torch_cnn

In [None]:
th_model = copy_cnn_weights(model)

In [None]:
import gym

class ConnectX(gym.Env):
    """Gym environment wrapping the Kaggle ConnectX game.

    Observations are the board as a (rows, columns) array. The wrapped agent
    plays first against ``opponent_type``.
    """

    def __init__(self, opponent_type):
        self.env = make("connectx", debug=True)
        self.trainer = self.env.train([None, opponent_type])
        self.obs = None
        cfg = self.env.configuration
        self.action_space = gym.spaces.Discrete(cfg.columns)
        self.observation_space = gym.spaces.Box(0, 2, shape=(cfg.rows, cfg.columns), dtype=np.float32)

    def _as_grid(self, raw_obs):
        """Reshape the flat board of a Kaggle observation into (rows, columns)."""
        cfg = self.env.configuration
        return np.reshape(np.array(raw_obs['board']), (cfg.rows, cfg.columns))

    def get_kaggle_env(self):
        """Return the underlying Kaggle environment."""
        return self.env

    def step(self, action):
        """Play `action`; returns (board, reward, done, info) with an empty info dict.

        A move into a column whose first-row cell is occupied is not forwarded
        to the game: the board is returned unchanged with reward -1 and
        done=False.
        """
        if self.obs[0][action] != 0:
            # punish illegal move, keep the episode running
            return self.obs, float(-1), bool(False), {}

        raw_obs, reward, finished, _ = self.trainer.step(int(action))
        self.obs = self._as_grid(raw_obs)
        return self.obs, float(reward), bool(finished), {}

    def reset(self):
        """Start a new game and return the initial board."""
        self.obs = self._as_grid(self.trainer.reset())
        return self.obs

    def render(self, **kwargs):
        """Delegate rendering to the Kaggle environment."""
        return self.env.render(**kwargs)

In [None]:
gym_env = ConnectX('negamax')

env = Monitor(gym_env, log_dir, allow_early_resets=True)

env = DummyVecEnv([lambda: env])

In [None]:
import torch
from torch.autograd import Variable

# Play one episode with the converted PyTorch model on the vectorized ConnectX env
episode_reward = 0
done = False
obs = env.reset()
step_cnt = 0
# Upper bound on loop iterations: number of cells on the board
max_moves = gym_env.get_kaggle_env().configuration.columns * gym_env.get_kaggle_env().configuration.rows

while (not done) and step_cnt <= max_moves:
    step_cnt += 1
    th_obs = Variable(torch.from_numpy(obs))
    # Greedy action: index of the highest output of the network
    action = th.argmax(th_model(th_obs.unsqueeze(0))).item()   # th_obs.unsqueeze(0).size() => torch.Size([1, 1, 6, 7])

    print('action:', action)
    # obs[0] is the single environment's board; its first row tells whether the column is full
    if obs[0][0][action] != 0:
        # NOTE(review): obs is not updated on this branch, so the same action is
        # selected again on the next iteration until step_cnt exceeds max_moves.
        print('skipping illegal move')
    else:
        obs, reward, done, info = env.step([action])
        gym_env.render()
        episode_reward += reward
        print()

PyTorch serialization adapted from:

https://www.kaggle.com/c/connectx/discussion/126678

In [None]:
torch.save(th_model.state_dict(), 'thmodel')

In [None]:
import base64
with open('thmodel', 'rb') as f:
    raw_bytes = f.read()
    encoded_weights = base64.encodebytes(raw_bytes)

In [None]:
print(encoded_weights[:1000]) # printing first 1000 characters from encoded weigths for visualisation. Very long string.

In [None]:
import io
import base64
import torch
from torch.autograd import Variable
import random

agent_th_model = PyTorchCnnPolicy()
# encoded_weights =b'gAKKCmz8n ..... [long string]
decoded = base64.b64decode(encoded_weights)
buffer = io.BytesIO(decoded)
agent_th_model.load_state_dict(torch.load(buffer))

In [None]:
def my_agent(observation, configuration):
    """Choose a column with the PyTorch policy, falling back to a random open column.

    Relies on the module-level ``agent_th_model`` plus ``np``, ``torch`` and
    ``random`` being available where this function's source is placed.

    Args:
        observation: observation exposing the flat board under ``'board'``.
        configuration: configuration exposing ``columns``.

    Returns:
        The column with the highest network output if its first-row cell is
        empty, otherwise a random column whose first-row cell is empty.
    """
    board = observation['board']
    th_obs = torch.from_numpy(np.array(board).reshape(1, 1, 6, 7)).float()
    # One forward pass is enough; gradients are not needed at inference time.
    with torch.no_grad():
        action = torch.argmax(agent_th_model(th_obs)).item()
    if board[action] == 0:
        return action
    return random.choice([c for c in range(configuration.columns) if board[c] == 0])

In [None]:
kaggle_env = gym_env.get_kaggle_env()
kaggle_env.reset()
kaggle_env.run([my_agent, "negamax"])
kaggle_env.render(mode="ipython", width=500, height=450)

In [None]:
get_win_percentages(agent1=my_agent, agent2="random")

In [None]:
get_win_percentages(agent1=my_agent, agent2="negamax")

In [None]:
import inspect
import os

def write_agent_to_file(file):
    """Write a standalone PyTorch submission to `file`, overwriting any existing content.

    The output consists of a fixed template (imports, the PyTorchCnnPolicy
    class, and code that decodes base64 weights into ``agent_th_model``) with
    the global ``encoded_weights`` embedded as a literal, followed by the
    source of the global ``my_agent`` function.

    NOTE(review): this definition replaces the earlier
    ``write_agent_to_file(function, file)`` defined in this notebook and has a
    different signature.
    """
    with open(file, "w") as f:
        # Template for the file header; the single %s placeholder receives
        # str(encoded_weights).
        submission_file = '''
import random
import numpy as np
import torch as th
import torch.nn as nn
import io
import base64
import torch
from torch.autograd import Variable

class PyTorchCnnPolicy(nn.Module):
    def __init__(self):
        super(PyTorchCnnPolicy, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0, bias=True)
        self.fc1 = nn.Linear(384, 512)
        self.fc2 = nn.Linear(512, 7)
        self.relu = nn.ReLU()
        self.out_activ = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.permute(0,2,3,1).contiguous()
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.out_activ(x)
        return x

agent_th_model = PyTorchCnnPolicy()
encoded_weights = %s \n
decoded = base64.b64decode(encoded_weights)
buffer = io.BytesIO(decoded)
agent_th_model.load_state_dict(torch.load(buffer))

'''%str(encoded_weights)
        
        # The agent function goes last in the generated file
        submission_file += inspect.getsource(my_agent)
        f.write(submission_file)
        
write_agent_to_file("submission.py")

In [None]:
# Note: Stdout replacement is a temporary workaround.
import sys
out = sys.stdout
from kaggle_environments import utils, agent
submission = utils.read_file("/kaggle/working/submission.py")
submission_agent = agent.get_last_callable(submission)
sys.stdout = out

kaggle_env.run([submission_agent, submission_agent])
print("Success!" if kaggle_env.state[0].status == kaggle_env.state[1].status == "DONE" else "Failed...")

kaggle_env.play([submission_agent, None])

In [None]:
submission_file = '''
import random
import numpy as np
import torch as th
import torch.nn as nn
import io
import base64
import torch
from torch.autograd import Variable

class PyTorchCnnPolicy(nn.Module):
    def __init__(self):
        super(PyTorchCnnPolicy, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0, bias=True)
        self.fc1 = nn.Linear(384, 512)
        self.fc2 = nn.Linear(512, 7)
        self.relu = nn.ReLU()
        self.out_activ = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.permute(0,2,3,1).contiguous()
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.out_activ(x)
        return x

agent_th_model = PyTorchCnnPolicy()
#encoded_weights = %s
decoded = base64.b64decode(encoded_weights)
buffer = io.BytesIO(decoded)
agent_th_model.load_state_dict(torch.load(buffer))

'''#%str(encoded_weights)

submission_file += inspect.getsource(my_agent)

In [None]:
submission_file.splitlines() # For reference only