In [None]:
!pip install 'kaggle-environments>=0.1.6'

from learntools.core import binder
binder.bind(globals())
from learntools.game_ai.ex4 import *

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

!pip install 'tensorflow==1.14.0'

import tensorflow as tf
from kaggle_environments import make, evaluate
from gym import spaces

!apt-get update
!apt-get install -y cmake libopenmpi-dev python3-dev zlib1g-dev
!pip install "stable-baselines[mpi]==2.9.0"

In [None]:
from stable_baselines.bench import Monitor 
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1, A2C, ACER, ACKTR, TRPO
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.common.policies import CnnPolicy

In [None]:
class ConnectFourGym:
    """Gym-style wrapper around the Kaggle ``connectx`` environment.

    The learning agent always plays first; ``agent2`` is the opponent handed
    to ``ks_env.train``. Observations are the flat board reshaped to
    ``(rows, columns, 1)``.

    Rewards returned by ``step``:
      *  1               -- the wrapped environment reported a reward of 1 (win)
      * -1               -- the episode ended with any other reward
      *  1/(rows*columns) -- the game is still running after a valid move
      * -10              -- the agent picked a column whose top cell is occupied;
                            the episode is ended without forwarding the move
    """

    def __init__(self, agent2="random"):
        """Create the ConnectX environment and the gym spaces describing it.

        Args:
            agent2: opponent specification passed to ``ks_env.train``.
        """
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.columns)
        # The builtin ``int`` is used because the ``np.int`` alias no longer
        # exists in recent NumPy releases; on older releases both are the same.
        self.observation_space = spaces.Box(low=0, high=2,
                                            shape=(self.rows, self.columns, 1), dtype=int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _board_as_array(self):
        """Return the current flat board as a ``(rows, columns, 1)`` array."""
        return np.array(self.obs['board']).reshape(self.rows, self.columns, 1)

    def reset(self):
        """Start a new game and return the initial board observation."""
        self.obs = self.env.reset()
        return self._board_as_array()

    def change_reward(self, old_reward, done):
        """Map the environment's reward onto the shaped training reward.

        Args:
            old_reward: reward reported by the wrapped environment.
            done: whether the episode has finished.

        Returns:
            1 for a win, -1 for any other finished episode, otherwise
            ``1 / (rows * columns)`` for each move that keeps the game going.
        """
        if old_reward == 1:  # The agent won the game
            return 1
        elif done:  # Episode over without a win for the agent
            return -1
        else:  # Reward 1/42 on the default 6x7 board
            return 1 / (self.rows * self.columns)

    def step(self, action):
        """Play ``action`` (a column index) and return ``(obs, reward, done, info)``.

        A move into a column whose top cell is already occupied is not sent to
        the wrapped environment: the episode ends with a reward of -10, an
        empty ``info`` dict, and the unchanged board as the observation.
        """
        column = int(action)
        # The first ``columns`` entries of the flat board are the top row, so a
        # column is playable exactly when its top cell is still empty.
        is_valid = (self.obs['board'][column] == 0)
        if is_valid:  # Play the move
            self.obs, old_reward, done, info = self.env.step(column)
            reward = self.change_reward(old_reward, done)
        else:  # End the game and penalize agent
            reward, done, info = -10, True, {}
        return self._board_as_array(), reward, done, info

In [None]:
# Create ConnectFour environment; the learner trains against the "negamax" opponent
env = ConnectFourGym(agent2="negamax")

# Create directory for logging training information
# (exist_ok=True so re-running this cell does not fail if it already exists)
log_dir = "log/"
os.makedirs(log_dir, exist_ok=True)

# Logging progress: wrap the environment in a Monitor that logs into log_dir
# (the training cell later reads log_dir/monitor.csv)
monitor_env = Monitor(env, log_dir, allow_early_resets=True)

# Create a vectorized environment holding the single monitored environment;
# the lambda returns the already-built monitor_env rather than constructing a new one
vec_env = DummyVecEnv([lambda: monitor_env])

In [None]:
def modified_cnn(scaled_images, **kwargs):
    """Convolutional feature extractor used by the custom policy.

    Applies two 3x3, stride-1 convolutions (32 then 64 filters), flattens the
    result and feeds it through a 512-unit dense layer. Every layer is
    followed by a ReLU and initialised with a scale of sqrt(2).

    Args:
        scaled_images: input tensor handed over by the policy.
        **kwargs: forwarded unchanged to both ``conv`` calls.

    Returns:
        The ReLU-activated output of the 512-unit dense layer.
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    features = relu(conv(scaled_images, 'c1', n_filters=32, filter_size=3, stride=1,
                         init_scale=gain, **kwargs))
    features = relu(conv(features, 'c2', n_filters=64, filter_size=3, stride=1,
                         init_scale=gain, **kwargs))
    flattened = conv_to_fc(features)
    dense = linear(flattened, 'fc1', n_hidden=512, init_scale=gain)
    return relu(dense)

class CustomCnnPolicy(CnnPolicy):
    """``CnnPolicy`` variant whose feature extractor is ``modified_cnn``."""

    def __init__(self, *args, **kwargs):
        # All caller arguments are forwarded; only the extractor is fixed here.
        super().__init__(*args, cnn_extractor=modified_cnn, **kwargs)

In [None]:
# Initialize agent: PPO1 with the custom CNN policy on the vectorized, monitored environment
model = PPO1(CustomCnnPolicy, vec_env, verbose=0, n_cpu_tf_sess=None)

# Train agent for 100,000 environment timesteps
model.learn(total_timesteps=100000)

# Plot the 1000-episode rolling mean of the 'r' column from the Monitor log
# (not a cumulative sum). The log's first line is a '#'-prefixed header that
# must be consumed before pandas parses the remaining CSV rows.
with open(os.path.join(log_dir, "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df = pd.read_csv(fh, index_col=None)['r']
# The first 999 points of the rolling mean are NaN, so nothing is drawn for them
df.rolling(window=1000).mean().plot()
plt.show()

In [None]:
import torch as th
import torch.nn as nn

# Alias for the trained Stable Baselines model; this is the object whose
# weights are copied into the PyTorch network further down in this cell.
baselines_cnn_model = model
class PyTorchCnnPolicy(nn.Module):
    """PyTorch counterpart of the policy network (two convs + two dense layers).

    Layout: Conv2d(1->32, 3x3) -> ReLU -> Conv2d(32->64, 3x3) -> ReLU ->
    flatten in channels-last order -> Linear(->512) -> ReLU ->
    Linear(->columns) -> Softmax over the column dimension.

    Args:
        rows: board height. Defaults to 6.
        columns: board width, which is also the number of output actions.
            Defaults to 7.

    Raises:
        ValueError: if the board is too small for two unpadded 3x3 convolutions.

    With the default 6x7 board the flattened feature size is 64 * 2 * 3 = 384,
    so the parameter names and shapes are unchanged from the fixed-size version.
    """

    def __init__(self, rows=6, columns=7):
        super(PyTorchCnnPolicy, self).__init__()
        if rows < 5 or columns < 5:
            raise ValueError(
                "rows and columns must both be at least 5 for two unpadded 3x3 "
                "convolutions, got rows=%r, columns=%r" % (rows, columns))
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0, bias=True)
        # Each unpadded 3x3 convolution trims 2 cells from height and width.
        flat_features = 64 * (rows - 4) * (columns - 4)
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, columns)
        self.relu = nn.ReLU()
        self.out_activ = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-column probabilities for a ``(batch, 1, rows, columns)`` input."""
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        # Move channels last before flattening so the feature order fed to fc1
        # is (height, width, channel) rather than PyTorch's channel-first order.
        x = x.permute(0, 2, 3, 1).contiguous()
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.out_activ(x)
        return x

def copy_cnn_weights(baselines_model):
    """Build a ``PyTorchCnnPolicy`` and fill it with weights from ``baselines_model``.

    Source tensors are taken from ``baselines_model.get_parameters()`` in
    iteration order, keeping only entries whose key contains ``"pi"`` or
    ``"c"``. They are paired positionally with the PyTorch parameters; any
    surplus source entries are ignored.

    Args:
        baselines_model: object exposing ``get_parameters()`` that returns a
            mapping from parameter name to NumPy array.

    Returns:
        The populated ``PyTorchCnnPolicy`` instance.

    Raises:
        ValueError: if there are fewer selected source tensors than PyTorch
            parameters, or a converted tensor's shape does not match its
            target parameter. Both would otherwise leave the network silently
            mis-initialised.
    """
    torch_cnn = PyTorchCnnPolicy()
    model_params = baselines_model.get_parameters()
    # Get only the policy parameters
    policy_keys = [key for key in model_params.keys() if "pi" in key or "c" in key]

    torch_params = list(torch_cnn.named_parameters())
    if len(policy_keys) < len(torch_params):
        raise ValueError(
            "expected at least %d policy parameters to copy, found %d: %r"
            % (len(torch_params), len(policy_keys), policy_keys))

    for (th_key, pytorch_param), key in zip(torch_params, policy_keys):
        # Work on a copy so the source model's arrays are never modified
        param = model_params[key].copy()

        # Conv layer
        if len(param.shape) == 4:
            # https://gist.github.com/chirag1992m/4c1f2cb27d7c138a4dc76aeddfe940c2
            # Tensorflow 2D Convolutional layer: height * width * input channels * output channels
            # PyTorch 2D Convolutional layer: output channels * input channels * height * width
            param = np.transpose(param, (3, 2, 0, 1))

        # weight of fully connected layer
        if len(param.shape) == 2:
            param = param.T

        # bias: collapse any singleton axes down to a flat vector
        if 'b' in key:
            param = param.reshape(-1)

        if tuple(param.shape) != tuple(pytorch_param.shape):
            raise ValueError(
                "shape mismatch copying %r into %r: got %r, expected %r"
                % (key, th_key, tuple(param.shape), tuple(pytorch_param.shape)))

        # Copy parameters from stable baselines model to pytorch model
        with th.no_grad():
            pytorch_param.copy_(th.from_numpy(np.ascontiguousarray(param)))

    return torch_cnn

# Port the trained weights into a PyTorch network and save its state dict to disk
th_model = copy_cnn_weights(baselines_cnn_model)
th.save(th_model.state_dict(), 'thmodel')

# Read the saved file back and base64-encode it so the weights can be embedded
# as a bytes literal inside the generated submission file
import base64
with open('thmodel', 'rb') as f:
    raw_bytes = f.read()
    encoded_weights = base64.encodebytes(raw_bytes)

import io
import torch
from torch.autograd import Variable
import random

# Round-trip check that mirrors what the submission file does at import time:
# decode the base64 payload into an in-memory buffer and load it into a fresh
# network. agent_th_model is the global that my_agent uses for inference.
agent_th_model = PyTorchCnnPolicy()
decoded = base64.b64decode(encoded_weights)
buffer = io.BytesIO(decoded)
agent_th_model.load_state_dict(torch.load(buffer))

In [None]:
import inspect
import os

def my_agent(observation, configuration):
    """Pick a column using the global ``agent_th_model`` network.

    Args:
        observation: mapping with a ``'board'`` entry holding the flat board
            (0 marks an empty cell).
        configuration: object exposing ``columns``.

    Returns:
        The column with the highest network output if its top cell is empty,
        otherwise a uniformly random column whose top cell is empty.
    """
    board = observation['board']
    # The network was built for a 6x7 board with a single input channel.
    # NOTE(review): Stable Baselines' CnnPolicy may rescale Box observations
    # to [0, 1] from the space bounds before its CNN; if so the board should
    # be divided by 2 here to match training -- confirm before changing.
    th_obs = th.from_numpy(np.array(board).reshape(1, 1, 6, 7)).float()
    # One forward pass is enough; no gradients are needed for inference.
    with th.no_grad():
        action = th.argmax(agent_th_model(th_obs)).item()
    if board[action] == 0:
        return action
    # Preferred column is full: fall back to any column that still has room.
    return random.choice([c for c in range(configuration.columns) if board[c] == 0])

def write_agent_to_file(file):
    """Write a self-contained agent module to ``file``, replacing any existing file.

    The output consists of, in order: the import lines, a textual copy of the
    ``PyTorchCnnPolicy`` class, statements that rebuild ``agent_th_model`` from
    the base64 payload in the global ``encoded_weights``, and the source of
    ``my_agent`` as returned by ``inspect.getsource``.
    """
    header_lines = [
        'import numpy as np\n',
        'import random\n',
        'import torch as th\n',
        'import torch.nn as nn\n',
        'import io\n',
        'import base64\n',
        'from torch.autograd import Variable\n',
    ]
    # The network definition is emitted as literal text so the generated
    # module does not depend on anything defined in this notebook.
    policy_lines = [
        'class PyTorchCnnPolicy(nn.Module):\n',
        '    def __init__(self):\n',
        '        super(PyTorchCnnPolicy, self).__init__()\n',
        '        self.conv1= nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=0, bias=True)\n',
        '        self.conv2= nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0, bias=True)\n',
        '        self.fc1 = nn.Linear(384, 512)\n',
        '        self.fc2 = nn.Linear(512, 7)\n',
        '        self.relu = nn.ReLU()\n',
        '        self.out_activ = nn.Softmax(dim=1)\n',
        '    def forward(self, x):\n',
        '        x = self.relu(self.conv1(x))\n',
        '        x = self.relu(self.conv2(x))\n',
        '        x = x.permute(0, 2, 3, 1).contiguous()\n',
        '        x = x.view(x.size(0), -1)\n',
        '        x = self.relu(self.fc1(x))\n',
        '        x = self.fc2(x)\n',
        '        x = self.out_activ(x)\n',
        '        return x\n',
    ]

    with open(file, "w") as out:
        out.writelines(header_lines)
        out.writelines(policy_lines)
        # Embed the weights as a bytes literal and rebuild the model from it.
        out.writelines([
            'agent_th_model = PyTorchCnnPolicy()\n',
            'encoded_weights =' + str(encoded_weights) + '\n',
            'decoded = base64.b64decode(encoded_weights)\n',
            'buffer = io.BytesIO(decoded)\n',
            'agent_th_model.load_state_dict(th.load(buffer))\n',
        ])
        out.write(inspect.getsource(my_agent))
        
# Generate the submission module in the current working directory
write_agent_to_file("submission.py")        