# Checkers AI training through self play

This notebook uses a checkers implementation from the pettingzoo package. 

The Monte Carlo tree search and the self-play training loop are based on https://web.stanford.edu/~surag/posts/alphazero.html

In [1]:
# PettingZoo provides the checkers environment (checkers_v3) used throughout this notebook
!pip install pettingzoo



In [2]:
from pettingzoo.utils.env import AECEnv
from pettingzoo.classic import checkers_v3
from copy import deepcopy
import numpy as np

## Utils
Let's start by defining some utility classes that are used later in the notebook.

In [3]:
# Environment wrapper that gives the MCTS search a hashable, comparable game position
class State:
    """Wraps a turn-based game environment for use by the tree search.

    Every query is delegated to the wrapped ``env``. ``nextState`` steps a deep
    copy, so the wrapped environment itself is never advanced by this class.
    """

    def __init__(self, env : AECEnv):
        self.env = env

    def _last(self):
        """Return ``env.last()`` unpacked as ``(observation, reward, done, info)``."""
        observation, reward, done, info = self.env.last()
        return observation, reward, done, info

    def gameEnded(self):
        """Return the ``done`` flag reported by ``env.last()``."""
        return self._last()[2]

    def gameReward(self):
        """Return the reward reported by ``env.last()``."""
        return self._last()[1]

    def getActionMask(self):
        """Return the ``"action_mask"`` entry of the observation from ``env.last()``."""
        return self._last()[0]["action_mask"]

    def getValidActions(self):
        """Return the indices of the non-zero entries of the action mask."""
        mask = self.getActionMask()
        return np.flatnonzero(mask)

    def nextState(self, action):
        """Apply ``action`` to a copy of the environment.

        Returns
        -------
        (State, bool) : the successor state and whether the agent to move
        differs from the agent to move in this state.
        """
        agent_before = self.env.agent_selection
        successor = deepcopy(self.env)
        successor.step(action)
        return State(successor), agent_before != successor.agent_selection

    def getObservation(self):
        """Return the ``"observation"`` entry observed by the agent to move."""
        agent = self.currentAgent()
        return self.env.observe(agent)["observation"]

    def currentAgent(self):
        """Return the environment's currently selected agent."""
        return self.env.agent_selection

    def show(self, wait=False):
        """Render the environment; with ``wait`` set, block until the user confirms."""
        self.env.render()
        if not wait:
            return
        input("press any key to continue")


    def __eq__(self, x):
        """Two states are equal when the agent to move and the observations match."""
        if isinstance(x, State):
            # this should be enough
            same_agent = self.env.agent_selection == x.env.agent_selection
            observations_match = (self.getObservation() == x.getObservation()).all()
            return same_agent and observations_match
        return False

    def toStr(self):
        """Return the observation collapsed along axis 2, as a string."""
        planes = self.getObservation()
        # reduce dimensions from 3 to 2
        totals = np.sum(planes, axis = 2)
        plane_index = np.argmax(planes, axis = 2) + 1
        return str(totals * plane_index)

    def __hash__(self):
        """Hash of ``toStr()``; the agent to move is not part of the hash."""
        return hash(self.toStr())


In [4]:
class TrainingExample:
    """One training sample: a state, the policy target for it and its reward."""

    def __init__(self, state : State, pi, reward):
        # reward may be filled in later, once the outcome of the game is known
        self.state, self.pi, self.reward = state, pi, reward


## The neural net

In [5]:
from typing import List
import random
from tensorflow.keras import layers, Model, Input, metrics, losses
import tensorflow as tf

def applyActionMaskToPolicy(p, action_mask):
    """Restrict a policy to the valid actions and renormalize it.

    Parameters
    ----------
    p : policy weights, one per action (a scalar is broadcast over the mask)
    action_mask : array that is zero on invalid actions; the policy is
        multiplied by it element-wise

    Returns
    -------
    The masked policy divided by its sum. If the policy puts no weight on any
    valid action, the mask itself is normalized instead, which gives equal
    probability to every valid action of a 0/1 mask. An all-zero mask leads
    to a 0/0 division (NaN entries).
    """
    p_masked = p * action_mask
    total = np.sum(p_masked)
    # policy zeroed all possible actions
    if total == 0:
        p_masked = np.asarray(action_mask, dtype=float)
        total = np.sum(p_masked)

    return p_masked / total # renormalize


class NNet:
    """Small convolutional policy/value network.

    ``self.nnet`` is a Keras model with two outputs: ``p``, a softmax over
    ``action_size`` actions, and ``v``, a single linear (unbounded) value.
    """

    def __init__(self, action_size, input_shape=(8, 8, 4)):
        """
        Parameters
        ----------
        action_size : number of outputs of the policy head
        input_shape : shape of one observation fed to the network
        """
        x = Input(shape=input_shape)
        y = layers.Conv2D(32, 3, activation='relu')(x)
        y = layers.Conv2D(32, 3, activation='relu')(y)
        y = layers.Flatten()(y)
        # y = layers.Dropout(0.5)(y)
        p = layers.Dense(action_size, activation='softmax', name="p")(y)
        v = layers.Dense(1, name="v")(y)
        self.nnet = Model(x, [p,v])
        # summary() prints the table itself and returns None, so it is not wrapped in print()
        self.nnet.summary()

        def entropyLoss(y_true, y_pred):
            # Cross-entropy between the target policy and the predicted one.
            # Summed over the action axis so that every sample contributes its
            # full cross-entropy; an element-wise result would be averaged over
            # the actions by the default loss reduction, shrinking the policy
            # loss relative to the value loss.
            return -tf.reduce_sum(y_true * tf.math.log(y_pred + 1e-10), axis=-1)

        self.nnet.compile(
                optimizer="adam",
                loss={"p": entropyLoss, "v":"mse"}
        )

    def predict(self, state : State):
        """Evaluate a single state.

        Returns
        -------
        (p, v) : the policy restricted to the state's valid actions and
        renormalized, and the scalar value predicted for the state.
        """
        x = state.getObservation()
        x = np.expand_dims(x, 0)  # add the batch axis
        p, v = self.nnet.predict(x, batch_size=1)

        p = p[0]
        p = applyActionMaskToPolicy(p, state.getActionMask())
        return p, v[0][0]

    @staticmethod
    def _prepare_examples(examples: List[TrainingExample]):
        """Stack the examples into ``(X, [pi, v])`` arrays for ``fit``."""
        X = []
        pi = []
        v = []
        for e in examples:
            X.append(e.state.getObservation())
            pi.append(e.pi)
            v.append(e.reward)

        return np.array(X), [np.array(pi), np.array(v)]

    def train(self, examples):
        """Fit the network in place on the given examples and return ``self``.

        An empty list of examples leaves the network untouched.
        """
        if len(examples) == 0:
            return self
        X, y = self._prepare_examples(examples)
        self.nnet.fit(X, y, batch_size=32, shuffle=True, epochs=3)
        return self


2021-12-14 18:47:17.802294: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-14 18:47:17.802314: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
class RandomPlayer:
    """Baseline opponent exposing the same ``predict`` interface as ``NNet``."""

    def predict(self, state : State):
        """Return ``(policy, 0)`` with equal probability for every valid action.

        ``np.random.uniform(256)`` treats 256 as the lower bound and returns a
        single scalar, which after masking and renormalizing already produced
        this equal-weight policy; the weights are now built explicitly and
        sized from the action mask instead of a hard-coded 256.
        """
        action_mask = state.getActionMask()
        p = np.ones(len(action_mask))
        return applyActionMaskToPolicy(p, action_mask), 0


Check that it works: create an environment, wrap it in a `State`, and run the network's `predict` on it.

In [7]:
# Create a checkers environment, reset it and render the board
env = checkers_v3.env()
env.reset()
env.render()

  M   M   M   M 
M   M   M   M   
  M   M   M   M 
_   _   _   _   
  _   _   _   _ 
m   m   m   m   
  m   m   m   m 
m   m   m   m   


In [8]:
# Wrap the environment in a State and check the shape of its observation
state = State(env)
state.getObservation().shape

(8, 8, 4)

In [9]:
# Smoke test: build a network with 256 policy outputs and fit it on 32 copies
# of the same example (uniform policy target, reward 1)
nnet = NNet(256)
examples = [TrainingExample(state, np.full(256, 1.0 / 256), 1) for _ in range(32)]
nnet.train(examples)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 8, 8, 4)]    0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 6, 6, 32)     1184        ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 4, 4, 32)     9248        ['conv2d[0][0]']                 
                                                                                                  
 flatten (Flatten)              (None, 512)          0           ['conv2d_1[0][0]']               
                                                                                              

2021-12-14 18:47:19.661426: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-14 18:47:19.661466: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-14 18:47:19.661495: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (TARS): /proc/driver/nvidia/version does not exist
2021-12-14 18:47:19.661796: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/3
Epoch 3/3


<__main__.NNet at 0x7f43857c72b0>

In [10]:
# Query the network for the masked policy and the value of `state`
policy, value = nnet.predict(state)
print(policy.shape)
print(value)

(256,)
0.37934464


In [11]:
# The random player exposes the same predict interface; its value is always 0
policy, value = RandomPlayer().predict(state)
print(policy.shape)
print(value)

(256,)
0


## MCTS

In [12]:
class MCTSNode:
    """Statistics stored by the tree search for one visited state."""

    def __init__ (self, p, q):
        """
        Parameters
        ----------
        p : policy in this state
        q : q value of this state
        """
        num_actions = len(p)
        self.p, self.q = p, q
        # n[a] : number of times an action a has been performed from this state
        self.n = np.zeros(num_actions)
        # q_a[a] : q value of the state that follows performing action a
        self.q_a = np.zeros(num_actions)


class MCTS:
    """Monte Carlo tree search guided by a policy/value predictor.

    ``nnet`` only needs a ``predict(state) -> (policy, value)`` method. Visited
    states are stored in ``self.nodes``, keyed by the state itself.
    """

    # Keeps the exploration term non-zero before the first visit of a node, so
    # that the prior decides between actions that all still have q_a == 0.
    _EPS = 1e-8

    def __init__(self, nnet, num_mcts_sims, max_depth = 10, c_puct = 1.0):
        """
        Parameters
        ----------
        nnet : predictor used to evaluate newly reached states
        num_mcts_sims : number of simulations run by each call to ``search``
        max_depth : maximum number of moves followed in one simulation
        c_puct : weight of the exploration term in the upper confidence bound
        """
        self.nnet = nnet
        self.nodes = {}
        self.c_puct = c_puct
        self.num_mcts_sims = num_mcts_sims
        self.max_depth = max_depth

    def search(self, s : State):
        """Run ``num_mcts_sims`` simulations starting from ``s``."""
        for _ in range(self.num_mcts_sims):
            self._search(s, self.max_depth)

    def _search(self, s : State, max_depth):
        """Run one simulation from ``s`` and return the value seen by the player to move in ``s``."""
        if s.gameEnded(): return s.gameReward()

        if s not in self.nodes:
            # first time this state is reached: evaluate it and stop here
            p, v = self.nnet.predict(s)
            self.nodes[s] = MCTSNode(p, v)
            return v

        node = self.nodes[s]

        if max_depth == 0:
            # max depth reached, returning a heuristic value of this state
            return node.q

        # upper confidence bound
        total_visits = np.sum(node.n)
        exploration = self.c_puct * node.p * np.sqrt(total_visits + self._EPS) / (1 + node.n)
        ucb = node.q_a + exploration
        ucb[s.getActionMask() == 0] = -np.inf
        # choose best action based on ucb
        a = np.argmax(ucb)

        sp, player_changed = s.nextState(a)
        v = self._search(sp, max_depth - 1)
        if player_changed:
            # the value was computed for the other player
            v = -v

        # running mean of the values obtained through action a
        node.q_a[a] = (node.n[a] * node.q_a[a] + v) / (node.n[a] + 1)
        node.n[a] += 1
        return v

    # improved policy
    def pi(self, s : State):
        """Return the visit-count distribution of ``s``.

        Falls back to the stored policy when no action has been tried yet.
        ``s`` must already be in ``self.nodes`` (KeyError otherwise).
        """
        node = self.nodes[s]
        n_sum = np.sum(node.n)
        if n_sum == 0:
            return node.p

        return node.n / n_sum


# mcts = MCTS(nnet, 30)
# %timeit -r 2 -n 5 mcts.search(state)
# old version time: 5 loops, best of 2: 4.6 s per loop
# current time: 5 loops, best of 2: 1.54 s per loop

In [13]:
# Quick check: run two simulations from `state`, then look at the shape of the improved policy
mcts = MCTS(nnet, 2)
mcts.search(state)
mcts.pi(state).shape

(256,)

In [14]:
def pit(new_nnet : NNet, nnet : NNet, games_played = 40):
    """Play ``new_nnet`` against ``nnet`` and return the win rate of ``new_nnet``.

    Both players only need a ``predict(state) -> (policy, value)`` method; each
    move is sampled from the returned policy. The players swap sides before
    every game.

    Returns
    -------
    Wins of ``new_nnet`` divided by the number of games that were not ties,
    or 0.0 when there is no such game (all ties, or ``games_played`` is 0).
    """
    new_nnet_tag = "player_0"
    nnet_tag = "player_1"
    wins = 0
    ties = 0

    for g in range(games_played):
        env = checkers_v3.env()
        env.reset()
        s = State(env)
        # swap players before each round
        new_nnet_tag, nnet_tag = nnet_tag, new_nnet_tag
        agents = {new_nnet_tag : new_nnet, nnet_tag : nnet}

        while not s.gameEnded():
            agent = agents[s.currentAgent()]
            p, _ = agent.predict(s)
            action = np.random.choice(len(p), p=p)
            s.env.step(action)

        # NOTE(review): assumes the final reward belongs to the agent returned
        # by currentAgent() - confirm against the environment
        reward = s.gameReward()
        new_nnet_selected = s.currentAgent() == new_nnet_tag

        if reward == 0:
            ties += 1
        elif reward == 1 and new_nnet_selected:
            wins += 1
        elif reward == -1 and not new_nnet_selected:
            wins += 1

    decisive_games = games_played - ties
    if decisive_games <= 0:
        # no decisive game: there is nothing to divide by
        return 0.0
    return wins / decisive_games

# training
def policyIterSP(env : AECEnv, num_iters = 10, num_eps = 10,  num_mcts_sims=25, frac_win_thresh = 0.55):
    """Train a network by policy iteration through self-play.

    Parameters
    ----------
    env : environment used for the self-play episodes (reset before each one)
    num_iters : number of train/evaluate iterations
    num_eps : self-play episodes collected per iteration
    num_mcts_sims : MCTS simulations run before every self-play move
    frac_win_thresh : win rate the trained candidate must exceed against the
        current network to replace it

    Returns
    -------
    The last accepted network.
    """
    # hard coded action space size
    action_size = 256
    nnet = NNet(action_size)
    frac_win = pit(nnet, RandomPlayer())                              # compare new net with a random player
    print("frac_wins against a random player", frac_win)
    examples = []
    for i in range(num_iters):
        for e in range(num_eps):
            examples += executeSelfPlayEpisode(env, nnet, num_mcts_sims)    # collect examples from this game
            print("episode done")
        # train() fits in place and returns the same object, so the candidate
        # must be a separate network; otherwise the net would be pitted against
        # itself and the current net overwritten even when it is rejected.
        # Only the weights are copied; the optimizer state starts fresh.
        new_nnet = NNet(action_size)
        new_nnet.nnet.set_weights(nnet.nnet.get_weights())
        new_nnet.train(examples)
        frac_win = pit(new_nnet, nnet)                                # compare new net with previous net
        print("frac_win", frac_win)
        if frac_win > frac_win_thresh:
            print("new net is better!")
            nnet = new_nnet                                           # replace with new net
            frac_win = pit(nnet, RandomPlayer())                      # compare new net with a random player
            print("frac_wins against a random player", frac_win)
        examples = random.sample(examples, len(examples) // 2)        # discard half of the examples
    return nnet

def executeSelfPlayEpisode(env : AECEnv, nnet, num_mcts_sims = 3):
    """Play one game against itself and return the collected training examples.

    ``env`` is reset first. Before every move a fresh-per-episode MCTS runs
    ``num_mcts_sims`` simulations, the move is sampled from the improved
    policy, and the visited state is recorded. Rewards are filled in once the
    game has ended.
    """
    env.reset()
    current = State(env)
    tree = MCTS(nnet, num_mcts_sims)
    history = []

    while True:
        tree.search(current)
        improved_policy = tree.pi(current)
        # rewards can not be determined yet
        history.append(TrainingExample(deepcopy(current), improved_policy, None))
        # sample action from improved policy
        chosen = np.random.choice(len(improved_policy), p=improved_policy)
        current, _ = current.nextState(chosen)
        if not current.gameEnded():
            continue
        return assignRewards(history, current.gameReward(), current.currentAgent())

def assignRewards(examples, reward, player_w_reward):
    """Fill in the reward of every example, in place, and return the same list.

    Examples whose state has ``player_w_reward`` to move get ``reward``; all
    others get ``-reward``.
    """
    for example in examples:
        if example.state.currentAgent() == player_w_reward:
            example.reward = reward
        else:
            example.reward = -reward

    return examples

### test

In [15]:
# Smallest possible run (1 iteration, 1 episode, 3 MCTS simulations per move)
# to check the whole pipeline end to end
env = checkers_v3.env()
nnet = policyIterSP(env, num_iters=1, num_eps=1, num_mcts_sims=3)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 8, 8, 4)]    0           []                               
                                                                                                  
 conv2d_2 (Conv2D)              (None, 6, 6, 32)     1184        ['input_2[0][0]']                
                                                                                                  
 conv2d_3 (Conv2D)              (None, 4, 4, 32)     9248        ['conv2d_2[0][0]']               
                                                                                                  
 flatten_1 (Flatten)            (None, 512)          0           ['conv2d_3[0][0]']               
                                                                                            

## Run training!

In [16]:
# Full training run: 8 iterations of 50 self-play episodes each,
# with 25 MCTS simulations before every move
env = checkers_v3.env()
nnet = policyIterSP(env, num_iters=8, num_eps=50, num_mcts_sims=25)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 8, 8, 4)]    0           []                               
                                                                                                  
 conv2d_4 (Conv2D)              (None, 6, 6, 32)     1184        ['input_3[0][0]']                
                                                                                                  
 conv2d_5 (Conv2D)              (None, 4, 4, 32)     9248        ['conv2d_4[0][0]']               
                                                                                                  
 flatten_2 (Flatten)            (None, 512)          0           ['conv2d_5[0][0]']               
                                                                                            

KeyboardInterrupt: 