In [3]:
from dice import roll
import tensorflow as tf
import gym
from gym.spaces import Dict, Box, Discrete
import numpy as np
from random import random, choice
from copy import copy, deepcopy
from network import Network

2022-08-28 16:50:42.249351: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-28 16:50:42.253419: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/compat/:
2022-08-28 16:50:42.253431: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
class BackgammonEnv(gym.Env):
    def __init__(self):

        # defining some ranges for our observation space
        low = np.zeros((96,)) # 4 per each of the 24 positions on the board
        high = np.ones((96,)) # the first three of the four values encoding the number of checkers at a given position are either 0 or 1...

        for i in range(3, 97, 4): # but every fourth value can go as high as (15-3)/2 = 6...
            high[i] = 6.0

        # to make them more readable to humans, our observations are dictionaries
        # but we can't feed dictionaries to our ANN, so we are going to have to use gym's 'FlattenObservation' wrapper to flatten the dictionary into a single array later
        self.observation_space = Dict(
            {
                # player 'White'
                'W': Dict(
                    {
                        'board': Box(low=low, high=high,dtype=np.float32), # the board, consisting of the bar and 24 'points'
                        'barmen': Box(low=0.0, high=7.5,shape=(1,),dtype=np.float32), # and the very first value, which encodes the number of checkers on the bar, can go as high as 15/2 = 7.5
                        'menoff': Box(low=0.0,high=1.0,shape=(1,),dtype=np.float32), # number of checkers removed from the board as a fraction of the total number of checkers i.e. n/15
                        'turn': Discrete(2) # '1' if it is this player's turn, '0' if not
                    }
                ),
                # player 'Black'
                'B': Dict(
                    {
                        'board': Box(low=low, high=high,dtype=np.float32),
                        'barmen': Box(low=0.0,high=7.5,shape=(1,),dtype=np.float32), 
                        'menoff': Box(low=0.0,high=1.0,shape=(1,),dtype=np.float32),
                        'turn': Discrete(2)
                    }
                )
            }
        )
        
        # which actions can be taken at any given time step depend on the state of the board and the roll of the dice. hence we choose to not attempt to define the action space
        self.action_space = None
        
        # create an empty board:
        # to allow for indexing of board positions, the environment's state uses an array of arrays to store the values of the bar and each point
        # later we simply flatten this array to create observations

        self.state = {
            'W': {
                'board': np.zeros((24, 4)),
                'barmen': 0,
                'menoff': 0,
                'turn': 0
            },
            'B': {
                'board': np.zeros((24, 4)),
                'barmen': 0,
                'menoff': 0,
                'turn': 0
            }
        }

        # the previously mentioned truncated unary encoding:
        self.encoding = {
            0: np.array([0.,0.,0.,0.]),
            1: np.array([1.,0.,0.,0.]),
            2: np.array([1.,1.,0.,0.]),
            3: np.array([1.,1.,1.,0.]),
            4: np.array([1.,1.,1.,0.5]),
            5: np.array([1.,1.,1.,1.]),
            6: np.array([1.,1.,1.,1.5]),
            7: np.array([1.,1.,1.,2.0]),
            8: np.array([1.,1.,1.,2.5]),
            9: np.array([1.,1.,1.,3.0]),
            10: np.array([1.,1.,1.,3.5]),
            11: np.array([1.,1.,1.,4.0]),
            12: np.array([1.,1.,1.,4.5]),
            13: np.array([1.,1.,1.,5.0]),
            14: np.array([1.,1.,1.,5.5]),
            15: np.array([1.,1.,1.,6.0])
        }

        # define the game's starting position:
  
        # create an empty board
        self.starting_pos = np.zeros((24, 4))

        # place the correct number of checkers in the correct positions
        self.starting_pos[0] = copy(self.encoding[2])
        self.starting_pos[11] = copy(self.encoding[5])
        self.starting_pos[16] = copy(self.encoding[3])
        self.starting_pos[18] = copy(self.encoding[5])

    def _flatten_obs(self, obs):
        w_board = obs['W']['board']
        w_board = w_board.flatten()
        b_board = obs['B']['board']
        b_board = b_board.flatten()

        observation = []
        observation = np.append(observation,w_board)
        observation = np.append(observation,obs['W']['barmen'])
        observation = np.append(observation,obs['W']['menoff'])
        observation = np.append(observation,obs['W']['turn'])
        observation = np.append(observation,b_board)
        observation = np.append(observation,obs['B']['barmen'])
        observation = np.append(observation,obs['B']['menoff'])
        observation = np.append(observation,obs['B']['turn'])

        return observation

    def _get_info(self):
        pass

    def reset(self): 

        # a 'coin flip' to determine which side goes first
        coin = int(random()>0.5)

        # reset the board to the game's starting position and assign a turn order based on the coin flip
        self.state = {
            'W': {
                'board':copy(self.starting_pos), 
                'barmen': 0,
                'menoff': 0,
                'turn': coin
            },
            'B': {
                'board':copy(self.starting_pos),
                'barmen': 0,
                'menoff': 0,
                'turn': 1-coin
            }
        }

        return copy(self.state)

    def step(self, persistent, action):

        if persistent:
            state = self.state
        else:
            state = deepcopy(self.state)

        # who's turn is it?
        if self.state['W']['turn'] == 1:
            player = 'W'
            opponent = 'B'
        else:
            player = 'B'
            opponent = 'W'

        for move in action:
            # 'LIFTING' A CHECKER

            old_pos, new_pos = move
            
            # are we moving a piece off the bar?
            if old_pos == -1:
                # remove a checker from the bar
                state[player]['barmen'] -= 0.5

            else:
                # get the current number of checkers at the position from which we need to remove a checker
                encoded_checkers = state[player]['board'][old_pos]
                # decode
                for key, value in self.encoding.items():
                    if np.array_equal(encoded_checkers,value):
                        n_checkers = key
                # subtract a checker
                state[player]['board'][old_pos] = copy(self.encoding[n_checkers-1])

            # 'PLACING DOWN' A CHECKER

            # are we bearing off?
            if new_pos == 24:
                state[player]['menoff'] += 1/15

            else:
                # get the current number of checkers at the position to which we need to add a checker
                encoded_checkers = state[player]['board'][new_pos]
                # decode
                for key, value in self.encoding.items():
                    if np.array_equal(encoded_checkers,value):
                        n_checkers = key
                # add a checker
                state[player]['board'][new_pos] = copy(self.encoding[n_checkers+1])

                # check for blots
                mirror_pos = new_pos+23-2*new_pos
                if not np.array_equal(state[opponent]['board'][mirror_pos],[0,0,0,0]):
                    # if there is a blot, move the opponent's piece to the bar
                    state[opponent]['board'][mirror_pos] = [0,0,0,0]
                    state[opponent]['barmen'] += 0.5

        # update the turn order
        state['W']['turn'] = 1 - state['W']['turn']
        state['B']['turn'] = 1 - state['B']['turn']

        # if this is only a 'simulated' step, return the new state here
        if not persistent:
            return self._flatten_obs(state)

        # reward is zero unless one of four conditions is met:
        reward = 0

        # 1) White wins, Black is gammoned
        if self.state['W']['menoff'] > 0.9 and self.state['B']['menoff'] == 0:
            reward = 2
        # 2) White wins
        elif self.state['W']['menoff'] > 0.9:
            reward = 1
        # 3) Black wins, White is gammoned
        elif self.state['B']['menoff'] > 0.9 and self.state['W']['menoff'] == 0: 
            reward = -2
        # 4) Black wins
        elif self.state['B']['menoff'] > 0.9:
            reward = -1

        # if one of the four conditions is met, the game is finished and the episode ends
        done = reward != 0

        return copy(self.state), reward, done

In [4]:
class Agent:
    def __init__(self, env, gamma, lam, network):
        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.network = network
        self.eligibility_trace = np.zeros_like(self.network.layers[0].get_weights()[0])
        self.state = np.zeros_like(self.env._flatten_obs(env.observation_space))
        self.state_ = np.zeros_like(self.env._flatten_obs(env.observation_space))

    def choose_action(self, obs):
        
        dice = roll()
        
        legal_actions = []
        size = len(dice)
        action = []

        # check whose turn it is
        if obs['W']['turn'] == 1:
            player = 'W'
            opponent = 'B'
        else:
            player = 'B'
            opponent = 'W'
            
        player_obs = deepcopy(obs[player])
        opponent_obs = obs[opponent]
        
        # make board more easily readable
        new_board = []
        for pos in player_obs['board']:
            new_board.append(pos[0] + pos[1] + pos[2] + pos[3] * 2)
        player_obs['board'] = new_board
        
        # !!! just for testing purposes !!!
        # if not length == size and not len(action) == 0:
        #test_opponent = deepcopy(opponent_obs)
        #new_op_board = []
        #for idx, pos in enumerate(test_opponent['board']):
        #    new_op_board.append(pos[0] + pos[1] + pos[2] + pos[3] * 2)
        #test_opponent['board'] = new_op_board
        #print("Dice: ", dice)
        #print("Player:\t\t", player, player_obs['board'])
        #print("Opponent:\t", opponent, test_opponent['board'])
        
        # recursive function to search "action-tree"
        def find_board_actions(action, dice, player_obs):
            
            # if we have iterated through all dice the action is appended and we return
            if len(dice) == 0:
                legal_actions.append(action)
                return

            # in case there are chips in the bar they have to be removed before any other actions can be taken
            if player_obs['barmen'] > 0:
                # check for free points/blots
                for i, die in enumerate(dice):
                    # for each die, check if the barmen would land on a free spot/a blot
                    if opponent_obs['board'][die + 23 - 2 * die][1] == 0:
                        # create a new observation in which the barman has been freed
                        new_player_obs = deepcopy(player_obs)
                        new_player_obs['board'][die] += 1
                        new_player_obs['barmen'] -= 0.5
                        # remove the used die
                        new_dice = copy(dice)
                        new_dice.pop(i)
                        # append the chosen move to action, pass new observation and dice to the recursion
                        find_board_actions(action.copy() + [(-1, die)], new_dice, new_player_obs)
                        
            else:
                # is it legal to move off the board?
                bearingoff = True
                # check for checkers in the first three quadrants
                # NOTE: easier to just add up all the values and compare them to zero?
                for idx in range(18):
                    if player_obs['board'][idx] > 0:
                        bearingoff = False
                        break
                        
                # iterate through all positions and check if we can move to position + current dice
                for pos, n_checkers in enumerate(player_obs['board']):
                    # check if a dice roll could exceed the limitations of the board
                    if n_checkers > 0 and (pos + dice[0]) > 23:
                        # can a checker be moved off the board
                        if bearingoff:
                            if pos+dice[0] == 24 or sum(player_obs['board'][:pos]) == 0:
                                # create a new observation
                                new_player_obs = deepcopy(player_obs)
                                new_player_obs['board'][pos] -= 1
                                # recursively call function
                                find_board_actions(action.copy() + [(pos, 24)], dice[1:], new_player_obs)       
                            
                    # If there is no checker on the point indicated by the roll, the player must make a legal move using a checker on a higher-numbered point. If there are no checkers on higher-numbered points, the player is permitted (and required) to remove a checker from the highest point on which one of his checkers resides.
                    
                    # move a checker only if the move is legal
                    elif n_checkers > 0 and opponent_obs['board'][(pos + dice[0]) + 23 - 2 * (pos + dice[0])][1] == 0:
                        new_player_obs = deepcopy(player_obs)
                        new_player_obs['board'][pos] -= 1
                        new_player_obs['board'][pos + dice[0]] += 1
                        # recursively call function
                        find_board_actions(action.copy() + [(pos, pos + dice[0])], dice[1:], new_player_obs)
                            
            # if we couldn't move and reach return we recursively call the function with the same state and action but iterated dice 
            find_board_actions(action.copy(), dice[1:], player_obs)
    
        # call the recursive function
        find_board_actions(action, dice, player_obs)
        
        # only keep actions if they have the max length
        length = max(len(x) for x in legal_actions)
        legal_actions = list(l for l in legal_actions if len(l) == length)
          
        states = [] 
        # call function that returns the state
        for action in legal_actions:
            state = env.step(False, action)
            states.append(state.copy())

        values = []
        for state in states:
            value = self.network.call(state.reshape(1,-1))[0]
            if player == 'W':
                values.append(float(value[0] + 2 * value[1] - value[2] - 2 * value[3]))
            else:
                values.append(float(-value[0] - 2 * value[1] + value[2] + 2 * value[3]))
           
        index = values.index(max(values))
        action = legal_actions[index]
        self.state_ = states[index]
        
        print("Action: ", action)
            
        return action

    def learn(self):
        # update the eligibility trace
        self.eligibility_trace = self.gamma * self.lam * self.eligibility_trace + TD error???
        
        # update the weights
        weights = self.network.layers[0].get_weights[0] + self.alpha * (reward + self.gamma * TD-Error) * self.eligibility_trace
        self.network.layers[0].set_weights(weights)
        

In [5]:
env = BackgammonEnv()
network = Network()
agent = Agent(env, network)


  logger.warn(
2022-08-16 12:31:20.802380: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-08-16 12:31:20.802411: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-08-16 12:31:20.802436: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (maries-debian): /proc/driver/nvidia/version does not exist
2022-08-16 12:31:20.802938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
done = False
observation = env.reset()
while not done:
    action = agent.choose_action(observation)
    observation_, reward, done = env.step(True, action)
    agent.learn
    
print(reward)


Action:  [(18, 19), (11, 16)]
Action:  [(16, 22), (18, 21)]
Action:  [(0, 2), (18, 21)]
Action:  [(-1, 3), (16, 19), (16, 19), (18, 21)]
Action:  [(-1, 2), (16, 21)]
Action:  [(-1, 3), (18, 20)]
Action:  [(11, 17), (2, 3)]
Action:  [(-1, 3), (3, 9)]
Action:  [(18, 21), (11, 13)]
Action:  [(3, 9), (18, 20)]
Action:  [(-1, 5), (11, 13)]
Action:  []
Action:  [(13, 15), (13, 15), (15, 17), (15, 17)]
Action:  [(-1, 4), (11, 16)]
Action:  [(-1, 5), (18, 19)]
Action:  []
Action:  [(11, 17), (17, 18)]
Action:  [(-1, 1), (16, 21)]
Action:  [(18, 20), (18, 22)]
Action:  [(-1, 4), (-1, 4), (19, 23), (19, 23)]
Action:  [(-1, 2), (-1, 4)]
Action:  [(-1, 3), (9, 10)]
Action:  [(-1, 3), (4, 8)]
Action:  [(-1, 1), (11, 17)]
Action:  [(-1, 5), (8, 9)]
Action:  [(17, 21), (4, 5)]
Action:  [(-1, 5), (5, 6)]
Action:  [(11, 17), (17, 20)]
Action:  [(-1, 2), (-1, 5)]
Action:  []
Action:  [(5, 10), (10, 13)]
Action:  [(-1, 5)]
Action:  [(9, 14), (14, 20)]
Action:  [(-1, 5), (-1, 4)]
Action:  [(2, 3), (13, 16

In [5]:
print(tf.zeros(4,))

tf.Tensor([0. 0. 0. 0.], shape=(4,), dtype=float32)
