In [1]:
# exercise 2.6 from the "An Introduction to Counterfactual Regret Minimization"
# paper by Neller and Lanctot

import numpy as np
from itertools import product

In [2]:
# Colonel Blotto game definition

S = 5  # total that every action's entries must add up to
N = 3  # number of entries in every action

# Enumerate, in itertools.product order, every length-N tuple of integers
# from 0..S whose entries sum to exactly S; each row of ACTIONS is one action.
ACTIONS = np.array(
    list(filter(lambda allocation: sum(allocation) == S,
                product(range(S + 1), repeat=N)))
)
# Integer identifiers of the actions (row positions in ACTIONS).
ACTION_INDICES = range(ACTIONS.shape[0])
ACTIONS

array([[0, 0, 5],
       [0, 1, 4],
       [0, 2, 3],
       [0, 3, 2],
       [0, 4, 1],
       [0, 5, 0],
       [1, 0, 4],
       [1, 1, 3],
       [1, 2, 2],
       [1, 3, 1],
       [1, 4, 0],
       [2, 0, 3],
       [2, 1, 2],
       [2, 2, 1],
       [2, 3, 0],
       [3, 0, 2],
       [3, 1, 1],
       [3, 2, 0],
       [4, 0, 1],
       [4, 1, 0],
       [5, 0, 0]])

In [3]:
def utility(action_i_p_0, action_i_p_1):
    """Return the payoff pair ``(player 0, player 1)`` for one game.

    Both arguments are indices into ``ACTIONS``. The two selected rows are
    compared entry by entry; the player holding the strictly larger entry in
    more positions wins.

    Returns
    -------
    tuple of int
        ``(1, -1)`` if player 0 wins, ``(-1, 1)`` if player 1 wins and
        ``(0, 0)`` if both win the same number of positions.
    """
    allocation_p_0 = ACTIONS[action_i_p_0]
    allocation_p_1 = ACTIONS[action_i_p_1]

    # Positive margin: player 0 won more positions; negative: player 1 did.
    margin = (np.sum(allocation_p_0 > allocation_p_1)
              - np.sum(allocation_p_0 < allocation_p_1))

    if margin > 0:
        return 1, -1
    if margin < 0:
        return -1, 1
    return 0, 0

In [4]:
# regret matching

NUM_ITERATIONS = 20000

# Training samples actions through NumPy's global random state (see
# sample_action), so seed it here: re-running this cell then reproduces the
# same training run instead of a different one every time.
SEED = 0
np.random.seed(SEED)

# Per-action running totals of regret for each player (one entry per action).
regrets_sum_p_0 = np.zeros(len(ACTIONS))
regrets_sum_p_1 = np.zeros(len(ACTIONS))

# Per-action running totals of the strategies played in each iteration;
# dividing by the number of iterations gives the average strategy.
strategy_sum_p_0 = np.zeros(len(ACTIONS))
strategy_sum_p_1 = np.zeros(len(ACTIONS))

def regret_matching_strategy(regrets_sum):
    """Turn cumulative regrets into a mixed strategy.

    Negative regrets are treated as zero and the remaining values are
    normalised so they sum to one. When nothing positive remains (the
    clipped values sum to zero) the uniform distribution is returned.
    The input array is not modified.

    Parameters
    ----------
    regrets_sum : numpy.ndarray
        Cumulative regret per action.

    Returns
    -------
    numpy.ndarray
        Probability per action, same shape as ``regrets_sum``.
    """
    positive_regrets = np.where(regrets_sum < 0, 0, regrets_sum)
    total = np.sum(positive_regrets)
    if total != 0:
        return positive_regrets / total
    # No positive regret anywhere: fall back to playing uniformly at random.
    return np.full(positive_regrets.shape, 1.0 / len(positive_regrets))


def current_regret(action_i_me, action_i_opponent):
    """Return, for every action, the regret of not having played it.

    The regret of an action is the utility it would have earned against
    ``action_i_opponent`` minus the utility actually earned by
    ``action_i_me``. Both arguments are indices into ``ACTIONS``.

    Returns
    -------
    numpy.ndarray
        Float array with one regret value per action.
    """
    earned = utility(action_i_me, action_i_opponent)[0]

    # Utility each alternative action would have obtained against the same
    # opponent action.
    alternatives = [
        utility(candidate_i, action_i_opponent)[0]
        for candidate_i in range(len(ACTIONS))
    ]

    regret = np.zeros(len(ACTIONS))
    regret[:] = alternatives
    regret -= earned
    return regret


def sample_action(strategy):
    """Draw one action index at random according to ``strategy``.

    ``strategy`` gives the probability of each entry of ``ACTION_INDICES``.
    The draw uses NumPy's global random state (``np.random.choice``).
    """
    # A one-element draw; unwrap it so callers get a single index.
    drawn = np.random.choice(ACTION_INDICES, size=1, p=strategy)
    return drawn[0]


# Self-play training: in every iteration both players derive a strategy from
# their cumulative regrets, play one sampled action each, and then add the
# regrets of that round to their totals.
for _ in range(NUM_ITERATIONS):
    strategy_p_0, strategy_p_1 = (
        regret_matching_strategy(regrets_sum_p_0),
        regret_matching_strategy(regrets_sum_p_1),
    )

    # Player 0 is sampled before player 1 (tuple elements evaluate left to
    # right), which fixes the order in which random numbers are consumed.
    action_p_0, action_p_1 = (
        sample_action(strategy_p_0),
        sample_action(strategy_p_1),
    )

    # Accumulate the strategies that were played, for the final average.
    strategy_sum_p_0 += strategy_p_0
    strategy_sum_p_1 += strategy_p_1

    # Each player's regret is measured from its own point of view.
    regrets_sum_p_0 += current_regret(action_p_0, action_p_1)
    regrets_sum_p_1 += current_regret(action_p_1, action_p_0)

# Average strategy over all iterations.
strategy_avg_p_0 = strategy_sum_p_0 / NUM_ITERATIONS
strategy_avg_p_1 = strategy_sum_p_1 / NUM_ITERATIONS

# Sanity check: each averaged strategy must still be a probability distribution.
np.testing.assert_almost_equal(strategy_avg_p_0.sum(), 1.0)
np.testing.assert_almost_equal(strategy_avg_p_1.sum(), 1.0)


def print_strategies():
    """Print one line per action with both players' average probabilities.

    Reads the module-level ``ACTIONS``, ``strategy_avg_p_0`` and
    ``strategy_avg_p_1``; probabilities are shown with four decimals.
    """
    line_format = "{} - player0 p={:.4f} - player1 p={:.4f}"
    for row in zip(ACTIONS, strategy_avg_p_0, strategy_avg_p_1):
        # row is (action, probability for player 0, probability for player 1)
        print(line_format.format(*row))


print_strategies()

[0 0 5] - player0 p=0.0000 - player1 p=0.0000
[0 1 4] - player0 p=0.0001 - player1 p=0.0001
[0 2 3] - player0 p=0.0870 - player1 p=0.1066
[0 3 2] - player0 p=0.1560 - player1 p=0.1103
[0 4 1] - player0 p=0.0004 - player1 p=0.0000
[0 5 0] - player0 p=0.0000 - player1 p=0.0000
[1 0 4] - player0 p=0.0001 - player1 p=0.0001
[1 1 3] - player0 p=0.1136 - player1 p=0.1162
[1 2 2] - player0 p=0.0001 - player1 p=0.0005
[1 3 1] - player0 p=0.0714 - player1 p=0.1087
[1 4 0] - player0 p=0.0003 - player1 p=0.0002
[2 0 3] - player0 p=0.1174 - player1 p=0.1116
[2 1 2] - player0 p=0.0001 - player1 p=0.0001
[2 2 1] - player0 p=0.0001 - player1 p=0.0002
[2 3 0] - player0 p=0.1346 - player1 p=0.1113
[3 0 2] - player0 p=0.0849 - player1 p=0.1045
[3 1 1] - player0 p=0.1277 - player1 p=0.1111
[3 2 0] - player0 p=0.1060 - player1 p=0.1182
[4 0 1] - player0 p=0.0000 - player1 p=0.0001
[4 1 0] - player0 p=0.0002 - player1 p=0.0002
[5 0 0] - player0 p=0.0000 - player1 p=0.0000


In [5]:
# test the strategies

NUM_GAMES = 10000


def play_games(strategy_p_0, strategy_p_1, num_games=NUM_GAMES):
    """Simulate games between two mixed strategies and print the tally.

    In every game each player samples one action from its strategy with
    ``sample_action`` and the result is scored with ``utility``.

    Parameters
    ----------
    strategy_p_0, strategy_p_1
        Probability per action for player 0 and player 1, in the form
        accepted by ``sample_action``.
    num_games : int, optional
        Number of games to simulate. Defaults to ``NUM_GAMES``.

    Returns
    -------
    None
        The counts of player 0 wins, player 1 wins and draws are printed,
        each followed by its fraction of ``num_games`` (reported as 0.0000
        when no games are played).
    """
    wins_p_0 = 0
    wins_p_1 = 0
    draws = 0

    # Play exactly the requested number of games, so the counts match the
    # denominator used for the fractions below.
    for _ in range(num_games):
        action_p_0 = sample_action(strategy_p_0)
        action_p_1 = sample_action(strategy_p_1)
        utility_p_0 = utility(action_p_0, action_p_1)[0]
        if utility_p_0 == 1:
            wins_p_0 += 1
        elif utility_p_0 == -1:
            wins_p_1 += 1
        else:
            draws += 1

    def fraction(count):
        # Avoid dividing by zero when no games were requested.
        return count / num_games if num_games > 0 else 0.0

    print("Player 0 wins: {:d} ({:.4f})".format(wins_p_0, fraction(wins_p_0)))
    print("Player 1 wins: {:d} ({:.4f})".format(wins_p_1, fraction(wins_p_1)))
    print("Draws:         {:d} ({:.4f})".format(draws, fraction(draws)))

In [6]:
# Pit the two averaged strategies produced by training against each other
# and print how often each player wins and how often the game is drawn.
print("Both players trained")
play_games(strategy_avg_p_0, strategy_avg_p_1)

Both players trained
Player 0 wins: 2177 (0.2177)
Player 1 wins: 2227 (0.2227)
Draws:         5596 (0.5596)


In [7]:
# Player 0 uses its averaged trained strategy; player 1 picks every action
# with the same probability.
print("Player 0 trained, player 1 uniform")
strategy_uniform = np.full(len(ACTIONS), 1.0 / len(ACTIONS))
play_games(strategy_avg_p_0, strategy_uniform)

Player 0 trained, player 1 uniform
Player 0 wins: 3037 (0.3037)
Player 1 wins: 1800 (0.1800)
Draws:         5163 (0.5163)


In [8]:
# Test player 0's averaged strategy against every pure strategy in turn:
# player 1 always plays the same single action.
for action_i, action in enumerate(ACTIONS):
    # All probability mass on the one action under test.
    strategy_pure = np.zeros(len(ACTIONS))
    strategy_pure[action_i] = 1.0
    print("Player 0 trained, player 1 always selecting action {}".format(action))
    play_games(strategy_avg_p_0, strategy_pure)
    print()

Player 0 trained, player 1 always selecting action [0 0 5]
Player 0 wins: 5522 (0.5522)
Player 1 wins: 0 (0.0000)
Draws:         4478 (0.4478)

Player 0 trained, player 1 always selecting action [0 1 4]
Player 0 wins: 3104 (0.3104)
Player 1 wins: 2002 (0.2002)
Draws:         4894 (0.4894)

Player 0 trained, player 1 always selecting action [0 2 3]
Player 0 wins: 2030 (0.2030)
Player 1 wins: 2138 (0.2138)
Draws:         5832 (0.5832)

Player 0 trained, player 1 always selecting action [0 3 2]
Player 0 wins: 2380 (0.2380)
Player 1 wins: 2327 (0.2327)
Draws:         5293 (0.5293)

Player 0 trained, player 1 always selecting action [0 4 1]
Player 0 wins: 3177 (0.3177)
Player 1 wins: 2422 (0.2422)
Draws:         4401 (0.4401)

Player 0 trained, player 1 always selecting action [0 5 0]
Player 0 wins: 5200 (0.5200)
Player 1 wins: 0 (0.0000)
Draws:         4800 (0.4800)

Player 0 trained, player 1 always selecting action [1 0 4]
Player 0 wins: 3736 (0.3736)
Player 1 wins: 2413 (0.2413)
Draws: 