Solving Rock-Paper-Scissors using CFR

Paper: http://modelai.gettysburg.edu/2013/cfr/cfr.pdf

In [1]:
import numpy as np
import random
import math
from itertools import combinations_with_replacement

In [24]:
# Action indices for Rock, Paper and Scissors, in that order.
ROCK, PAPER, SCISSORS = range(3)
# Number of pure actions available to each player (R, P, S).
NUM_ACTIONS = 3

In [2]:
# Pick mixed action according to probability given
def get_action(strategy):
    """Sample one action index from a mixed strategy.

    Args:
        strategy: 1-D sequence of probabilities, one entry per action.
            ``np.random.choice`` requires the entries to sum to 1.

    Returns:
        An integer index in ``range(len(strategy))``, drawn with the given
        probabilities from NumPy's global random state.
    """
    # Size the draw from the strategy itself instead of the global
    # NUM_ACTIONS: that global is rebound by the Colonel Blotto cell, so
    # relying on it made this helper depend on which cell ran last.
    return np.random.choice(len(strategy), p=strategy)

In [26]:
# Rock-Paper-Scissors Player class
class RPSPlayer:
    """Regret-matching learner for Rock-Paper-Scissors.

    The player keeps two per-action tables:

    * ``regret_sum``   - cumulative regret for not having played each action.
    * ``strategy_sum`` - cumulative sum of every strategy returned by
      ``get_strategy``; normalizing it gives the average strategy.
    """

    # Can initialize with given strategy
    def __init__(self, strategy_sum=None):
        """Create a player with zero cumulative regret.

        Args:
            strategy_sum: Optional initial cumulative strategy table, one
                entry per action. It is copied into a new float array, so the
                caller's object is never modified and integer inputs do not
                truncate later fractional updates. Defaults to all zeros.
        """
        # Cumulative regret table, 0, 0, 0 for R, P, S
        self.regret_sum = np.zeros(NUM_ACTIONS)
        # Cumulative strategy table. A fresh array is built per instance: a
        # default of np.zeros(...) in the signature would be created once and
        # then shared (and mutated) by every player that relied on it.
        if strategy_sum is None:
            self.strategy_sum = np.zeros(NUM_ACTIONS)
        else:
            self.strategy_sum = np.array(strategy_sum, dtype=float)

    # Trains given number of iterations, by calculating utility, updating sum values.
    def self_train(self, opponent_strategy, iterations=10000):
        """Play sampled rounds against a fixed opponent strategy and accumulate regret.

        Args:
            opponent_strategy: Probabilities the opponent uses for each action.
            iterations: Number of sampled rounds to play.

        Each round calls ``get_strategy`` (which also adds to ``strategy_sum``),
        samples one action for each side, and adds to ``regret_sum`` the
        difference between every action's utility and the utility of the
        action that was actually sampled for this player.
        """
        action_utility = np.zeros(NUM_ACTIONS)
        for _ in range(iterations):
            strategy = self.get_strategy()
            # Keep this sampling order: own action first, then the opponent's.
            self_action = get_action(strategy)
            opponent_action = get_action(opponent_strategy)

            # Cyclic dominance: matching the opponent ties, the next action
            # up (mod NUM_ACTIONS) wins, the previous one loses.
            action_utility[opponent_action] = 0
            action_utility[(opponent_action + 1) % NUM_ACTIONS] = 1
            action_utility[(opponent_action - 1) % NUM_ACTIONS] = -1

            self.regret_sum += action_utility - action_utility[self_action]

    # Gets strategy based on regret sum table
    def get_strategy(self):
        """Return the current regret-matching strategy and record it.

        Positive cumulative regrets are normalized into probabilities; when no
        action has positive regret the uniform strategy is returned.

        Side effect: the returned strategy is added to ``strategy_sum`` on
        every call, so each call contributes to the average strategy.

        Returns:
            A new float array of length ``NUM_ACTIONS``.
        """
        positive_regret = np.maximum(self.regret_sum, 0)
        normalizing_sum = positive_regret.sum()
        if normalizing_sum > 0:
            strategy = positive_regret / normalizing_sum
        else:
            strategy = np.full(NUM_ACTIONS, 1.0 / NUM_ACTIONS)
        self.strategy_sum += strategy
        return strategy

    # Total average strategy, to be used after minimizing regret over many iterations.
    # Uses strategy_sum to normalize, instead of regret_sum
    def get_average_strategy(self):
        """Return ``strategy_sum`` normalized to sum to 1.

        Falls back to the uniform strategy when ``strategy_sum`` does not have
        a positive total. Does not modify any state.

        Returns:
            A new float array of length ``NUM_ACTIONS``.
        """
        normalizing_sum = self.strategy_sum.sum()
        if normalizing_sum > 0:
            return self.strategy_sum / normalizing_sum
        return np.full(NUM_ACTIONS, 1.0 / NUM_ACTIONS)

In [27]:
# Method to print strategy nicely
def format_strategy(strategy):
    """Render a 3-entry strategy as one labelled line per action.

    Entries 0, 1 and 2 are labelled Rock, Paper and Scissors respectively and
    printed with ``str``; every line, including the last, ends in a newline.
    """
    labels = ("Rock", "Paper", "Scissors")
    return "".join(
        f"{label} %: {strategy[index]!s}\n" for index, label in enumerate(labels)
    )

In [21]:
# Exercise: RPS Equilibrium
# Two RPSPlayer instances train against each other by regret matching, and
# their average strategies are printed at selected epochs.
# No random seed is set in this cell, so the printed numbers differ between runs.

# Defined here but not referenced again in this cell.
unbalanced_strategy = np.array([3, 4, 5])
random_strategy = np.random.dirichlet(alpha=[1, 1, 1]) # Random strategy drawn from Dirichlet(1, 1, 1) (sums to 1)
random_strategy2 = np.random.dirichlet(alpha=[1, 1, 1])
# Each random strategy is passed in as the player's initial strategy_sum table.
player1 = RPSPlayer(random_strategy)
player2 = RPSPlayer(random_strategy2)

# Epoch #'s to print current strategies, to see as they develop
data_points = (0, 10, 100, 1000, 10000, 100000, 500000, 1000000, 2000000)

# View the initial strategies, before regret min.
# get_average_strategy() normalizes strategy_sum, which at this point holds
# only the random initial values.
print("Initial strategies:\n")
print("Player 1:\n", format_strategy(player1.get_average_strategy()))
print("Player 2:\n", format_strategy(player2.get_average_strategy()))

# 2,000,001 epochs so that the final checkpoint (epoch index 2000000) is reached.
for epoch in range(2000001):
    # Set iterations to one in self training, so players can update after each.
    # This minimizes magnitude of walk away from equilibrium, in this case (1/3, 1/3, 1/3).
    # Note: get_strategy() adds the strategy it returns to that player's
    # strategy_sum, so the explicit calls below (as well as the call inside
    # self_train) contribute to the running averages.
    player2_strategy = player2.get_strategy()
    player1.self_train(player2_strategy, iterations=1)
    player1_strategy = player1.get_strategy()
    player2.self_train(player1_strategy, iterations=1)
    # The snapshot is printed after this epoch's training, so the "After 0
    # iterations" report already includes one training round per player.
    if epoch in data_points:
        print(f"----- After {epoch} iterations: ----- \n")
        print("Player 1 Strategy:\n", format_strategy(player1.get_average_strategy()))
        print("Player 2 Strategy:\n", format_strategy(player2.get_average_strategy()))


Initial strategies:

Player 1:
 Rock %: 0.6393915768472631
Paper %: 0.041171115761381764
Scissors %: 0.31943730739135523

Player 2:
 Rock %: 0.5643345431415068
Paper %: 0.13211443878454807
Scissors %: 0.3035510180739453

----- After 0 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.43535274783797656
Paper %: 0.23594592747601614
Scissors %: 0.32870132468600727

Player 2 Strategy:
 Rock %: 0.41033373660272443
Paper %: 0.2662603684837382
Scissors %: 0.3234058949135373

----- After 10 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.20171267725422887
Paper %: 0.20468859923600213
Scissors %: 0.593598723509769

Player 2 Strategy:
 Rock %: 0.39844932796267424
Paper %: 0.3390774393674441
Scissors %: 0.26247323266988165

----- After 100 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.37207844356837316
Paper %: 0.2746400072734566
Scissors %: 0.35328154915817017

Player 2 Strategy:
 Rock %: 0.2464188840579309
Paper %: 0.40055576629345413
Scissors %: 0.35302534964861504

----- After 1000 iter

In [24]:
# Exercise: Colonel Blotto
# Game with N battlefields and S soldiers per team; two teams.
# The team that captures the most battlefields wins.
# A team captures a battlefield if it sends more soldiers there than the opponent.
# A team may send any number of soldiers (including 0) to a battlefield; the
# allocations enumerated below always sum to exactly S.
# The original note here targeted N = 3, S = 5; the values set below are
# N = 2, S = 3.
# NOTE(review): with only two battlefields and both sides fielding exactly S
# soldiers, winning one battlefield means losing the other, so
# strategy_utility is 0 for every matchup - confirm N = 2 is intended.
NUM_BATTLEFIELDS = 2
NUM_SOLDIERS = 3

# A pure strategy is an ordered allocation (S1, ..., SN) with 0 <= Sn <= S and
# S1 + ... + SN = S. Counting them is a stars-and-bars problem:
# C(S + N - 1, N - 1), i.e. C(7, 2) = 21 for N = 3, S = 5 and C(4, 1) = 4 for
# the N = 2, S = 3 configured here.
# This rebinds the NUM_ACTIONS defined for the Rock-Paper-Scissors cells.
NUM_ACTIONS = math.comb(NUM_SOLDIERS + NUM_BATTLEFIELDS - 1, NUM_BATTLEFIELDS - 1)


# Each combination assigns every soldier a battlefield index; counting how many
# soldiers landed on each index turns it into a per-battlefield allocation.
ALL_STRATEGIES = [
    [soldier_assignment.count(field) for field in range(NUM_BATTLEFIELDS)]
    for soldier_assignment in combinations_with_replacement(range(NUM_BATTLEFIELDS), NUM_SOLDIERS)
]

# Action index -> allocation list (the same list objects held in ALL_STRATEGIES).
PURE_STRATEGIES = dict(enumerate(ALL_STRATEGIES))

# Method that returns the utility (-1, 0, 1) for strategy1
def strategy_utility(strategy1, strategy2):
    """Score the pure strategy with index ``strategy1`` against index ``strategy2``.

    Both arguments are keys into PURE_STRATEGIES. Each battlefield where the
    two allocations differ counts +1 for the side with more soldiers and -1
    for the other; the result is the sign of that total (1.0, -1.0, or 0 when
    the battlefield count is level).
    """
    own_allocation = PURE_STRATEGIES[strategy1]
    rival_allocation = PURE_STRATEGIES[strategy2]
    margin = 0
    for field in range(NUM_BATTLEFIELDS):
        gap = own_allocation[field] - rival_allocation[field]
        if gap > 0:
            margin += 1.0
        elif gap < 0:
            margin -= 1.0
    if margin == 0:
        return margin
    return margin / abs(margin)

In [25]:
# War Commander class
class WarCommander:
    """Regret-matching learner for the Colonel Blotto game.

    Actions are indices into PURE_STRATEGIES. The commander keeps two
    per-action tables:

    * ``regret_sum``   - cumulative regret for not having played each action.
    * ``strategy_sum`` - cumulative sum of every strategy returned by
      ``get_strategy``; normalizing it gives the average strategy.
    """

    # Can initialize with given strategy
    def __init__(self, strategy_sum=None):
        """Create a commander with zero cumulative regret.

        Args:
            strategy_sum: Optional initial cumulative strategy table, one
                entry per action. It is copied into a new float array, so the
                caller's object is never modified and integer inputs do not
                truncate later fractional updates. Defaults to all zeros.
        """
        # Cumulative regret table for each pure action
        self.regret_sum = np.zeros(NUM_ACTIONS)
        # Cumulative strategy table. A fresh array is built per instance: a
        # default of np.zeros(...) in the signature would be created once and
        # then shared (and mutated) by every commander that relied on it.
        if strategy_sum is None:
            self.strategy_sum = np.zeros(NUM_ACTIONS)
        else:
            self.strategy_sum = np.array(strategy_sum, dtype=float)

    # Trains given number of iterations, by calculating utility, updating sum values.
    def self_train(self, opponent_strategy, iterations=1):
        """Play sampled rounds against a fixed opponent strategy and accumulate regret.

        Args:
            opponent_strategy: Probabilities the opponent uses for each action.
            iterations: Number of sampled rounds to play.

        Each round calls ``get_strategy`` (which also adds to ``strategy_sum``),
        samples one action for each side, scores every action against the
        opponent's sampled action with ``strategy_utility``, and adds to
        ``regret_sum`` the difference between each action's utility and the
        utility of the action that was actually sampled for this commander.
        """
        for _ in range(iterations):
            strategy = self.get_strategy()
            # Keep this sampling order: own action first, then the opponent's.
            self_action = get_action(strategy)
            opponent_action = get_action(opponent_strategy)

            action_utility = np.array(
                [strategy_utility(action, opponent_action) for action in range(NUM_ACTIONS)],
                dtype=float,
            )

            self.regret_sum += action_utility - action_utility[self_action]

    # Gets strategy based on regret sum table
    def get_strategy(self):
        """Return the current regret-matching strategy and record it.

        Positive cumulative regrets are normalized into probabilities; when no
        action has positive regret the uniform strategy is returned.

        Side effect: the returned strategy is added to ``strategy_sum`` on
        every call, so each call contributes to the average strategy.

        Returns:
            A new float array of length ``NUM_ACTIONS``.
        """
        positive_regret = np.maximum(self.regret_sum, 0)
        normalizing_sum = positive_regret.sum()
        if normalizing_sum > 0:
            strategy = positive_regret / normalizing_sum
        else:
            strategy = np.full(NUM_ACTIONS, 1.0 / NUM_ACTIONS)
        self.strategy_sum += strategy
        return strategy

    # Total average strategy, to be used after minimizing regret over many iterations.
    # Uses strategy_sum to normalize, instead of regret_sum
    def get_average_strategy(self):
        """Return ``strategy_sum`` normalized to sum to 1.

        Falls back to the uniform strategy when ``strategy_sum`` does not have
        a positive total. Does not modify any state.

        Returns:
            A new float array of length ``NUM_ACTIONS``.
        """
        normalizing_sum = self.strategy_sum.sum()
        if normalizing_sum > 0:
            return self.strategy_sum / normalizing_sum
        return np.full(NUM_ACTIONS, 1.0 / NUM_ACTIONS)

In [28]:
# Method to print strategy nicely
def format_strategy(strategy):
    """Render a Blotto strategy as one line per sufficiently likely action.

    Actions whose probability is below 1/(2 * NUM_ACTIONS) are left out. Each
    remaining action is shown as its PURE_STRATEGIES allocation followed by
    its probability, one newline-terminated line per action.
    """
    return ''.join(
        f"Chance of playing: {PURE_STRATEGIES[index]!s} is: %{strategy[index]!s}\n"
        for index in range(NUM_ACTIONS)
        if not strategy[index] < 1/(NUM_ACTIONS * 2)
    )

In [29]:
# Exercise: Colonel Blotto Equilibrium
# Two WarCommander instances train against each other by regret matching, and
# their average strategies are printed at selected epochs.
# No random seed is set in this cell, so the printed numbers differ between runs.

random_strategy = np.random.dirichlet(alpha=np.ones(NUM_ACTIONS)) # Random strategy drawn from a flat Dirichlet (sums to 1)
random_strategy2 = np.random.dirichlet(alpha=np.ones(NUM_ACTIONS))
# Each random strategy is passed in as the commander's initial strategy_sum table.
player1 = WarCommander(random_strategy)
player2 = WarCommander(random_strategy2)

# Epoch #'s to print current strategies, to see as they develop.
# The loop below stops after epoch index 100000, so the larger checkpoints
# listed here are never reached.
data_points = (0, 10, 100, 1000, 10000, 100000, 500000, 1000000, 2000000)

# View the initial strategies, before regret min.
# format_strategy omits actions whose probability is below 1/(2 * NUM_ACTIONS),
# so fewer than NUM_ACTIONS lines may be printed.
print("Initial strategies:\n")
print("Player 1:\n", format_strategy(player1.get_average_strategy()))
print("Player 2:\n", format_strategy(player2.get_average_strategy()))

for epoch in range(100001):
    # Set iterations to one in self training, so players can update after each.
    # This minimizes magnitude of walk away from equilibrium.
    # Note: get_strategy() adds the strategy it returns to that commander's
    # strategy_sum, so the explicit calls below (as well as the call inside
    # self_train) contribute to the running averages.
    player2_strategy = player2.get_strategy()
    player1.self_train(player2_strategy, iterations=1)
    player1_strategy = player1.get_strategy()
    player2.self_train(player1_strategy, iterations=1)
    # The snapshot is printed after this epoch's training, so the "After 0
    # iterations" report already includes one training round per commander.
    if epoch in data_points:
        print(f"----- After {epoch} iterations: ----- \n")
        print("Player 1 Strategy:\n", format_strategy(player1.get_average_strategy()))
        print("Player 2 Strategy:\n", format_strategy(player2.get_average_strategy()))


Initial strategies:

Player 1:
 Chance of playing: [2, 1] is: %0.5107274294947752
Chance of playing: [0, 3] is: %0.3567486443787358

Player 2:
 Chance of playing: [3, 0] is: %0.1983744598448869
Chance of playing: [2, 1] is: %0.37013180346587815
Chance of playing: [1, 2] is: %0.39364440537985546

----- After 0 iterations: ----- 

Player 1 Strategy:
 Chance of playing: [3, 0] is: %0.17092348428178186
Chance of playing: [2, 1] is: %0.3369091431649251
Chance of playing: [1, 2] is: %0.20658449109371438
Chance of playing: [0, 3] is: %0.2855828814595786

Player 2 Strategy:
 Chance of playing: [3, 0] is: %0.2327914866149623
Chance of playing: [2, 1] is: %0.29004393448862603
Chance of playing: [1, 2] is: %0.2978814684599518
Chance of playing: [0, 3] is: %0.17928311043645984

----- After 10 iterations: ----- 

Player 1 Strategy:
 Chance of playing: [3, 0] is: %0.23968567186284112
Chance of playing: [2, 1] is: %0.261335975195425
Chance of playing: [1, 2] is: %0.24433710753396276
Chance of playing