Solving Rock-Paper-Scissors using CFR

Paper: http://modelai.gettysburg.edu/2013/cfr/cfr.pdf

In [23]:
import numpy as np

In [24]:
NUM_ACTIONS = 3 # 3 Actions, R, P, S
ROCK = 0
PAPER = 1
SCISSORS = 2

In [25]:
# Pick mixed action according to probability given
def get_action(strategy):
    return np.random.choice(np.arange(NUM_ACTIONS), p=strategy)

In [26]:
# Rock-Paper-Scissors Player class
class RPSPlayer:
    # Can initialize with given strategy
    def __init__(self, strategy_sum=np.zeros(NUM_ACTIONS)):
        self.regret_sum = np.zeros(NUM_ACTIONS) # Cumulative regret table, 0, 0, 0 for R, P, S
        self.strategy_sum = strategy_sum # Cumulative strategy table

    # Trains given number of iterations, by calculating utility, updating sum values.
    def self_train(self, opponent_strategy, iterations=10000):
        action_utility = np.zeros(NUM_ACTIONS)
        for _ in range(iterations):
            strategy = self.get_strategy()
            self_action = get_action(strategy)
            opponent_action = get_action(opponent_strategy)
            
            action_utility[opponent_action] = 0;
            action_utility[(opponent_action + 1) % 3] = 1
            action_utility[(opponent_action - 1) % 3] = -1

            for action in range(NUM_ACTIONS):
                self.regret_sum[action] += action_utility[action] - action_utility[self_action]

    # Gets strategy based on regret sum table
    def get_strategy(self):
        strategy = np.zeros(NUM_ACTIONS) # Strategy table
        normalizing_sum = 0
        for action in range(NUM_ACTIONS):
            strategy[action] = self.regret_sum[action] if (self.regret_sum[action] > 0) else 0
            normalizing_sum += strategy[action]
        for action in range(NUM_ACTIONS):
            if normalizing_sum > 0:
                strategy[action] /= normalizing_sum
            else:
                strategy[action] = 1.0 / NUM_ACTIONS
            self.strategy_sum[action] += strategy[action]
        return strategy

    # Total average strategy, to be used after minimizing regret over many iterations.
    # Uses strategy_sum to normalize, instead of regret_sum
    def get_average_strategy(self):
        average_strategy = np.zeros(NUM_ACTIONS)
        normalizing_sum = sum(self.strategy_sum)
        for action in range(NUM_ACTIONS):
            if normalizing_sum > 0:
                average_strategy[action] = self.strategy_sum[action] / normalizing_sum
            else:
                average_strategy[action] = 1.0 / NUM_ACTIONS
        return average_strategy

In [27]:
# Method to print strategy nicely
def format_strategy(strategy):
    return "Rock %: " + str(strategy[0]) + "\nPaper %: " + str(strategy[1]) + "\nScissors %: " + str(strategy[2]) + "\n"

In [21]:
# Exercise: RPS Equilibrium

unbalanced_strategy = np.array([3, 4, 5])
random_strategy = np.random.dirichlet(alpha=[1, 1, 1]) # Method to get random strategy (sums to 1)
random_strategy2 = np.random.dirichlet(alpha=[1, 1, 1])
player1 = RPSPlayer(random_strategy)
player2 = RPSPlayer(random_strategy2)

# Epoch #'s to print current strategies, to see as they develop
data_points = (0, 10, 100, 1000, 10000, 100000, 500000, 1000000, 2000000)

# View the initial strategies, before regret min
print("Initial strategies:\n")
print("Player 1:\n", format_strategy(player1.get_average_strategy()))
print("Player 2:\n", format_strategy(player2.get_average_strategy()))

for epoch in range(2000001):
    # Set iterations to one in self training, so players can update after each.
    # This minimizes magnitude of walk away from equilibrium, in this case (0.3-, 0.3-, 0.3-)
    player2_strategy = player2.get_strategy()
    player1.self_train(player2_strategy, iterations=1)
    player1_strategy = player1.get_strategy()
    player2.self_train(player1_strategy, iterations=1)
    if epoch in data_points:
        print(f"----- After {epoch} iterations: ----- \n")
        print("Player 1 Strategy:\n", format_strategy(player1.get_average_strategy()))
        print("Player 2 Strategy:\n", format_strategy(player2.get_average_strategy()))


Initial strategies:

Player 1:
 Rock %: 0.6393915768472631
Paper %: 0.041171115761381764
Scissors %: 0.31943730739135523

Player 2:
 Rock %: 0.5643345431415068
Paper %: 0.13211443878454807
Scissors %: 0.3035510180739453

----- After 0 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.43535274783797656
Paper %: 0.23594592747601614
Scissors %: 0.32870132468600727

Player 2 Strategy:
 Rock %: 0.41033373660272443
Paper %: 0.2662603684837382
Scissors %: 0.3234058949135373

----- After 10 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.20171267725422887
Paper %: 0.20468859923600213
Scissors %: 0.593598723509769

Player 2 Strategy:
 Rock %: 0.39844932796267424
Paper %: 0.3390774393674441
Scissors %: 0.26247323266988165

----- After 100 iterations: ----- 

Player 1 Strategy:
 Rock %: 0.37207844356837316
Paper %: 0.2746400072734566
Scissors %: 0.35328154915817017

Player 2 Strategy:
 Rock %: 0.2464188840579309
Paper %: 0.40055576629345413
Scissors %: 0.35302534964861504

----- After 1000 iter

In [None]:
# Next:
# Exercise: Colonel Blotto