In [38]:
import numpy as np
import matplotlib.pyplot as plt

# Suppress scientific notation when printing arrays, so that small
# probabilities are shown as fixed-point decimals.
np.set_printoptions(suppress=True)

In [53]:
class RPSTrainer:
    """Regret-matching trainer for Rock-Paper-Scissors against a fixed opponent.

    Actions are the indices ``0 .. NUM_ACTIONS - 1``, ordered cyclically so that
    action ``(a + 1) % NUM_ACTIONS`` beats action ``a``.

    Attributes:
        NUM_ACTIONS: number of available actions (3).
        regretSum: cumulative regret per action.
        strategySum: sum of every strategy returned by ``getStrategy``.
        oppStrategy: the opponent's fixed mixed strategy (probability vector).
    """

    def __init__(self, oppStrategy=None, seed=None):
        """Create a trainer.

        Args:
            oppStrategy: optional probability vector of length ``NUM_ACTIONS``
                describing the opponent's fixed mixed strategy. Defaults to
                the uniform strategy.
            seed: optional seed. When given, actions are sampled from a
                dedicated ``np.random.default_rng(seed)`` generator so that
                runs are reproducible. When omitted, the global ``np.random``
                state is used.

        Raises:
            ValueError: if ``oppStrategy`` does not have ``NUM_ACTIONS``
                entries, has negative entries, or does not sum to 1.
        """
        self.NUM_ACTIONS = 3
        self.regretSum = np.zeros(self.NUM_ACTIONS)
        self.strategySum = np.zeros(self.NUM_ACTIONS)

        if oppStrategy is None:
            oppStrategy = np.full(self.NUM_ACTIONS, 1 / self.NUM_ACTIONS)
        oppStrategy = np.asarray(oppStrategy, dtype=float)
        if oppStrategy.shape != (self.NUM_ACTIONS,):
            raise ValueError(
                "oppStrategy must contain exactly %d probabilities" % self.NUM_ACTIONS
            )
        if np.any(oppStrategy < 0) or not np.isclose(oppStrategy.sum(), 1.0):
            raise ValueError("oppStrategy must be non-negative and sum to 1")
        self.oppStrategy = oppStrategy

        # Both the ``np.random`` module and a ``Generator`` expose ``choice``.
        self._rng = np.random if seed is None else np.random.default_rng(seed)

    def getStrategy(self):
        """Return the current regret-matching strategy.

        Each action's probability is proportional to its positive cumulative
        regret; if no action has positive regret the strategy is uniform.

        Side effect: the returned strategy is added to ``strategySum``.
        """
        # ``clip`` returns a new array, so ``regretSum`` itself is not modified.
        strategy = self.regretSum.clip(min=0)
        normalizingSum = np.sum(strategy)
        if normalizingSum > 0:
            strategy /= normalizingSum
        else:
            strategy = np.repeat(1 / self.NUM_ACTIONS, self.NUM_ACTIONS)
        self.strategySum += strategy
        return strategy

    def getAction(self, strategy):
        """Sample an action index, index ``i`` having probability ``strategy[i]``."""
        return self._rng.choice(len(strategy), p=strategy)

    def train(self, iterations):
        """Play ``iterations`` rounds against ``oppStrategy``, accumulating regrets.

        Each round samples an action for both sides, computes the utility every
        action would have earned against the opponent's action, and adds the
        difference to the utility actually earned to ``regretSum``.
        """
        actionUtility = np.zeros(self.NUM_ACTIONS)

        for _ in range(iterations):
            strategy = self.getStrategy()
            myAction = self.getAction(strategy)
            otherAction = self.getAction(self.oppStrategy)

            # Tie with the same action, win with the next one (cyclically),
            # lose with the previous one.
            actionUtility[otherAction] = 0
            actionUtility[(otherAction + 1) % self.NUM_ACTIONS] = 1
            actionUtility[(otherAction - 1) % self.NUM_ACTIONS] = -1

            # Regret of not having played each action instead of ``myAction``.
            self.regretSum += actionUtility - actionUtility[myAction]

    def getAverageStrategy(self):
        """Return ``strategySum`` normalised to a probability vector.

        Falls back to the uniform strategy when nothing has been accumulated.
        """
        normalizingSum = np.sum(self.strategySum)
        if normalizingSum > 0:
            avgStrategy = self.strategySum / normalizingSum
        else:
            avgStrategy = np.repeat(1 / self.NUM_ACTIONS, self.NUM_ACTIONS)
        return avgStrategy

In [56]:
# Train against the trainer's default opponent strategy for one million
# iterations, then print the resulting average strategy.
# NOTE(review): the default opponent is uniform, and against a uniform
# opponent every action has the same expected payoff, so the average
# strategy need not settle on any particular distribution.
trainer = RPSTrainer()
trainer.train(1000000)
print(trainer.getAverageStrategy())

[0.36640582 0.40592287 0.22767132]


In [45]:
def select_action(probs):
    """
    Draw a random action index, where index ``i`` is picked with
    probability ``probs[i]``.
    """
    num_actions = len(probs)
    return np.random.choice(num_actions, p=probs)

def get_probs_from_regret_sum(regret_sum):
    """
    Turn cumulative regrets into a probability vector (regret matching).

    Negative regrets are treated as zero and the remainder is normalised to
    sum to one. If no positive regret is left, the uniform distribution over
    all actions is returned. The input is not modified.
    """
    positive_regret = np.maximum(regret_sum, 0)
    total = positive_regret.sum()

    if total != 0:
        return positive_regret / total

    num_actions = len(positive_regret)
    return np.ones(num_actions) / num_actions

In [67]:
# Probability vector over the three actions.
opponent_policy = np.array([0.4, 0.3, 0.3])

# Action indices.
ROCK, PAPER, SCISSORS = range(3)


def utility(player_action, opponent_action):
    """
    Payoff to the player for one round.

    Returns 0 when both actions are equal, 1 when the player's action beats
    the opponent's (rock beats scissors, paper beats rock, scissors beats
    paper), and -1 in every other case.
    """
    if player_action == opponent_action:
        return 0

    player_wins = (
        (player_action == ROCK and opponent_action == SCISSORS)
        or (player_action == PAPER and opponent_action == ROCK)
        or (player_action == SCISSORS and opponent_action == PAPER)
    )
    return 1 if player_wins else -1


def train(iterations):
    """
    Run regret-matching self-play for the given number of iterations.

    Both the player and the opponent start from a uniform policy, sample an
    action each round, and update their cumulative regrets. Only the length
    of the module-level ``opponent_policy`` is used here, as the number of
    actions.

    Args:
        iterations: number of rounds to play.

    Returns:
        A tuple ``(current_policy, average_policy)``: the player's policy
        after the last update, and the mean of the player's policies recorded
        after each round. For ``iterations <= 0`` both are the initial
        uniform policy.
    """
    num_actions = len(opponent_policy)

    regret_sums = np.zeros(num_actions)
    opp_regret_sums = np.zeros(num_actions)

    strat_sums = np.zeros(num_actions)

    current_policy = get_probs_from_regret_sum(regret_sums)
    opp_current_policy = get_probs_from_regret_sum(opp_regret_sums)

    if iterations <= 0:
        # Nothing was played; avoid dividing the strategy sum by zero.
        return current_policy, current_policy.copy()

    for _ in range(iterations):
        # Get the current action
        action = select_action(current_policy)
        # Get the opponent's action
        opponent_action = select_action(opp_current_policy)

        # Player regrets: what each alternative would have earned against the
        # opponent's actual action, relative to what was earned.
        current_utility = utility(action, opponent_action)
        for alternative_action in range(num_actions):
            if alternative_action == action:
                continue

            alternative_utility = utility(alternative_action, opponent_action)
            regret = alternative_utility - current_utility

            regret_sums[alternative_action] += regret

        # Opponent regrets: each alternative must be evaluated against the
        # player's actual action, from the opponent's point of view.
        current_opp_utility = utility(opponent_action, action)
        for alternative_opponent_action in range(num_actions):
            if alternative_opponent_action == opponent_action:
                continue

            alternative_opp_utility = utility(alternative_opponent_action, action)
            opp_regret = alternative_opp_utility - current_opp_utility

            opp_regret_sums[alternative_opponent_action] += opp_regret

        # Update the current policy
        current_policy = get_probs_from_regret_sum(regret_sums)
        opp_current_policy = get_probs_from_regret_sum(opp_regret_sums)

        strat_sums += current_policy

    return current_policy, strat_sums / iterations

In [68]:
# Run 100,000 self-play iterations; the cell displays the returned
# (final policy, average policy) pair.
train(100000)

(array([0.15666667, 0.        , 0.84333333]),
 array([0.33259869, 0.33486345, 0.33253787]))