In [1]:
import random
import numpy as np

In [44]:
class RPSTrainer():
    """Two-player regret-matching trainer for a three-action cyclic game.

    Actions are the integers 0, 1 and 2; action ``(x + 1) % 3`` beats action
    ``x`` (utility +1), action ``(x - 1) % 3`` loses to it (utility -1) and a
    tie is worth 0.  All per-player state is indexed by player (0 or 1).

    Attributes:
        regret_sum: cumulative regret of every action, per player.
        strategy: current mixed strategy per player; entries lie in [0, 1]
            and sum to 1.
        strategy_sum: running sum of the strategies returned by
            ``get_strategy``, per player; its total equals the number of
            ``get_strategy`` calls made for that player.
    """

    # Number of actions available to each player.  The cyclic payoff built in
    # ``_action_utilities`` is only meaningful for three actions.
    NUM_ACTIONS = 3

    def __init__(self):
        n = self.NUM_ACTIONS
        self.regret_sum = [[0] * n for _ in range(2)]
        # Start from the uniform distribution (what regret matching yields for
        # zero regret).  An all-zero "strategy" is not a distribution, and
        # sampling from it always selects the last action, which biased the
        # very first round of training for player 1.
        self.strategy = [[1.0 / n] * n for _ in range(2)]
        self.strategy_sum = [[0] * n for _ in range(2)]

    def get_strategy(self, player):
        """Compute the player's current mixed strategy by regret matching.

        Positive cumulative regrets are normalised into a distribution; when
        no action has positive regret the uniform distribution is used.  The
        result is stored in ``self.strategy[player]`` and also added to
        ``self.strategy_sum[player]``.

        Args:
            player: index of the player (0 or 1).

        Returns:
            The new strategy as a list of ``NUM_ACTIONS`` probabilities.
        """
        n = self.NUM_ACTIONS
        positive_regrets = [r if r > 0 else 0 for r in self.regret_sum[player]]
        normalizing_sum = sum(positive_regrets)
        if normalizing_sum > 0:
            strategy = [r / normalizing_sum for r in positive_regrets]
        else:
            strategy = [1.0 / n] * n
        self.strategy[player] = strategy
        self.strategy_sum[player] = [
            total + p for total, p in zip(self.strategy_sum[player], strategy)
        ]
        return strategy

    def get_action(self, strategy):
        """Sample an action index according to a mixed strategy.

        Args:
            strategy: sequence of ``NUM_ACTIONS`` probabilities.

        Returns:
            The sampled action index.  The last action is returned whenever
            the draw is not below the cumulative probability of the earlier
            actions, so rounding error in the probabilities cannot produce an
            out-of-range index.
        """
        r = np.random.rand()
        cumulative_probability = 0
        last_action = self.NUM_ACTIONS - 1
        for action in range(last_action):
            cumulative_probability += strategy[action]
            if r < cumulative_probability:
                return action
        return last_action

    def _action_utilities(self, opponent_action):
        """Return the utility of each own action against ``opponent_action``."""
        n = self.NUM_ACTIONS
        utilities = [0] * n
        utilities[(opponent_action + 1) % n] = 1
        utilities[(opponent_action - 1) % n] = -1
        return utilities

    def _accumulate_regret(self, player, own_action, opponent_action):
        """Add this round's regret of every action to ``regret_sum[player]``."""
        utilities = self._action_utilities(opponent_action)
        realized = utilities[own_action]
        regrets = self.regret_sum[player]
        for action, utility in enumerate(utilities):
            regrets[action] += utility - realized

    def train(self, iterations):
        """Run regret-matching self-play.

        Executes ``iterations // 2`` rounds.  Each round performs one regret
        update for player 0 followed by one for player 1, and every update
        samples a fresh pair of actions.

        Args:
            iterations: total number of regret updates, split between the two
                players.
        """
        for _ in range(iterations // 2):
            # Update player 0: refresh its strategy, then play it against
            # player 1's most recent strategy.
            self.get_strategy(0)
            action_a = self.get_action(self.strategy[0])
            action_b = self.get_action(self.strategy[1])
            self._accumulate_regret(0, action_a, action_b)

            # Update player 1: refresh its strategy, then play it against
            # player 0's strategy from this round.
            self.get_strategy(1)
            action_b = self.get_action(self.strategy[1])
            action_a = self.get_action(self.strategy[0])
            self._accumulate_regret(1, action_b, action_a)

    def get_average_strategy(self, player):
        """Return the player's average strategy over all training rounds.

        Args:
            player: index of the player (0 or 1).

        Returns:
            ``strategy_sum[player]`` normalised to sum to 1, or the uniform
            distribution when nothing has been accumulated yet.
        """
        n = self.NUM_ACTIONS
        totals = self.strategy_sum[player]
        normalizing_sum = sum(totals)
        if normalizing_sum > 0:
            return [total / normalizing_sum for total in totals]
        return [1.0 / n] * n
    



In [45]:
# Train the two-player model, then print each player's average strategy
# (player 0 first, player 1 second) on a single line.
trainer = RPSTrainer()
trainer.train(iterations=100000)
print(*(trainer.get_average_strategy(player) for player in (0, 1)))

[0.3394837076091029, 0.3292360557085543, 0.33128023668234274] [0.3365072245306505, 0.3292903994727815, 0.334202375996568]


In [None]:
# definitions
# Cumulative regret of each of the three actions.
regret_sum = [0] * 3
# Current mixed strategy: regret_sum with negative entries replaced by 0 and
# then normalised, so each entry is in [0, 1] and the entries sum to 1.
strategy = [0] * 3
# Sum of the strategies from every iteration; its total equals the number of
# iterations run.
strategy_sum = [0] * 3
# Fixed mixed strategy from which the opponent's action is sampled.
opp_strategy = [0.4, 0.4, 0.2]

In [None]:
# get current mixed strategy through regret-matching
def get_strategy():
    """Compute the current mixed strategy by regret matching.

    Reads the module-level ``regret_sum``, rebinds the module-level
    ``strategy`` to the new distribution and adds that distribution to the
    module-level ``strategy_sum``.

    Returns:
        The new strategy as a list of three probabilities: positive regrets
        normalised to sum to 1, or the uniform distribution when no action has
        positive regret.
    """
    # Both names are rebound below; without this declaration Python treats
    # them as locals and the function fails with UnboundLocalError.
    global strategy, strategy_sum
    # Keep only positive regret; negative regret contributes nothing.
    strategy = [regret_sum[a] if regret_sum[a] > 0 else 0 for a in range(3)]
    normalizing_sum = sum(strategy)
    if normalizing_sum > 0:
        strategy = [p / normalizing_sum for p in strategy]
    else:
        strategy = [1.0 / 3] * 3
    strategy_sum = [total + p for total, p in zip(strategy_sum, strategy)]
    return strategy

In [None]:
# get random action according to mixed-strategy distribution
def get_action(strategy):
    """Draw one action index (0, 1 or 2) from a three-entry mixed strategy.

    A single uniform sample from ``np.random.rand()`` is compared with the
    running total of the first two probabilities; if it falls below neither,
    the last action (2) is returned.
    """
    draw = np.random.rand()
    running_total = 0
    for index in range(2):
        running_total += strategy[index]
        if draw < running_total:
            return index
    return 2

In [None]:
# Module-level iteration count. Inside train() the parameter of the same name
# shadows this value, so it only takes effect if passed in explicitly.
iterations = 10

# training algorithm
def train(iterations):
    """Run ``iterations`` rounds of regret matching against ``opp_strategy``.

    Each round refreshes the strategy via ``get_strategy()``, samples our
    action and then the opponent's action, and adds the regret of every
    action to the module-level ``regret_sum`` in place.
    """
    for _ in range(iterations):
        # Sample both players' actions (each is 0, 1 or 2).
        current_strategy = get_strategy()
        my_action = get_action(current_strategy)
        other_action = get_action(opp_strategy)

        # Utility of each of our actions against the opponent's action:
        # tie -> 0, the next action cyclically -> +1, the previous one -> -1.
        payoffs = [0, 0, 0]
        payoffs[(other_action + 1) % 3] = 1
        payoffs[(other_action - 1) % 3] = -1

        # Regret = what an action would have earned minus what we earned.
        earned = payoffs[my_action]
        for index, payoff in enumerate(payoffs):
            regret_sum[index] += payoff - earned

In [None]:
# get average mixed strategy across all training iterations (similar to get_strategy function)
def get_average_strategy():
    """Return the module-level ``strategy_sum`` normalised to sum to 1.

    Falls back to the uniform distribution over the three actions when the
    accumulated total is not positive.
    """
    total = sum(strategy_sum)
    if total > 0:
        return [strategy_sum[index] / total for index in range(3)]
    return [1.0 / 3] * 3

In [None]:
# main method initializing computation
def main():
    """Train an ``RPSTrainer`` for 1,000,000 iterations and report the result.

    Prints the average strategy of player 0 followed by that of player 1.
    """
    trainer = RPSTrainer()
    trainer.train(1000000)
    # Report the outcome; otherwise the trained model is discarded when the
    # function returns and the run has no observable result.
    print(trainer.get_average_strategy(0), trainer.get_average_strategy(1))
    