In [3]:
import random
import numpy as np

In [4]:
class RPSTrainer():
    """Self-play regret-matching trainer for two-player rock-paper-scissors.

    Actions are indexed 0, 1, 2, and action ``(k + 1) % 3`` beats action ``k``.
    Players are indexed 0 and 1. Per-player state is kept in lists holding one
    3-element list per player:

    * ``regret_sum``   -- cumulative regret of each action.
    * ``strategy``     -- current mixed strategy: the positive part of
      ``regret_sum`` normalised to sum to 1, or uniform when no regret is
      positive.
    * ``strategy_sum`` -- sum of every strategy produced by ``get_strategy``;
      its total equals the number of ``get_strategy`` calls for that player.
    """

    NUM_PLAYERS = 2
    NUM_ACTIONS = 3

    def __init__(self):
        """Create a trainer with zero regrets and uniform current strategies."""
        n = self.NUM_ACTIONS
        self.regret_sum = [[0] * n for _ in range(self.NUM_PLAYERS)]
        # Start from the uniform distribution instead of all zeros: train()
        # samples player 1's action before get_strategy(1) has ever run, and
        # sampling from an all-zero list always yields the last action.
        self.strategy = [[1.0 / n] * n for _ in range(self.NUM_PLAYERS)]
        self.strategy_sum = [[0] * n for _ in range(self.NUM_PLAYERS)]

    def get_strategy(self, player):
        """Recompute ``player``'s mixed strategy through regret matching.

        The new strategy is stored in ``self.strategy[player]`` and added to
        ``self.strategy_sum[player]``.

        Args:
            player: Player index (0 or 1).

        Returns:
            The player's current strategy as a list of 3 probabilities.
        """
        positive_regret = [r if r > 0 else 0 for r in self.regret_sum[player]]
        normalizing_sum = sum(positive_regret)
        if normalizing_sum > 0:
            current = [r / normalizing_sum for r in positive_regret]
        else:
            # No action has positive regret: fall back to uniform play.
            current = [1.0 / self.NUM_ACTIONS] * self.NUM_ACTIONS
        self.strategy[player] = current
        self.strategy_sum[player] = [
            total + p for total, p in zip(self.strategy_sum[player], current)
        ]
        return current

    def get_action(self, strategy):
        """Sample an action index according to a mixed strategy.

        Args:
            strategy: Sequence of 3 probabilities.

        Returns:
            The sampled action index. The last action is returned whenever the
            random draw is not below the cumulative probability of the
            preceding actions.
        """
        r = np.random.rand()
        cumulative_probability = 0
        for action in range(self.NUM_ACTIONS - 1):
            cumulative_probability += strategy[action]
            if r < cumulative_probability:
                return action
        return self.NUM_ACTIONS - 1

    def _action_utilities(self, opponent_action):
        """Return the payoff of each own action against ``opponent_action``."""
        n = self.NUM_ACTIONS
        utility = [0] * n                         # tie when both actions match
        utility[(opponent_action + 1) % n] = 1    # the next action wins
        utility[(opponent_action - 1) % n] = -1   # the previous action loses
        return utility

    def _accumulate_regret(self, player, own_action, opponent_action):
        """Add this round's regret for every action to ``regret_sum[player]``."""
        utility = self._action_utilities(opponent_action)
        regrets = self.regret_sum[player]
        for a in range(self.NUM_ACTIONS):
            regrets[a] += utility[a] - utility[own_action]

    def train(self, iterations):
        """Run regret-matching self-play.

        The loop body runs ``iterations // 2`` times. Each pass first updates
        player 0 against player 1's current strategy, then updates player 1
        against player 0's current strategy.

        Args:
            iterations: Total number of single-player updates to perform.
        """
        for _ in range(iterations // 2):
            # Update player 0 while player 1's strategy is held fixed.
            self.get_strategy(0)
            action_a = self.get_action(self.strategy[0])
            action_b = self.get_action(self.strategy[1])
            self._accumulate_regret(0, action_a, action_b)

            # Update player 1 while player 0's strategy is held fixed.
            self.get_strategy(1)
            action_b = self.get_action(self.strategy[1])
            action_a = self.get_action(self.strategy[0])
            self._accumulate_regret(1, action_b, action_a)

    def get_average_strategy(self, player):
        """Return ``player``'s average mixed strategy across all training.

        Args:
            player: Player index (0 or 1).

        Returns:
            ``strategy_sum[player]`` normalised to sum to 1, or the uniform
            strategy when nothing has been accumulated yet.
        """
        normalizing_sum = sum(self.strategy_sum[player])
        if normalizing_sum > 0:
            return [s / normalizing_sum for s in self.strategy_sum[player]]
        return [1.0 / self.NUM_ACTIONS] * self.NUM_ACTIONS
    



In [5]:
# Run regret-matching self-play, then print both players' average strategies
# (player 0 first) on a single line.
trainer = RPSTrainer()
trainer.train(10 ** 6)
print(*(trainer.get_average_strategy(player) for player in (0, 1)))

[0.3343793114012657, 0.3326342969526716, 0.3329863916460627] [0.33158137778846397, 0.33416248094573864, 0.33425614126579734]


In [17]:
class FixedTrainer():
    """Regret-matching learner for rock-paper-scissors against a fixed opponent.

    Actions are indexed 0, 1, 2, and action ``(k + 1) % 3`` beats action ``k``.
    The opponent always samples from ``op_strategy``; only the learner's
    regrets and strategy are updated.

    Attributes are 3-element lists:

    * ``regret_sum``   -- cumulative regret of each action.
    * ``strategy``     -- current mixed strategy: the positive part of
      ``regret_sum`` normalised to sum to 1, or uniform when no regret is
      positive.
    * ``strategy_sum`` -- sum of every strategy produced by ``get_strategy``;
      its total equals the number of ``get_strategy`` calls.
    * ``op_strategy``  -- the opponent's fixed mixed strategy.
    """

    NUM_ACTIONS = 3
    DEFAULT_OP_STRATEGY = (0.4, 0.3, 0.3)

    def __init__(self, op_strategy=None):
        """Create a learner with zero regrets.

        Args:
            op_strategy: Optional sequence of 3 non-negative probabilities
                summing to 1 that the opponent plays. Defaults to
                ``[0.4, 0.3, 0.3]``.

        Raises:
            ValueError: If ``op_strategy`` does not have exactly 3 entries,
                has a negative entry, or does not sum to 1.
        """
        n = self.NUM_ACTIONS
        if op_strategy is None:
            op_strategy = self.DEFAULT_OP_STRATEGY
        op_strategy = list(op_strategy)  # copy so the caller's object is not shared
        if len(op_strategy) != n:
            raise ValueError("op_strategy must have exactly %d entries" % n)
        if any(p < 0 for p in op_strategy) or abs(sum(op_strategy) - 1.0) > 1e-9:
            raise ValueError("op_strategy must be non-negative and sum to 1")

        self.regret_sum = [0] * n
        # Uniform until the first get_strategy() call recomputes it.
        self.strategy = [1.0 / n] * n
        self.strategy_sum = [0] * n
        self.op_strategy = op_strategy

    def get_strategy(self):
        """Recompute the learner's mixed strategy through regret matching.

        The new strategy is stored in ``self.strategy`` and added to
        ``self.strategy_sum``.

        Returns:
            The current strategy as a list of 3 probabilities.
        """
        positive_regret = [r if r > 0 else 0 for r in self.regret_sum]
        normalizing_sum = sum(positive_regret)
        if normalizing_sum > 0:
            self.strategy = [r / normalizing_sum for r in positive_regret]
        else:
            # No action has positive regret: fall back to uniform play.
            self.strategy = [1.0 / self.NUM_ACTIONS] * self.NUM_ACTIONS
        self.strategy_sum = [
            total + p for total, p in zip(self.strategy_sum, self.strategy)
        ]
        return self.strategy

    def get_action(self, strategy):
        """Sample an action index according to a mixed strategy.

        Args:
            strategy: Sequence of 3 probabilities.

        Returns:
            The sampled action index. The last action is returned whenever the
            random draw is not below the cumulative probability of the
            preceding actions.
        """
        r = np.random.rand()
        cumulative_probability = 0
        for action in range(self.NUM_ACTIONS - 1):
            cumulative_probability += strategy[action]
            if r < cumulative_probability:
                return action
        return self.NUM_ACTIONS - 1

    def train(self, iterations):
        """Play ``iterations`` rounds against the fixed opponent.

        Each round recomputes the learner's strategy, samples one action for
        each side, and accumulates the learner's regret for every action.

        Args:
            iterations: Number of rounds to play.
        """
        n = self.NUM_ACTIONS
        for _ in range(iterations):
            self.get_strategy()
            action_a = self.get_action(self.strategy)
            action_b = self.get_action(self.op_strategy)

            # Payoff of each of our actions against the opponent's action.
            action_utility = [0] * n              # tie when both actions match
            action_utility[(action_b + 1) % n] = 1    # the next action wins
            action_utility[(action_b - 1) % n] = -1   # the previous action loses

            # Regret: how much better each action would have done than ours.
            for a in range(n):
                self.regret_sum[a] += action_utility[a] - action_utility[action_a]

    def get_average_strategy(self):
        """Return the learner's average mixed strategy across all training.

        Returns:
            ``strategy_sum`` normalised to sum to 1, or the uniform strategy
            when nothing has been accumulated yet.
        """
        normalizing_sum = sum(self.strategy_sum)
        if normalizing_sum > 0:
            return [s / normalizing_sum for s in self.strategy_sum]
        return [1.0 / self.NUM_ACTIONS] * self.NUM_ACTIONS
    



In [21]:
# Train the regret-matching learner against the fixed opponent strategy and
# print the average strategy it ends up with.
fixed_trainer = FixedTrainer()
fixed_trainer.train(10 ** 5)
print(fixed_trainer.get_average_strategy())

[0.0064746897049004284, 0.9934762203480096, 4.908994708994707e-05]
