In [66]:
import random
import numpy as np

In [161]:
# Kuhn Trainer
class KuhnTrainer():
    """Counterfactual regret minimization (CFR) trainer for Kuhn poker.

    The deck is the three cards ``[1, 2, 3]``; player 0 holds ``cards[0]`` and
    player 1 holds ``cards[1]``.  A history is a string of ``"p"`` (pass) and
    ``"b"`` (bet) characters, and an information set key is the acting
    player's card followed by the history, e.g. ``"2pb"``.
    """

    def __init__(self):
        # Kuhn poker definitions
        self.num = 2        # not read inside this class; presumably the action count -- TODO confirm
        self.node_map = {}  # information set key -> Node

    # information set node class definition
    class Node:
        """Regret and strategy accumulators for a single information set."""

        def __init__(self):
            # Kuhn node definitions
            self.infoset = ""
            self.regret_sum = [0, 0]
            self.strategy = [0, 0]       # sums to 1 after get_strategy has run
            self.strategy_sum = [0, 0]
            self.utility_sum = 0         # added: node utility weighted by p0 * p1 (see cfr)

        # get current information set mixed strategy through regret-matching
        def get_strategy(self, realization_weight):
            """Return the current [pass, bet] strategy from regret matching.

            Positive cumulative regrets are normalised into probabilities; if
            no regret is positive the strategy is uniform.  The result is also
            added to ``strategy_sum`` scaled by ``realization_weight``.

            The returned list is ``self.strategy`` itself, not a copy.
            """
            normalizing_sum = 0
            for a in range(2):
                self.strategy[a] = max(self.regret_sum[a], 0)
                normalizing_sum += self.strategy[a]
            for a in range(2):
                if normalizing_sum > 0:
                    self.strategy[a] /= normalizing_sum
                else:
                    self.strategy[a] = 1.0 / 2
                self.strategy_sum[a] += realization_weight * self.strategy[a]
            return self.strategy

        # get average information set mixed strategy across all training iterations
        def get_average_strategy(self):
            """Return ``strategy_sum`` normalised to sum to 1 (uniform if it is all zero)."""
            avg_strategy = [0, 0]
            normalizing_sum = sum(self.strategy_sum)
            for a in range(2):
                if normalizing_sum > 0:
                    avg_strategy[a] = self.strategy_sum[a] / normalizing_sum
                else:
                    avg_strategy[a] = 1.0 / 2
            return avg_strategy    # sums to 1

        # get information set string representation
        def to_string(self):
            """Format average strategy, cumulative regrets and utility sum as text.

            Side effect: appends ``[avg_pass, avg_bet]`` to the module-level
            list ``data``, which must already exist when this is called.
            """
            a = self.get_average_strategy()
            r = self.regret_sum
            u = self.utility_sum
            data.append([a[0], a[1]])
            return f'{a[0]:.3f}, {a[1]:.3f}, {r[0]:.3f}, {r[1]:.3f}, {u:.3f}'

    # train kuhn poker
    def train(self, iterations):
        """Run ``iterations`` CFR passes over freshly shuffled deals.

        Prints the average game value, then calls ``to_string`` on every node
        in sorted key order so that each node records its average strategy in
        the module-level ``data`` list.
        """
        cards = [1, 2, 3]
        util = 0
        for i in range(iterations):
            random.shuffle(cards)  # shuffle cards
            self.t = i             # index of the iteration currently running
            util += self.cfr(cards, "", 1, 1)
        print(f"Average game value: {(util / iterations):.3f}")
        for k in sorted(self.node_map.keys()):
            # Called for its side effect on `data`; the returned text is unused.
            self.node_map[k].to_string()

    # counterfactual regret minimization iteration
    def cfr(self, cards, history, p0, p1):
        """Run one recursive CFR step and return the acting player's utility.

        Args:
            cards: the deal; index 0 is player 0's card, index 1 is player 1's.
            history: actions taken so far, as a string of "p" and "b".
            p0: player 0's probability of playing to this history.
            p1: player 1's probability of playing to this history.

        Returns:
            The terminal payoff, or the strategy-weighted utility of this
            node, from the point of view of the player whose turn it is.
        """
        plays = len(history)
        player = plays % 2
        opponent = 1 - player
        infoset = str(cards[player]) + history

        # return pay-off for terminal states
        if plays > 1:
            terminal_pass = history.endswith("p")
            double_bet = history.endswith("bb")
            is_player_card_higher = cards[player] > cards[opponent]
            if terminal_pass:
                if history == "pp":
                    return 1 if is_player_card_higher else -1
                else:
                    # A pass after a bet: `player` here is the one who bet.
                    return 1
            elif double_bet:
                return 2 if is_player_card_higher else -2
            # Otherwise the history is "pb" and play continues below.

        # get information set node or create it if nonexistent
        node = self.node_map.get(infoset)
        if node is None:
            node = self.Node()
            node.infoset = infoset
            self.node_map[infoset] = node

        # for each action, recursively call cfr with additional history and probability
        strategy = node.get_strategy(p0 if player == 0 else p1)
        util = [0, 0]
        node_util = 0
        for a in range(2):
            next_history = history + ("p" if a == 0 else "b")
            q0 = p0 * strategy[a] if player == 0 else p0
            q1 = p1 * strategy[a] if player == 1 else p1
            # The child value is from the other player's view, hence the sign flip.
            util[a] = -self.cfr(cards, next_history, q0, q1)
            node_util += strategy[a] * util[a]

        # for each action, compute and accumulate counterfactual regret
        for a in range(2):
            regret = util[a] - node_util
            # Weighted by the opponent's reach probability.
            node.regret_sum[a] += (p1 if player == 0 else p0) * regret
        node.utility_sum += node_util * p0 * p1

        return node_util

    # Kuhn Trainer main method
    def main(self, iterations):
        """Train this trainer for ``iterations`` iterations.

        Trains ``self`` so that ``self.node_map`` holds the learned nodes
        afterwards, instead of training a throwaway second instance.
        """
        self.train(iterations)

In [47]:
# Reference [pass, bet] strategies, one row per entry of `data`.
# alpha is taken from the learned bet probability in the first row of `data`.
alpha = data[0][1]
nash = [
    [1-alpha, alpha],
    [1, 0],
    [2.0/3, 1.0/3],
    [1, 0],
    [1, 0],
    [2.0/3, 1.0/3],
    [1, 0],
    [2.0/3-alpha, alpha+1.0/3],
    [1-3*alpha, 3*alpha],
    [0, 1],
    [0, 1],
    [0, 1],
]

In [184]:
# Collects one squared-error total per training run.
list10 = list()

In [185]:
# Train 30 independent trainers and record, for each run, the summed squared
# difference between the learned average strategies and the `nash` reference.
for _ in range(30):
    data = []                 # refilled by Node.to_string during training
    hello = KuhnTrainer()
    hello.main(100000)

    dif = 0
    for i, learned in enumerate(data):
        for j in range(2):
            gap = learned[j] - nash[i][j]
            dif += gap ** 2
    list10.append(dif)

Average game value: -0.055
Average game value: -0.059
Average game value: -0.052
Average game value: -0.060
Average game value: -0.061
Average game value: -0.052
Average game value: -0.058
Average game value: -0.060
Average game value: -0.055
Average game value: -0.060
Average game value: -0.054
Average game value: -0.058
Average game value: -0.053
Average game value: -0.059
Average game value: -0.061
Average game value: -0.054
Average game value: -0.056
Average game value: -0.051
Average game value: -0.051
Average game value: -0.053
Average game value: -0.061
Average game value: -0.061
Average game value: -0.046
Average game value: -0.060
Average game value: -0.055
Average game value: -0.053
Average game value: -0.057
Average game value: -0.055
Average game value: -0.051
Average game value: -0.057


In [186]:
# Report how many squared-error totals were collected in list10 and their mean.
print(f'{len(list10)=}, {np.mean(list10)=}')

len(list10)=30, np.mean(list10)=0.08837097497153715


In [173]:
# Sanity check: show the number of collected strategy rows next to the number of reference rows.
len(data), len(nash)

(12, 12)

In [174]:
# Dump the collected strategies and the reference table for inspection.
print(f'{data=}, {nash=}')

data=[[0.16666666666666666, 0.8333333333333334], [0.5, 0.5], [0.5, 0.5], [0.625, 0.375], [0.5833333333333334, 0.4166666666666667], [0.27777777777777773, 0.7222222222222222], [0.08333333333333333, 0.9166666666666666], [0.7380952380952381, 0.2619047619047619], [0.16666666666666666, 0.8333333333333334], [0.16666666666666666, 0.8333333333333334], [0.3333333333333333, 0.6666666666666666], [0.5, 0.5]], nash=[[0.7991363804953773, 0.20086361950462267], [1, 0], [0.6666666666666666, 0.3333333333333333], [1, 0], [1, 0], [0.6666666666666666, 0.3333333333333333], [1, 0], [0.46580304716204396, 0.534196952837956], [0.39740914148613204, 0.602590858513868], [0, 1], [0, 1], [0, 1]]


In [137]:
# Summed squared difference between `data` and `nash`, appended to list10.
dif = 0

for i, learned in enumerate(data):
    for j in range(2):
        gap = learned[j] - nash[i][j]
        dif += gap ** 2
list10.append(dif)

In [135]:
alpha = 1.0/3
# Nash equilibrium solution: information set key -> [pass, bet] probabilities.
# Keys are grouped by the card held.
equilibrium = dict([
    # card 1
    ('1', [1-alpha, alpha]),
    ('1b', [1, 0]),
    ('1p', [2/3, 1/3]),
    ('1pb', [1, 0]),
    # card 2
    ('2', [1, 0]),
    ('2b', [2/3, 1/3]),
    ('2p', [1, 0]),
    ('2pb', [2/3-alpha, alpha+1/3]),
    # card 3
    ('3', [1-3*alpha, 3*alpha]),
    ('3b', [0, 1]),
    ('3p', [0, 1]),
    ('3pb', [0, 1]),
])

In [None]:
# Kuhn poker definitions
# Action indices: 0 means pass, 1 means bet.
Pass, Bet = 0, 1
num = 2             # presumably the number of actions -- TODO confirm
node_map = dict()   # information set key -> node

In [None]:
# Kuhn node definitions
# Three independent two-slot accumulators (one slot per action).
# `strategy` sums to 1 once get_strategy has filled it.
regret_sum, strategy, strategy_sum = ([0, 0] for _ in range(3))

In [None]:
# get current information set mixed strategy through regret-matching
def get_strategy(realization_weight):
    """Regret-match the global accumulators into the current mixed strategy.

    Overwrites the global ``strategy`` list in place, adds it (scaled by
    ``realization_weight``) to the global ``strategy_sum``, and returns the
    ``strategy`` list itself.
    """
    # Keep only the positive cumulative regrets and total them.
    total_positive = 0
    for action in range(2):
        regret = regret_sum[action]
        strategy[action] = regret if regret > 0 else 0
        total_positive += strategy[action]

    # Normalise, or fall back to a uniform strategy when nothing is positive.
    has_positive_regret = total_positive > 0
    for action in range(2):
        if has_positive_regret:
            strategy[action] = strategy[action] / total_positive
        else:
            strategy[action] = 1.0 / 2
        strategy_sum[action] += realization_weight * strategy[action]
    return strategy

In [None]:
# get average information set mixed strategy across all training iterations
def get_average_strategy():
    """Return the global ``strategy_sum`` normalised to sum to 1.

    When the accumulated total is not positive, a uniform strategy is returned.
    """
    total = sum(strategy_sum)
    if total > 0:
        return [strategy_sum[action] / total for action in range(2)]
    return [1.0 / 2 for _ in range(2)]

In [None]:
# information set node class definition
class Node():
    """Regret and strategy accumulators for a single information set.

    All methods take ``self``; without it the class could not be instantiated
    or called on an instance.
    """

    def __init__(self):
        # Kuhn node definitions
        self.infoset = ""
        self.regret_sum = [0, 0]
        self.strategy = [0, 0]       # sums to 1 after get_strategy has run
        self.strategy_sum = [0, 0]

    # get current information set mixed strategy through regret-matching
    def get_strategy(self, realization_weight):
        """Return the current [pass, bet] strategy from regret matching.

        Positive cumulative regrets are normalised into probabilities; if none
        is positive the strategy is uniform.  The result is accumulated into
        ``strategy_sum`` scaled by ``realization_weight``.  The returned list
        is ``self.strategy`` itself, not a copy.
        """
        normalizing_sum = 0
        for a in range(2):
            self.strategy[a] = self.regret_sum[a] if self.regret_sum[a] > 0 else 0
            normalizing_sum += self.strategy[a]
        for a in range(2):
            if normalizing_sum > 0:
                self.strategy[a] /= normalizing_sum
            else:
                self.strategy[a] = 1.0 / 2
            self.strategy_sum[a] += realization_weight * self.strategy[a]
        return self.strategy

    # get average information set mixed strategy across all training iterations
    def get_average_strategy(self):
        """Return ``strategy_sum`` normalised to sum to 1 (uniform if it is all zero)."""
        avg_strategy = [0, 0]
        normalizing_sum = sum(self.strategy_sum)
        for a in range(2):
            if normalizing_sum > 0:
                avg_strategy[a] = self.strategy_sum[a] / normalizing_sum
            else:
                avg_strategy[a] = 1.0 / 2
        return avg_strategy

    # get information set string representation
    def to_string(self):
        """Return the key, average strategy and cumulative regrets as text."""
        a = self.get_average_strategy()
        r = self.regret_sum
        return f'{self.infoset:5} {a[0]:.3f}, {a[1]:.3f}, {r[0]:.3f}, {r[1]:.3f}'

    def __str__(self):
        # Lets print(node) show the same text as to_string().
        return self.to_string()

In [None]:
# train kuhn poker
def train(iterations):
    """Run ``iterations`` CFR passes over shuffled deals and print the results.

    Prints the average game value followed by every node stored in the global
    ``node_map``.

    The previous hand-written shuffle called an unimported ``randint`` and
    indexed ``cards`` with the iteration counter; ``random.shuffle`` replaces
    it.  The final loop printed an undefined name instead of the node.
    """
    cards = [1, 2, 3]
    util = 0
    for _ in range(iterations):
        random.shuffle(cards)  # shuffle cards in place
        util += cfr(cards, "", 1, 1)
    print("Average game value: " + str(util / iterations))
    for node in node_map.values():
        print(node)

In [None]:
# counterfactual regret minimization iteration
def cfr(cards, history, p0, p1):
    """Run one recursive CFR step and return the acting player's utility.

    Args:
        cards: the deal; index 0 is player 0's card, index 1 is player 1's.
        history: actions taken so far, as a string of "p" (pass) and "b" (bet).
        p0: player 0's probability of playing to this history.
        p1: player 1's probability of playing to this history.

    Returns:
        The terminal payoff, or the strategy-weighted utility of this node,
        from the point of view of the player whose turn it is.

    Nodes are looked up in, and added to, the global ``node_map``.
    """
    plays = len(history)
    player = plays % 2
    opponent = 1 - player

    # return pay-off for terminal states
    if plays > 1:
        terminal_pass = history.endswith("p")
        # endswith also catches "pbb", which a fixed [plays-2:2] slice misses.
        double_bet = history.endswith("bb")
        is_player_card_higher = cards[player] > cards[opponent]
        if terminal_pass:
            if history == "pp":
                return 1 if is_player_card_higher else -1
            else:
                return 1
        elif double_bet:
            return 2 if is_player_card_higher else -2

    # Cards are ints, so convert before building the string key.
    infoset = str(cards[player]) + history

    # get information set node or create it if nonexistent
    node = node_map.get(infoset)
    if node is None:
        node = Node()
        node.infoset = infoset
        node_map[infoset] = node

    # for each action, recursively call cfr with additional history and probability
    strategy = node.get_strategy(p0 if player == 0 else p1)
    util = [0, 0]
    node_util = 0
    for a in range(2):
        next_history = history + ("p" if a == 0 else "b")
        if player == 0:
            child_value = cfr(cards, next_history, p0 * strategy[a], p1)
        else:
            child_value = cfr(cards, next_history, p0, p1 * strategy[a])
        # The child value is from the other player's view, hence the sign flip.
        util[a] = -child_value
        node_util += strategy[a] * util[a]

    # for each action, compute and accumulate counterfactual regret
    for a in range(2):
        regret = util[a] - node_util
        # Weighted by the opponent's reach probability.
        node.regret_sum[a] += (p1 if player == 0 else p0) * regret
    return node_util

In [None]:
# Kuhn Trainer main method
def main(iterations=10):
    """Create a KuhnTrainer and train it.

    Args:
        iterations: number of training iterations; defaults to the previously
            hard-coded value of 10.
    """
    KuhnTrainer().train(iterations)

In [None]:
# Scratch check: random.shuffle reorders the list in place.
cards = list(range(1, 4))
random.shuffle(cards)
cards

In [None]:
# Scratch check: store a value in a dict and read it back by key.
dic = {"orange": 1}
dic["orange"]

In [None]:
# Scratch check: does dic.get give None for the key "apple"?
dic.get("apple") is None

In [None]:
# Scratch check: dic.get on the key "orange".
dic.get("orange")

In [None]:
# Scratch check: take the last two characters of a history string.
h = "p" + "b" * 7
h[-2:]

In [None]:
# Scratch check: str.endswith with the suffix "bo" on h.
h.endswith("bo")

In [None]:
# Scratch check: str.endswith on an empty string.
h1 = str()
h1.endswith("b")