### 2 Period Bilateral Bargain

In [96]:
import numpy as np

# Buyer realizes value 
value_buyer = 1
    
# Parameters
avg_value_seller = 0.3
std_value_seller = 0.2

# Hyperparameters
num_actions = 11
num_episodes = 100000
alpha = 0.1  # Learning rate
gamma = 1  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99995  # Decay factor for epsilon
min_epsilon = 0.01

# Arrays
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Q-table initialization
num_states = 2
q_table_period1 = np.zeros((num_actions))
q_table_period2 = np.zeros((num_states, num_actions))

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]
        
    # Period 1
    # Buyer action
    if np.random.uniform(0, 1) < epsilon:
        action1 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action1 = np.argmax(q_table_period1)  # Exploitation: Choose best action based on Q-values
    bid1 = action2bid[action1]  # Convert action index to bid value
    
    # Seller action
    if bid1 >= value_seller:
        reward1 = value_buyer - bid1  # Calculate reward based on bid and buyer's value
        state = 1 # Offer accepted
    else:
        reward1 = 0 # No reward if bid is below seller's value
        state = 0 # Offer rejected
    
    # Period 2
    if np.random.uniform(0, 1) < epsilon:
        action2 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action2 = np.argmax(q_table_period2[state])  # Exploitation: Choose best action based on Q-values
    bid2 = action2bid[action2]  # Convert action index to bid value
    
    if state == 0:
        if bid2 >= value_seller:
            reward2 = value_buyer - bid2  # Calculate reward based on bid and buyer's value
        else:
            reward2 = 0 # No reward if bid is below seller's value
    else:
        reward2 = 0
    
    # Q-value update using Q-learning equation
    q_table_period2[state][action2] = q_table_period2[state][action2] + alpha * (reward2 - q_table_period2[state][action2])
    q_table_period1[action1] = q_table_period1[action1] + alpha * (reward1 + gamma * np.max(q_table_period2[state]) - q_table_period1[action1])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 10) == 0:
        print("SelVal:", round(value_seller, 2), "BuyVal:", round(value_buyer, 2), "Bid1:", round(bid1, 2), "Reward1:", round(reward1, 2),"State:",state,"Bid2:", round(bid2, 2), "Reward2:", round(reward2, 2), "Epsilon:", round(epsilon, 2))     

SelVal: 0.26 BuyVal: 1 Bid1: 0.3 Reward1: 0.7 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.99
SelVal: 0.63 BuyVal: 1 Bid1: 0.2 Reward1: 0 State: 0 Bid2: 0.5 Reward2: 0 Epsilon: 0.6
SelVal: 0.45 BuyVal: 1 Bid1: 0.7 Reward1: 0.3 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.36
SelVal: 0.13 BuyVal: 1 Bid1: 0.4 Reward1: 0.6 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.22
SelVal: 0.03 BuyVal: 1 Bid1: 0.5 Reward1: 0.5 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.13
SelVal: 0.49 BuyVal: 1 Bid1: 0.4 Reward1: 0 State: 0 Bid2: 0.7 Reward2: 0.3 Epsilon: 0.08
SelVal: 0.14 BuyVal: 1 Bid1: 0.4 Reward1: 0.6 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.05
SelVal: 0.13 BuyVal: 1 Bid1: 0.5 Reward1: 0.5 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.03
SelVal: -0.02 BuyVal: 1 Bid1: 0.4 Reward1: 0.6 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.02
SelVal: 0.3 BuyVal: 1 Bid1: 0.4 Reward1: 0.6 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.01


In [97]:
print(np.argmax(q_table_period1), np.argmax(q_table_period2[0]))

4 7


In [98]:
print(np.max(q_table_period1), np.max(q_table_period2[0]))

0.5002087682872619 0.2793795973791373


### 3 Period Bilateral Bargain

In [99]:
import numpy as np

# Buyer realizes value 
value_buyer = 1
    
# Parameters
avg_value_seller = 0.5
std_value_seller = 0.3

# Hyperparameters
num_actions = 11
num_episodes = 50000
alpha = 0.1  # Learning rate
gamma = 1  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99995  # Decay factor for epsilon
min_epsilon = 0.01

# Divide the buyer value range into 11 divisions
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)

# Q-table initialization
num_states = 2
q_table_period1 = np.zeros((num_actions))
q_table_period2 = np.zeros((num_states, num_actions))
q_table_period3 = np.zeros((num_states, num_actions))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]
        
    # Period 1
    # Buyer action
    if np.random.uniform(0, 1) < epsilon:
        action1 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action1 = np.argmax(q_table_period1)  # Exploitation: Choose best action based on Q-values
    bid1 = action2bid[action1]  # Convert action index to bid value
    
    # Seller action
    if bid1 >= value_seller:
        reward1 = value_buyer - bid1  # Calculate reward based on bid and buyer's value
        state1 = 1 # Offer accepted in period 1
    else:
        reward1 = 0 # No reward if bid is below seller's value
        state1 = 0 # Offer rejected in period 1
    
    # Period 2
    if np.random.uniform(0, 1) < epsilon:
        action2 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action2 = np.argmax(q_table_period2[state1])  # Exploitation: Choose best action based on Q-values
    bid2 = action2bid[action2]  # Convert action index to bid value
    
    if state1 == 0:
        if bid2 >= value_seller:
            reward2 = value_buyer - bid2  # Calculate reward based on bid and buyer's value
            state2 = 1 # Offer accepted in second period
        else:
            reward2 = 0 # No reward if bid is below seller's value
            state2 = 0 # Offer rejected in second period
    else:
        reward2 = 0
        state2 = 1 # Offer already accepted

    # Period 3
    if np.random.uniform(0, 1) < epsilon:
        action3 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action3 = np.argmax(q_table_period3[state2])  # Exploitation: Choose best action based on Q-values
    bid3 = action2bid[action3]  # Convert action index to bid value
    
    if state2 == 0:
        if bid3 >= value_seller:
            reward3 = value_buyer - bid3  # Calculate reward based on bid and buyer's value
        else:
            reward3 = 0 # No reward if bid is below seller's value
    else:
        reward3 = 0
    
    # Q-value update using Q-learning equation
    q_table_period3[state2][action3] += alpha * (reward3                                           - q_table_period3[state2][action3])
    q_table_period2[state1][action2] += alpha * (reward2 + gamma * np.max(q_table_period3[state2]) - q_table_period2[state1][action2])
    q_table_period1[action1]         += alpha * (reward1 + gamma * np.max(q_table_period2[state1]) - q_table_period1[action1])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 20) == 0:
        print("SelVal:", round(value_seller, 2), "BuyVal:", round(value_buyer, 2), "B1:", round(bid1, 2), "R1:", round(reward1, 2),"S1:",state1,"B2:", round(bid2, 2), "R2:", round(reward2, 2), "S2:",state1,"B3:", round(bid3, 2), "R3:", round(reward3, 2),"Eps:", round(epsilon, 2))

SelVal: 0.16 BuyVal: 1 B1: 0.4 R1: 0.6 S1: 1 B2: 0.2 R2: 0 S2: 1 B3: 1.0 R3: 0 Eps: 0.99
SelVal: -0.02 BuyVal: 1 B1: 0.1 R1: 0.9 S1: 1 B2: 0.1 R2: 0 S2: 1 B3: 0.1 R3: 0 Eps: 0.87
SelVal: 0.71 BuyVal: 1 B1: 0.3 R1: 0 S1: 0 B2: 0.9 R2: 0.1 S2: 0 B3: 1.0 R3: 0 Eps: 0.77
SelVal: 0.66 BuyVal: 1 B1: 1.0 R1: 0.0 S1: 1 B2: 1.0 R2: 0 S2: 1 B3: 0.3 R3: 0 Eps: 0.68
SelVal: 0.38 BuyVal: 1 B1: 0.5 R1: 0.5 S1: 1 B2: 0.1 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.6
SelVal: 0.32 BuyVal: 1 B1: 0.4 R1: 0.6 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.53
SelVal: 0.52 BuyVal: 1 B1: 0.4 R1: 0 S1: 0 B2: 0.7 R2: 0.3 S2: 0 B3: 0.0 R3: 0 Eps: 0.47
SelVal: 0.21 BuyVal: 1 B1: 0.1 R1: 0 S1: 0 B2: 0.9 R2: 0.1 S2: 0 B3: 0.0 R3: 0 Eps: 0.41
SelVal: 0.76 BuyVal: 1 B1: 0.5 R1: 0 S1: 0 B2: 0.6 R2: 0 S2: 0 B3: 0.7 R3: 0 Eps: 0.36
SelVal: 0.93 BuyVal: 1 B1: 0.8 R1: 0 S1: 0 B2: 0.6 R2: 0 S2: 0 B3: 0.8 R3: 0 Eps: 0.32
SelVal: 0.54 BuyVal: 1 B1: 0.8 R1: 0.2 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.28
SelVal: 0.82 BuyVal: 1 B1

In [100]:
print(np.argmax(q_table_period1), np.argmax(q_table_period2[0]), np.argmax(q_table_period3[0]))

4 8 9


In [101]:
print(np.max(q_table_period1), np.max(q_table_period2[0]), np.max(q_table_period3[0]))

0.4788515421766211 0.1690272551549997 0.07061715680066886


### K-Period Bilateral Bargain

In [113]:
import numpy as np

# Parameters
value_buyer = 1
avg_value_seller = 0.5
std_value_seller = 0.4
num_periods = 4

# Hyperparameters
num_actions = 101
num_states = 2
num_episodes = 100000
#num_episodes = 1
alpha = 0.1  # Learning rate
gamma = 1  # Full discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99995  # Decay factor for epsilon
min_epsilon = 0.01

# Q-table and Arrays
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value
q_table = np.zeros((num_periods, num_states, num_actions))

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]
        
    # History in one full game
    actions = []
    states = [0]
    rewards = []
    current_action = 0
    
    for period in range(num_periods):
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(current_action, num_actions)  # Exploration: Random action
        else:
            action = np.argmax(q_table[period][states[period]])  # Exploitation: Choose best action based on Q-values
        bid = action2bid[action]  # Convert action index to bid value

        # Seller action
        if states[period] == 0:
            if bid >= value_seller:
                reward = value_buyer - bid  # Calculate reward based on bid and buyer's value
                next_state = 1 # Offer accepted
            else:
                reward = 0 # No reward if bid is below seller's value
                next_state = 0 # Offer rejected
        else:
            reward = 0
            next_state = 1 # Offer already accepted

        actions.append(action)
        states.append(next_state)
        rewards.append(reward)
        current_action = action

    for period in range(num_periods-1,-1,-1):
        if period == num_periods-1:
            q_table[period][states[period]][actions[period]] += alpha * (rewards[period] - q_table[period,states[period],actions[period]])
        else:
            q_table[period][states[period]][actions[period]] += alpha * (rewards[period] + gamma * np.max(q_table[period+1,states[period+1]]) - q_table[period,states[period],actions[period]])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 20) == 0:
        print("SelVal:", round(value_seller, 2), "BuyVal:", round(value_buyer, 2), "Actions:", actions ,"States:",states,"Eps:", round(epsilon, 2))

SelVal: 0.15 BuyVal: 1 Actions: [7, 15, 50, 98] States: [0, 0, 1, 1, 1] Eps: 0.99
SelVal: 0.36 BuyVal: 1 Actions: [10, 17, 80, 96] States: [0, 0, 0, 1, 1] Eps: 0.77
SelVal: 0.79 BuyVal: 1 Actions: [30, 82, 97, 0] States: [0, 0, 1, 1, 1] Eps: 0.6
SelVal: 0.78 BuyVal: 1 Actions: [72, 59, 61, 78] States: [0, 0, 0, 0, 0] Eps: 0.47
SelVal: 1.05 BuyVal: 1 Actions: [37, 61, 80, 86] States: [0, 0, 0, 0, 0] Eps: 0.36
SelVal: 0.35 BuyVal: 1 Actions: [49, 0, 47, 80] States: [0, 1, 1, 1, 1] Eps: 0.28
SelVal: 0.97 BuyVal: 1 Actions: [60, 54, 71, 78] States: [0, 0, 0, 0, 0] Eps: 0.22
SelVal: 0.55 BuyVal: 1 Actions: [16, 61, 92, 0] States: [0, 0, 1, 1, 1] Eps: 0.17
SelVal: 0.42 BuyVal: 1 Actions: [64, 0, 0, 0] States: [0, 1, 1, 1, 1] Eps: 0.13
SelVal: 0.76 BuyVal: 1 Actions: [45, 86, 0, 44] States: [0, 0, 1, 1, 1] Eps: 0.1
SelVal: 0.33 BuyVal: 1 Actions: [13, 66, 0, 0] States: [0, 0, 1, 1, 1] Eps: 0.08
SelVal: 0.44 BuyVal: 1 Actions: [22, 63, 0, 0] States: [0, 0, 1, 1, 1] Eps: 0.06
SelVal: 0.94 BuyVa

In [114]:
for period in range(num_periods):
    print(np.argmax(q_table[period,0]))

57
73
88
99


In [115]:
for period in range(num_periods):
    print(np.max(q_table[period,0]))

0.31779142688720213
0.13285562693520528
0.05220838771594758
0.003987678896455829
