### 2-Period Bilateral Bargain

In [64]:
import numpy as np

# ---------------------------------------------------------------------------
# Two-period bargaining: a buyer with a fixed valuation makes up to two
# take-it-or-leave-it bids to a seller whose valuation is drawn once per
# episode from a normal distribution. The seller accepts any bid that is at
# least her valuation. The buyer's bidding policy is learned with tabular
# Q-learning and an epsilon-greedy behaviour policy.
# ---------------------------------------------------------------------------

# Game primitives
value_buyer = 1          # buyer's valuation of the good
avg_value_seller = 0.5   # mean of the seller's valuation
std_value_seller = 0.2   # standard deviation of the seller's valuation

# Learning hyperparameters
num_actions = 11         # number of discrete bid levels
num_episodes = 50000     # number of simulated bargaining games
alpha = 0.1              # learning rate
gamma = 1                # discount factor (1 = period-2 payoff is not discounted)
initial_epsilon = 0.99   # starting exploration probability
epsilon_decay = 0.99995  # multiplicative decay applied after every episode
min_epsilon = 0.01       # floor for the exploration probability

# Grid over [0.5, 1.0]; not referenced anywhere else in this cell.
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)

# Bid grid and Q-tables.
# State 0 = first bid was rejected, state 1 = first bid was accepted.
num_states = 2
action2bid = np.linspace(0, 1, num_actions)  # action index -> bid value
q_table_period1 = np.zeros(num_actions)
q_table_period2 = np.zeros((num_states, num_actions))


def _epsilon_greedy(q_values, explore_prob):
    """Pick a random action with probability `explore_prob`, else the greedy one."""
    if np.random.uniform(0, 1) < explore_prob:
        return np.random.randint(0, num_actions)
    return np.argmax(q_values)


epsilon = initial_epsilon
log_interval = num_episodes / 10  # progress is printed ten times per run

for episode in range(num_episodes):

    # The seller's valuation is drawn once and held fixed for both periods.
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]

    # ---- Period 1: buyer bids, seller accepts iff the bid covers her value
    action1 = _epsilon_greedy(q_table_period1, epsilon)
    bid1 = action2bid[action1]
    accepted1 = bid1 >= value_seller
    state = 1 if accepted1 else 0
    reward1 = value_buyer - bid1 if accepted1 else 0

    # ---- Period 2: a bid is always drawn, but it only pays off if the
    # first bid was rejected (state 0) and the new bid is accepted.
    action2 = _epsilon_greedy(q_table_period2[state], epsilon)
    bid2 = action2bid[action2]
    reward2 = value_buyer - bid2 if (state == 0 and bid2 >= value_seller) else 0

    # ---- Backward Q-learning updates: terminal period first, so that the
    # period-1 target bootstraps from the freshly updated period-2 values.
    q_table_period2[state, action2] += alpha * (reward2 - q_table_period2[state, action2])
    td_target1 = reward1 + gamma * np.max(q_table_period2[state])
    q_table_period1[action1] += alpha * (td_target1 - q_table_period1[action1])

    # Anneal exploration, never dropping below the floor.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Periodic progress report (epsilon shown is the already-decayed value).
    if episode % log_interval == 0:
        print("Seller value:", round(value_seller, 2), "Buyer value:", round(value_buyer, 2), "Bid1:", round(bid1, 2), "Reward1:", round(reward1, 2), "State:", state, "Bid2:", round(bid2, 2), "Reward2:", round(reward2, 2), "Epsilon:", round(epsilon, 2))
        
        
        

Seller value: 0.32 Buyer value: 1 Bid1: 0.7 Reward1: 0.3 State: 1 Bid2: 0.8 Reward2: 0 Epsilon: 0.99
Seller value: 1.0 Buyer value: 1 Bid1: 0.5 Reward1: 0 State: 0 Bid2: 0.2 Reward2: 0 Epsilon: 0.77
Seller value: 0.53 Buyer value: 1 Bid1: 0.1 Reward1: 0 State: 0 Bid2: 0.2 Reward2: 0 Epsilon: 0.6
Seller value: 0.6 Buyer value: 1 Bid1: 0.6 Reward1: 0 State: 0 Bid2: 0.8 Reward2: 0.2 Epsilon: 0.47
Seller value: 0.47 Buyer value: 1 Bid1: 0.6 Reward1: 0.4 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.36
Seller value: 1.1 Buyer value: 1 Bid1: 1.0 Reward1: 0 State: 0 Bid2: 0.8 Reward2: 0 Epsilon: 0.28
Seller value: 0.82 Buyer value: 1 Bid1: 0.6 Reward1: 0 State: 0 Bid2: 0.8 Reward2: 0 Epsilon: 0.22
Seller value: 0.88 Buyer value: 1 Bid1: 0.5 Reward1: 0 State: 0 Bid2: 0.8 Reward2: 0 Epsilon: 0.17
Seller value: 0.55 Buyer value: 1 Bid1: 0.0 Reward1: 0 State: 0 Bid2: 0.8 Reward2: 0.2 Epsilon: 0.13
Seller value: 0.42 Buyer value: 1 Bid1: 0.5 Reward1: 0.5 State: 1 Bid2: 0.0 Reward2: 0 Epsilon: 0.1


In [65]:
# Greedy bid index in period 1, and in period 2 after a rejected first bid (state 0).
print(q_table_period1.argmax(), q_table_period2[0].argmax())

6 8


In [66]:
# Largest learned Q-value in period 1, and in period 2 after a rejected first bid (state 0).
print(q_table_period1.max(), q_table_period2[0].max())

0.29235624680617367 0.18568354481950503


### 3-Period Bilateral Bargain

In [70]:
import numpy as np

# Buyer realizes value 
value_buyer = 1
    
# Parameters
avg_value_seller = 0.5
std_value_seller = 0.2

# Hyperparameters
num_actions = 11
num_episodes = 50000
alpha = 0.1  # Learning rate
gamma = 1  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99995  # Decay factor for epsilon
min_epsilon = 0.01

# Divide the buyer value range into 11 divisions
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)

# Q-table initialization
num_states = 2
q_table_period1 = np.zeros((num_actions))
q_table_period2 = np.zeros((num_states, num_actions))
q_table_period3 = np.zeros((num_states, num_actions))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]
        
    # Period 1
    # Buyer action
    if np.random.uniform(0, 1) < epsilon:
        action1 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action1 = np.argmax(q_table_period1)  # Exploitation: Choose best action based on Q-values
    bid1 = action2bid[action1]  # Convert action index to bid value
    
    # Seller action
    if bid1 >= value_seller:
        reward1 = value_buyer - bid1  # Calculate reward based on bid and buyer's value
        state1 = 1 # Offer accepted in period 1
    else:
        reward1 = 0 # No reward if bid is below seller's value
        state1 = 0 # Offer rejected in period 1
    
    # Period 2
    if np.random.uniform(0, 1) < epsilon:
        action2 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action2 = np.argmax(q_table_period2[state1])  # Exploitation: Choose best action based on Q-values
    bid2 = action2bid[action2]  # Convert action index to bid value
    
    if state1 == 0:
        if bid2 >= value_seller:
            reward2 = value_buyer - bid2  # Calculate reward based on bid and buyer's value
            state2 = 1 # Offer accepted in second period
        else:
            reward2 = 0 # No reward if bid is below seller's value
            state2 = 0 # Offer rejected in second period
    else:
        reward2 = 0
        state2 = 1 # Offer already accepted

    # Period 3
    if np.random.uniform(0, 1) < epsilon:
        action3 = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action3 = np.argmax(q_table_period3[state2])  # Exploitation: Choose best action based on Q-values
    bid3 = action2bid[action3]  # Convert action index to bid value
    
    if state2 == 0:
        if bid3 >= value_seller:
            reward3 = value_buyer - bid3  # Calculate reward based on bid and buyer's value
        else:
            reward3 = 0 # No reward if bid is below seller's value
    else:
        reward3 = 0
    
    # Q-value update using Q-learning equation
    q_table_period3[state2][action3] += alpha * (reward3                                           - q_table_period3[state2][action3])
    q_table_period2[state1][action2] += alpha * (reward2 + gamma * np.max(q_table_period3[state2]) - q_table_period2[state1][action2])
    q_table_period1[action1]         += alpha * (reward1 + gamma * np.max(q_table_period2[state1]) - q_table_period1[action1])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 20) == 0:
        print("SelVal:", round(value_seller, 2), "BuyVal:", round(value_buyer, 2), "B1:", round(bid1, 2), "R1:", round(reward1, 2),"S1:",state1,"B2:", round(bid2, 2), "R2:", round(reward2, 2), "S2:",state1,"B3:", round(bid3, 2), "R3:", round(reward3, 2),"Eps:", round(epsilon, 2))

SelVal: 0.04 BuyVal: 1 B1: 0.5 R1: 0.5 S1: 1 B2: 0.4 R2: 0 S2: 1 B3: 0.8 R3: 0 Eps: 0.99
SelVal: 0.92 BuyVal: 1 B1: 1.0 R1: 0.0 S1: 1 B2: 0.5 R2: 0 S2: 1 B3: 0.7 R3: 0 Eps: 0.87
SelVal: 0.74 BuyVal: 1 B1: 0.6 R1: 0 S1: 0 B2: 0.4 R2: 0 S2: 0 B3: 0.0 R3: 0 Eps: 0.77
SelVal: 0.17 BuyVal: 1 B1: 0.6 R1: 0.4 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 1.0 R3: 0 Eps: 0.68
SelVal: 0.46 BuyVal: 1 B1: 0.9 R1: 0.1 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 0.4 R3: 0 Eps: 0.6
SelVal: 0.47 BuyVal: 1 B1: 0.9 R1: 0.1 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.53
SelVal: 0.59 BuyVal: 1 B1: 0.5 R1: 0 S1: 0 B2: 1.0 R2: 0.0 S2: 0 B3: 0.0 R3: 0 Eps: 0.47
SelVal: 0.34 BuyVal: 1 B1: 0.3 R1: 0 S1: 0 B2: 0.6 R2: 0.4 S2: 0 B3: 0.0 R3: 0 Eps: 0.41
SelVal: 0.35 BuyVal: 1 B1: 0.5 R1: 0.5 S1: 1 B2: 1.0 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.36
SelVal: 0.68 BuyVal: 1 B1: 0.5 R1: 0 S1: 0 B2: 0.0 R2: 0 S2: 0 B3: 0.8 R3: 0.2 Eps: 0.32
SelVal: 0.31 BuyVal: 1 B1: 0.8 R1: 0.2 S1: 1 B2: 0.0 R2: 0 S2: 1 B3: 0.0 R3: 0 Eps: 0.28
SelVal: 0.24 BuyVal: 1 B

In [71]:
# Greedy bid index per period, conditional on no bid having been accepted yet (state 0).
print(q_table_period1.argmax(), q_table_period2[0].argmax(), q_table_period3[0].argmax())

6 8 9


In [72]:
# Largest learned Q-value per period, conditional on no bid having been accepted yet (state 0).
print(q_table_period1.max(), q_table_period2[0].max(), q_table_period3[0].max())

0.34198938985227545 0.1630077112533948 0.08266643865148145


### K-Period Bilateral Bargain

In [100]:
import numpy as np

# Buyer realizes value 
value_buyer = 1
    
# Parameters
avg_value_seller = 0.8
std_value_seller = 0.2

# Hyperparameters
num_periods = 1
num_actions = 6
num_episodes = 100000
alpha = 0.1  # Learning rate
gamma = 1  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99995  # Decay factor for epsilon
min_epsilon = 0.01

# Divide the buyer value range into 11 divisions
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)

# Q-table initialization
num_states = 2
q_table = np.zeros((num_periods, num_states, num_actions))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Training loop
epsilon = initial_epsilon
period0 = 1
state0 = 0
for episode in range(num_episodes):
    
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller, 1)[0]
        
    # History in one full game
    actions = []
    states = [0]
    rewards = []
    
    for period in range(num_periods):
            
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(0, num_actions)  # Exploration: Random action
        else:
            action = np.argmax(q_table[period,states[period]])  # Exploitation: Choose best action based on Q-values
        bid = action2bid[action]  # Convert action index to bid value

        # Seller action
        if states[period] == 0:
            if bid >= value_seller:
                reward = value_buyer - bid  # Calculate reward based on bid and buyer's value
                next_state = 1 # Offer accepted in period 1
            else:
                reward = 0 # No reward if bid is below seller's value
                next_state = 0 # Offer rejected in period 1
        else:
            reward = 0
            next_state = 1

        actions.append(action)
        states.append(next_state)
        rewards.append(reward)

    for period in range(num_periods-1,-1,-1):
        if period == num_periods-1:
            q_table[period][states[period-1]] += alpha * (rewards[period] - q_table[period][states[period-1]])
        else:
            q_table[period][states[period-1]] += alpha * (rewards[period] + gamma * q_table[period+1][states[period]] - q_table[period][states[period-1]])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 20) == 0:
        print("SelVal:", round(value_seller, 2), "BuyVal:", round(value_buyer, 2), "Actions:", actions ,"States:",states,"Eps:", round(epsilon, 2))

SelVal: 1.1 BuyVal: 1 Actions: [4] States: [0, 0] Eps: 0.99
SelVal: 1.12 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.77
SelVal: 0.94 BuyVal: 1 Actions: [5] States: [0, 1] Eps: 0.6
SelVal: 0.55 BuyVal: 1 Actions: [1] States: [0, 0] Eps: 0.47
SelVal: 0.68 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.36
SelVal: 1.0 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.28
SelVal: 0.84 BuyVal: 1 Actions: [3] States: [0, 0] Eps: 0.22
SelVal: 0.69 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.17
SelVal: 1.16 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.13
SelVal: 0.59 BuyVal: 1 Actions: [4] States: [0, 1] Eps: 0.1
SelVal: 0.77 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.08
SelVal: 0.68 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.06
SelVal: 0.71 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.05
SelVal: 1.2 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.04
SelVal: 0.67 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.03
SelVal: 0.69 BuyVal: 1 Actions: [0] States: [0, 0] Eps: 0.02
SelVal: 1.14 BuyVal: 1 Action

In [101]:
# Greedy bid index for each period, conditional on no bid having been accepted yet (state 0).
for period in range(num_periods):
    print(q_table[period][0].argmax())

0


In [5]:
# NOTE(review): `torch` and `q_network` are not defined in the visible part of
# this notebook; this cell relies on them already existing in the kernel.

# Evenly spaced probe inputs 0, 1, ..., 10 (despite the name, not random).
random_input = torch.arange(0, 11, 1, dtype=torch.float32)

# Greedy action of the network for each probe input; gradients are not needed.
with torch.no_grad():
    optimal_actions = [
        torch.argmax(q_network(value.unsqueeze(0))).item()  # 0-d scalar -> 1-element tensor
        for value in random_input
    ]

print("Random Input:", random_input)
print("Optimal Actions:", optimal_actions)

Random Input: tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])
Optimal Actions: [2, 2, 2, 4, 4, 4, 5, 5, 5, 5, 5]


In [4]:
# Shape of the probe tensor built in the q_network inspection cell.
random_input.size()

torch.Size([11])