### Simple Q-learning buyer bidding against a seller with an unknown but fixed reservation price

In [101]:
import numpy as np

# Parameters
value_buyer = 1  # Buyer's valuation of the item
value_seller = 0.2  # Seller's reservation price: fixed; the learner only sees it through the reward

# Hyperparameters
num_actions = 21
num_episodes = 5000
alpha = 0.1  # Learning rate
gamma = 0  # Each episode is a single bid, so there is no future return to discount
initial_epsilon = 0.99
epsilon_decay = 0.999  # Decay factor for epsilon
min_epsilon = 0.01
seed = 0  # Fixed seed so that re-running the cell reproduces the same trajectory

# Dedicated generator: reproducible and independent of NumPy's global random state
rng = np.random.default_rng(seed)

# Q-table initialization (stateless problem: one Q-value per action)
q_table = np.zeros((num_actions,))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value
bid2action = action2bid  # Old (reversed) name, kept as an alias for code that still refers to it

print('Value Buyer:', value_buyer)
print('Value Seller:', value_seller)

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):

    # Select action using epsilon-greedy strategy
    if rng.random() < epsilon:
        action = rng.integers(num_actions)  # Exploration: uniformly random action
    else:
        action = np.argmax(q_table)  # Exploitation: best action under current Q-values

    bid = action2bid[action]  # Convert action index to bid value

    # The trade happens only if the bid meets the seller's reservation price
    if bid >= value_seller:
        reward = value_buyer - bid  # Buyer's surplus
    else:
        reward = 0  # No trade, no surplus

    # Q-value update using Q-learning equation (the bootstrap term vanishes while gamma == 0)
    q_table[action] += alpha * (reward + gamma * np.max(q_table) - q_table[action])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Progress report every 500 episodes (epsilon shown is the value after this episode's decay)
    if episode % 500 == 0:
        print("Bid:", round(bid, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2))


Value Buyer: 1
Value Seller: 0.2
Bid: 0.0 Reward: 0 Epsilon: 0.99
Bid: 0.7 Reward: 0.3 Epsilon: 0.6
Bid: 0.35 Reward: 0.65 Epsilon: 0.36
Bid: 0.2 Reward: 0.8 Epsilon: 0.22
Bid: 0.2 Reward: 0.8 Epsilon: 0.13
Bid: 0.15 Reward: 0 Epsilon: 0.08
Bid: 0.2 Reward: 0.8 Epsilon: 0.05
Bid: 0.2 Reward: 0.8 Epsilon: 0.03
Bid: 0.2 Reward: 0.8 Epsilon: 0.02
Bid: 0.2 Reward: 0.8 Epsilon: 0.01


In [103]:
# Learned Q-values, one entry per action index (i.e. per bid level)
print(q_table)

[0.         0.         0.         0.         0.8        0.74718217
 0.69161924 0.63682089 0.59999991 0.54341512 0.49910149 0.44563602
 0.39408765 0.34534402 0.29860848 0.24871156 0.19858607 0.1490456
 0.09903023 0.04964652 0.        ]


### Q-learning against a seller with an unknown, random reservation price

In [107]:
import numpy as np

# Parameters
value_buyer = 1  # Buyer's valuation of the item
avg_value_seller = 0.2  # Mean of the seller's reservation price
std_value_seller = 0.05  # Standard deviation of the seller's reservation price

# Hyperparameters
num_actions = 21
num_episodes = 10000
alpha = 0.1  # Learning rate
gamma = 0  # Each episode is a single bid, so there is no future return to discount
initial_epsilon = 0.99
epsilon_decay = 0.995  # Decay factor for epsilon
min_epsilon = 0.01
seed = 0  # Fixed seed so that re-running the cell reproduces the same trajectory

# Dedicated generator: reproducible and independent of NumPy's global random state
rng = np.random.default_rng(seed)

# Q-table initialization (stateless problem: one Q-value per action)
q_table = np.zeros((num_actions,))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value
bid2action = action2bid  # Old (reversed) name, kept as an alias for code that still refers to it

print('Value Buyer:', value_buyer)
print('Avg Value Seller:', avg_value_seller)

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):

    # Seller realizes a fresh reservation price every episode (normal draw, so it is unbounded)
    value_seller = rng.normal(avg_value_seller, std_value_seller)

    # Select action using epsilon-greedy strategy
    if rng.random() < epsilon:
        action = rng.integers(num_actions)  # Exploration: uniformly random action
    else:
        action = np.argmax(q_table)  # Exploitation: best action under current Q-values

    bid = action2bid[action]  # Convert action index to bid value

    # The trade happens only if the bid meets this episode's reservation price
    if bid >= value_seller:
        reward = value_buyer - bid  # Buyer's surplus
    else:
        reward = 0  # No trade, no surplus

    # Q-value update using Q-learning equation (the bootstrap term vanishes while gamma == 0)
    q_table[action] += alpha * (reward + gamma * np.max(q_table) - q_table[action])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Progress report every 500 episodes; each quantity is printed exactly once
    if episode % 500 == 0:
        print("Bid:", round(bid, 2), "Seller value:", round(value_seller, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2))



Value Buyer: 1
Avg Value Seller: 0.2
Bid: 0.9 Seller value: 0.13 0.9 Reward: 0.1 Epsilon: 0.99
Bid: 0.35 Seller value: 0.19 0.35 Reward: 0.65 Epsilon: 0.08
Bid: 0.35 Seller value: 0.17 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.23 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.12 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.16 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.16 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.2 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.18 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.14 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.21 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.18 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.26 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.21 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.22 0.35 Reward: 0.65 Epsilon: 0.01
Bid: 0.35 Seller value: 0.26 0.35 Reward: 0.65 Epsil

### Q-learning when the buyer's value varies but is known, while the seller's value is unknown and random

In [113]:
import numpy as np

# Parameters
avg_value_seller = 0.7  # Mean of the seller's reservation price
std_value_seller = 0.05  # Standard deviation of the seller's reservation price

# Hyperparameters
num_actions = 21  # Number of discrete bid levels
num_states = 21  # Number of discrete buyer values (independent of the number of bids)
num_episodes = 200000
alpha = 0.1  # Learning rate
gamma = 0  # Each episode is a single bid, so there is no future return to discount
initial_epsilon = 0.99
epsilon_decay = 0.99999  # Decay factor for epsilon
min_epsilon = 0.01
seed = 0  # Fixed seed so that re-running the cell reproduces the same trajectory

# Dedicated generator: reproducible and independent of NumPy's global random state
rng = np.random.default_rng(seed)

# Grid of possible buyer values: num_states evenly spaced points on [0.5, 1.0]
buyer_value_divisions = np.linspace(0.5, 1.0, num_states)

# Q-table initialization: one row per buyer value (state), one column per bid (action)
q_table = np.zeros((num_states, num_actions))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):

    # Seller realizes a fresh reservation price every episode
    value_seller = rng.normal(avg_value_seller, std_value_seller)

    # Buyer realizes value: draw the state index uniformly and look the value up,
    # instead of drawing a value and searching the grid for its index
    state = rng.integers(num_states)
    value_buyer = buyer_value_divisions[state]

    # Select action using epsilon-greedy strategy
    if rng.random() < epsilon:
        action = rng.integers(num_actions)  # Exploration: uniformly random action
    else:
        action = np.argmax(q_table[state])  # Exploitation: best action for this buyer value

    bid = action2bid[action]  # Convert action index to bid value

    # The trade happens only if the bid meets this episode's reservation price
    if bid >= value_seller:
        reward = value_buyer - bid  # Buyer's surplus; negative when the bid exceeds the buyer's value
    else:
        reward = 0  # No trade, no surplus

    # Q-value update using Q-learning equation (the bootstrap term vanishes while gamma == 0)
    q_table[state, action] += alpha * (reward + gamma * np.max(q_table[state]) - q_table[state, action])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Progress report every 20000 episodes (epsilon shown is the value after this episode's decay)
    if episode % 20000 == 0:
        print("Seller value:", round(value_seller, 2), "Buyer value:", round(value_buyer, 2), "Bid:", round(bid, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2))


Seller value: 0.6 Buyer value: 0.92 Bid: 0.65 Reward: 0.28 Epsilon: 0.99
Seller value: 0.59 Buyer value: 0.62 Bid: 0.15 Reward: 0 Epsilon: 0.81
Seller value: 0.75 Buyer value: 0.88 Bid: 0.3 Reward: 0 Epsilon: 0.66
Seller value: 0.69 Buyer value: 0.68 Bid: 0.65 Reward: 0 Epsilon: 0.54
Seller value: 0.64 Buyer value: 0.62 Bid: 0.6 Reward: 0 Epsilon: 0.44
Seller value: 0.67 Buyer value: 1.0 Bid: 0.75 Reward: 0.25 Epsilon: 0.36
Seller value: 0.72 Buyer value: 0.52 Bid: 0.0 Reward: 0 Epsilon: 0.3
Seller value: 0.65 Buyer value: 0.82 Bid: 0.9 Reward: -0.08 Epsilon: 0.24
Seller value: 0.62 Buyer value: 0.92 Bid: 0.75 Reward: 0.18 Epsilon: 0.2
Seller value: 0.65 Buyer value: 0.95 Bid: 0.75 Reward: 0.2 Epsilon: 0.16


In [114]:
# Buyer-value grid, followed by the bid with the highest Q-value for each buyer value
greedy_actions = q_table.argmax(axis=1)
print(buyer_value_divisions)
print(action2bid[greedy_actions])

[0.5   0.525 0.55  0.575 0.6   0.625 0.65  0.675 0.7   0.725 0.75  0.775
 0.8   0.825 0.85  0.875 0.9   0.925 0.95  0.975 1.   ]
[0.   0.   0.   0.   0.   0.6  0.6  0.65 0.65 0.7  0.7  0.7  0.75 0.75
 0.75 0.75 0.75 0.75 0.8  0.8  0.75]
