### Simple Q-learning buyer making bids to a seller with unknown but fixed reservation price. 

In [1]:
import numpy as np

# Parameters
value_buyer = 1
value_seller = 0.2

# Hyperparameters
num_actions = 21
num_episodes = 5000
alpha = 0.1  # Learning rate
gamma = 0  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.999  # Decay factor for epsilon
min_epsilon = 0.01

# Q-table initialization
q_table = np.zeros((num_actions,))
bid2action = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

print('Value Buyer:', value_buyer)
print('Value Seller:', value_seller)

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    
    # Select action using epsilon-greedy strategy
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action = np.argmax(q_table)  # Exploitation: Choose best action based on Q-values

    bid = bid2action[action]  # Convert action index to bid value
    
    if bid >= value_seller:
        reward = value_buyer - bid  # Calculate reward based on bid and buyer's value
    else:
        reward = 0  # No reward if bid is below seller's value

    # Q-value update using Q-learning equation
    q_table[action] += alpha * (reward + gamma * np.max(q_table) - q_table[action])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % 500 == 0:
        print("Bid:", round(bid, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2))


Value Buyer: 1
Value Seller: 0.2
Bid: 0.95 Reward: 0.05 Epsilon: 0.99
Bid: 0.2 Reward: 0.8 Epsilon: 0.6
Bid: 0.2 Reward: 0.8 Epsilon: 0.36
Bid: 0.2 Reward: 0.8 Epsilon: 0.22
Bid: 0.2 Reward: 0.8 Epsilon: 0.13
Bid: 0.2 Reward: 0.8 Epsilon: 0.08
Bid: 0.2 Reward: 0.8 Epsilon: 0.05
Bid: 0.2 Reward: 0.8 Epsilon: 0.03
Bid: 0.2 Reward: 0.8 Epsilon: 0.02
Bid: 0.2 Reward: 0.8 Epsilon: 0.01


### Q-learning Network to approximate Q-table

In [4]:
import torch
import random
import torch.nn as nn
import torch.optim as optim

# Parameters
value_buyer = 1
value_seller = 0.2

# Hyperparameters
num_actions = 21
num_episodes = 2000
batch_size = 32
alpha = 0.1  # Learning rate
gamma = 0  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.99  # Decay factor for epsilon
min_epsilon = 0.01

# Q-network definition
class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc = nn.Linear(num_actions, num_actions)
    def forward(self, x):
        return self.fc(x)

q_network = QNetwork()
optimizer = optim.Adam(q_network.parameters(), lr=alpha)

# Bid to action mapping
bid2action = torch.linspace(0, 1, num_actions)

print('Value Buyer:', value_buyer)
print('Value Seller:', value_seller)

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    state = torch.zeros(num_actions)
    
    # Select action using epsilon-greedy strategy
    if random.random() < epsilon:
        action = random.randint(0, num_actions-1)  # Exploration: Random action
    else:
        q_values = q_network(state)
        action = torch.argmax(q_values).item()  # Exploitation: Choose best action based on Q-values

    bid = bid2action[action]  # Convert action index to bid value
    
    if bid >= value_seller:
        reward = value_buyer - bid  # Calculate reward based on bid and buyer's value
    else:
        reward = bid*0  # No reward if bid is below seller's value

    # Q-value update using Q-learning equation
    q_values = q_network(state)
    next_q_value = torch.max(q_values)
    target_q = reward + gamma * next_q_value
    loss = nn.MSELoss()(q_values[action], target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % 500 == 0:
        print("Episode:", episode, "Epsilon:", round(epsilon, 2), "Bid:", round(bid.item(), 2), "Reward:", round(reward.item(), 2))



Value Buyer: 1
Value Seller: 0.2
Episode: 0 Epsilon: 0.98 Bid: 0.3 Reward: 0.7
Episode: 500 Epsilon: 0.01 Bid: 0.2 Reward: 0.8
Episode: 1000 Epsilon: 0.01 Bid: 0.2 Reward: 0.8
Episode: 1500 Epsilon: 0.01 Bid: 0.2 Reward: 0.8


In [13]:
print(q_network(state))

tensor([-8.3231e-02,  5.5671e-04, -7.2437e-01, -2.9054e-01,  8.0000e-01,
         7.4481e-01,  5.7152e-01, -3.7087e-01,  6.8265e-01,  6.0429e-01,
         4.9629e-01,  5.1394e-01,  1.4285e-01,  5.9004e-01, -1.5994e-02,
        -5.7252e-01, -9.1316e-01, -3.4263e-01, -3.4104e-01, -3.6717e-01,
         3.3541e-01], grad_fn=<AddBackward0>)


### Q-Network with Experience Replay to stabilize training

In [None]:
import torch
import random
import torch.nn as nn
import torch.optim as optim
from collections import deque

# Hyperparameters
num_actions = 21
num_episodes = 2000
batch_size = 32
alpha = 0.1  # Learning rate
gamma = 0  # No discount for immediate rewards
initial_epsilon = 0.99
epsilon_decay = 0.999  # Decay factor for epsilon
min_epsilon = 0.01
memory_capacity = 1000

# Q-network definition
class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc = nn.Linear(num_actions, num_actions)
    def forward(self, x):
        return self.fc(x)

q_network = QNetwork()
optimizer = optim.Adam(q_network.parameters(), lr=alpha)

# Bid to action mapping
bid2action = torch.linspace(0, 1, num_actions)

print('Value Buyer:', value_buyer)
print('Value Seller:', value_seller)

# Experience replay buffer
memory = deque(maxlen=memory_capacity)

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    state = torch.zeros(num_actions)
    
    # Select action using epsilon-greedy strategy
    if random.random() < epsilon:
        action = random.randint(0, num_actions-1)  # Exploration: Random action
    else:
        q_values = q_network(state)
        action = torch.argmax(q_values).item()  # Exploitation: Choose best action based on Q-values

    bid = bid2action[action]  # Convert action index to bid value
    
    if bid >= value_seller:
        reward = value_buyer - bid  # Calculate reward based on bid and buyer's value
    else:
        reward = bid*0  # No reward if bid is below seller's value
    
    # Store experience in memory
    memory.append((state, action, reward))

    # Sample a batch from memory for training
    if len(memory) >= batch_size:
        batch = random.sample(memory, batch_size)
        for state, action, reward in batch:
            q_values = q_network(state)
            next_q_value = torch.max(q_network(state))
            target_q = reward + gamma * next_q_value
            loss = nn.MSELoss()(q_values[action], target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
          
    if episode % 200 == 0:
        print("Episode:", episode, "Epsilon:", round(epsilon, 2), "Bid:", round(bid.item(), 2), "Reward:", round(reward.item(), 2))
    
# Print the optimal action given state at the end of training
optimal_action = torch.argmax(q_network(state)).item()
optimal_bid = bid2action[optimal_action]
print("Optimal Action:", optimal_action, "Optimal Bid:", optimal_bid.item())


Value Buyer: 1
Value Seller: 0.2
Episode: 0 Epsilon: 0.99 Bid: 0.95 Reward: 0.05
Episode: 200 Epsilon: 0.81 Bid: 0.9 Reward: 0.35
Episode: 400 Epsilon: 0.66 Bid: 0.2 Reward: 0.0
Episode: 600 Epsilon: 0.54 Bid: 0.45 Reward: 0.1
Episode: 800 Epsilon: 0.44 Bid: 0.2 Reward: 0.8
Episode: 1000 Epsilon: 0.36 Bid: 0.25 Reward: 0.6
Episode: 1200 Epsilon: 0.3 Bid: 0.2 Reward: 0.55


In [None]:
# Print the optimal action given state at the end of training
optimal_action = torch.argmax(q_network(state)).item()
optimal_bid = bid2action[optimal_action]
print("Optimal Action:", optimal_action, "Optimal Bid:", optimal_bid.item())

In [None]:
q_network(state)