In [5]:
import numpy as np

# Tabular Q-learning for a one-shot bargaining game.
#
# Each episode a seller draws a private value from a normal distribution and a
# buyer draws a private value from a fixed grid. The buyer (the learner)
# observes only its own value (the state) and picks a bid (the action). Trade
# happens when the bid is at least the seller's value; the buyer then earns
# `value_buyer - bid`, otherwise it earns 0.

# Parameters
avg_value_seller = 0.7
std_value_seller = 0.05

# Hyperparameters
num_actions = 11
num_episodes = 5000
alpha = 0.1  # Learning rate
gamma = 0  # Discount factor; not used by the update because every episode ends after one bid
initial_epsilon = 0.99
epsilon_decay = 0.999  # Decay factor for epsilon
min_epsilon = 0.01
seed = 0  # Fixed seed so that re-running the cell reproduces the same trajectory

# Seed the global NumPy RNG that every np.random.* call below draws from.
np.random.seed(seed)

# Divide the buyer value range into 11 divisions
buyer_value_divisions = np.linspace(0.5, 1.0, num_actions)

# Q-table initialization: one row per buyer-value division, one column per bid
num_states = len(buyer_value_divisions)
q_table = np.zeros((num_states, num_actions))
action2bid = np.linspace(0, 1, num_actions)  # Mapping of action index to bid value

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):

    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller)

    # Buyer realizes value: sample the state index uniformly and look the
    # value up, instead of sampling a value and searching for its index.
    state = np.random.randint(num_states)
    value_buyer = buyer_value_divisions[state]

    # Select action using epsilon-greedy strategy
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.randint(0, num_actions)  # Exploration: Random action
    else:
        action = np.argmax(q_table[state])  # Exploitation: Choose best action based on Q-values

    bid = action2bid[action]  # Convert action index to bid value

    if bid >= value_seller:
        reward = value_buyer - bid  # Trade happens: buyer keeps the surplus
    else:
        reward = 0  # No reward if bid is below seller's value

    # Q-value update. The episode terminates after this single step, so there
    # is no successor state to bootstrap from and the TD target is the reward.
    q_table[state, action] += alpha * (reward - q_table[state, action])

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print a progress snapshot every 500 episodes
    if episode % 500 == 0:
        print("Seller value:", round(value_seller, 2), "Buyer value:", round(value_buyer, 2), "Bid:", round(bid, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2))

Seller value: 0.74 Buyer value: 0.9 Bid: 0.6 Reward: 0 Epsilon: 0.99
Seller value: 0.68 Buyer value: 0.9 Bid: 0.8 Reward: 0.1 Epsilon: 0.6
Seller value: 0.69 Buyer value: 0.75 Bid: 0.7 Reward: 0.05 Epsilon: 0.36
Seller value: 0.74 Buyer value: 0.9 Bid: 0.8 Reward: 0.1 Epsilon: 0.22
Seller value: 0.65 Buyer value: 0.5 Bid: 0.0 Reward: 0 Epsilon: 0.13
Seller value: 0.65 Buyer value: 0.75 Bid: 0.7 Reward: 0.05 Epsilon: 0.08
Seller value: 0.75 Buyer value: 0.85 Bid: 0.7 Reward: 0 Epsilon: 0.05
Seller value: 0.68 Buyer value: 0.95 Bid: 0.8 Reward: 0.15 Epsilon: 0.03
Seller value: 0.65 Buyer value: 0.6 Bid: 0.0 Reward: 0 Epsilon: 0.02
Seller value: 0.78 Buyer value: 0.95 Bid: 0.8 Reward: 0.15 Epsilon: 0.01


In [86]:
import numpy as np
import torch
import random
import torch.nn as nn
import torch.optim as optim

# ---- Environment --------------------------------------------------------
# Mean and standard deviation of the normal distribution the seller's value
# is sampled from.
avg_value_seller = 0.2
std_value_seller = 0.05

# ---- Discretisation -----------------------------------------------------
num_actions = 11
num_states = 1  # Passed to QNetwork as input_size: the state is a single scalar feature
# Grid of possible buyer values (11 evenly spaced points on [0, 1])
buyer_value_divisions = np.linspace(0.0, 1.0, num=num_actions)
# Lookup table: action index -> bid value
action2bid = np.linspace(0, 1, num=num_actions)

# ---- Optimisation -------------------------------------------------------
num_episodes = 50000
batch_size = 512
alpha = 0.1  # Learning rate
gamma = 0  # No discount for immediate rewards

# ---- Exploration schedule -----------------------------------------------
initial_epsilon = 0.99
epsilon_decay = 0.9999  # Multiplicative decay applied to epsilon
min_epsilon = 0.01

# ---- Declared but not referenced in the visible cells --------------------
# NOTE(review): confirm whether these were meant to size the network and cap
# the replay buffer.
hidden_units = 100
memory_capacity = 1000

# Q-network definition
class QNetwork(nn.Module):
    """Small fully connected network mapping a state to one Q-value per action.

    Architecture: Linear(input_size -> hidden_size), ReLU,
    Linear(hidden_size -> output_size).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers.

        Args:
            input_size: Number of input features per state.
            hidden_size: Width of the hidden layer.
            output_size: Number of outputs (one per action).
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output-layer activations for input tensor ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)
    
# Deep Q-learning for the one-shot bargaining game: the buyer observes the
# index of its private value (the state), picks a bid index (the action), and
# earns `value_buyer - bid` if the bid is at least the seller's value, else 0.

# Reproducibility: seed every RNG this cell draws from *before* the network
# is constructed, since layer initialisation consumes torch's RNG.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# NOTE(review): the hidden width is num_actions * 2; the `hidden_units`
# hyperparameter is not used here -- confirm which one is intended.
q_network = QNetwork(num_states, num_actions*2, num_actions)
optimizer = optim.Adam(q_network.parameters(), lr=alpha)
loss_fn = nn.MSELoss()  # Built once instead of on every update step

# Experience replay buffer of (state, action, reward) tuples.
# NOTE(review): the buffer grows without bound; `memory_capacity` is not
# enforced -- confirm whether old experiences should be evicted.
memory = []

# Training loop
epsilon = initial_epsilon
for episode in range(num_episodes):
    # Seller realizes value
    value_seller = np.random.normal(avg_value_seller, std_value_seller)

    # Buyer realizes value: sample the state index uniformly and look the
    # value up, instead of sampling a value and searching for its index.
    state = np.random.randint(len(buyer_value_divisions))
    value_buyer = buyer_value_divisions[state]

    # Select action using epsilon-greedy strategy
    if random.random() < epsilon:
        action = random.randint(0, num_actions-1)  # Exploration: Random action
    else:
        # Exploitation: pure inference, so skip building an autograd graph.
        with torch.no_grad():
            q_values = q_network(torch.tensor([state], dtype=torch.float32))
        action = torch.argmax(q_values).item()

    bid = action2bid[action]  # Convert action index to bid value

    if bid >= value_seller:
        reward = value_buyer - bid  # Trade happens: buyer keeps the surplus
    else:
        reward = 0  # No reward if bid is below seller's value

    # Store experience in memory
    memory.append((state, action, reward))

    # Sample a batch from memory for training
    if len(memory) >= batch_size:
        batch = random.sample(memory, batch_size)
        states, actions, rewards = zip(*batch)
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)

        # Q(s, a) for the action actually taken in each sampled experience
        q_values = q_network(states.unsqueeze(1))
        predicted_q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Every episode terminates after one step, so there is no successor
        # state to bootstrap from: the TD target is the observed reward. This
        # also keeps gradients from flowing through the target.
        target_q_values = rewards

        loss = loss_fn(predicted_q_values, target_q_values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print a progress snapshot every 5000 episodes, including the mean reward
    # over the most recent (up to) 1000 stored experiences.
    if episode % 5000 == 0:
        average_rewards = np.mean([m[2] for m in memory[-1000:]])
        print("Seller value:", round(value_seller, 2), "Buyer value:", round(value_buyer, 2),
              "Bid:", round(bid, 2), "Reward:", round(reward, 2), "Epsilon:", round(epsilon, 2),
              "Average Reward:", round(average_rewards, 2))


Seller value: 0.19 Buyer value: 0.6 Bid: 0.9 Reward: -0.3 Epsilon: 0.99 Average Reward: -0.3
Seller value: 0.2 Buyer value: 0.7 Bid: 0.3 Reward: 0.4 Epsilon: 0.6 Average Reward: 0.05
Seller value: 0.22 Buyer value: 0.6 Bid: 0.3 Reward: 0.3 Epsilon: 0.36 Average Reward: 0.13
Seller value: 0.26 Buyer value: 0.5 Bid: 0.3 Reward: 0.2 Epsilon: 0.22 Average Reward: 0.16
Seller value: 0.14 Buyer value: 0.6 Bid: 0.0 Reward: 0 Epsilon: 0.13 Average Reward: 0.2
Seller value: 0.23 Buyer value: 0.4 Bid: 0.2 Reward: 0 Epsilon: 0.08 Average Reward: 0.23
Seller value: 0.31 Buyer value: 0.9 Bid: 0.3 Reward: 0 Epsilon: 0.05 Average Reward: 0.24
Seller value: 0.22 Buyer value: 0.2 Bid: 0.1 Reward: 0 Epsilon: 0.03 Average Reward: 0.24
Seller value: 0.26 Buyer value: 0.9 Bid: 0.3 Reward: 0.6 Epsilon: 0.02 Average Reward: 0.25
Seller value: 0.21 Buyer value: 0.4 Bid: 0.3 Reward: 0.1 Epsilon: 0.01 Average Reward: 0.25


In [87]:
# Evaluation grid: the state values 0..10 as float32 scalars. The values are
# a fixed range rather than random draws; the name is kept because a later
# cell reads it.
random_input = torch.arange(11, dtype=torch.float32)

# Query the trained network once per state and keep the index of the
# highest-valued action. Gradients are not needed for this readout.
with torch.no_grad():
    optimal_actions = [
        q_network(state_value.unsqueeze(0)).argmax().item()  # unsqueeze -> shape (1,) to match input_size
        for state_value in random_input
    ]

print("Random Input:", random_input)
print("Optimal Actions:", optimal_actions)

Random Input: tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])
Optimal Actions: [0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3]


In [58]:
# Inspect the shape of the evaluation grid; as the cell's last expression it
# is displayed as the cell output.
random_input.shape

torch.Size([11])