### DDPG

In [None]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

class OUActionNoise:
    """Ornstein-Uhlenbeck process used as temporally correlated exploration noise.

    Each call advances the process by one step of size ``dt`` and returns the
    new sample, which has the same shape as ``mean``.

    Args:
        mean: Array the process reverts towards.
        std_deviation: Scale of the random increment.
        theta: Strength of the pull back towards ``mean``.
        dt: Step size.
        x_initial: Optional starting value; zeros shaped like ``mean`` if omitted.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process one step and return the new sample."""
        # Deterministic pull towards the mean.
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        # Random increment scaled by sqrt(dt).
        diffusion = self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        sample = self.x_prev + drift + diffusion
        self.x_prev = sample
        return sample

    def reset(self):
        """Restore the process to ``x_initial`` (or zeros when none was given)."""
        if self.x_initial is None:
            self.x_prev = np.zeros_like(self.mean)
        else:
            self.x_prev = self.x_initial

class Buffer:
    """Fixed-size circular replay buffer that also performs the DDPG update.

    The buffer sizes its arrays from the module-level ``num_states`` and
    ``num_actions``; ``update`` trains the module-level ``actor_model`` and
    ``critic_model`` using ``target_actor``, ``target_critic``, ``gamma`` and
    the two optimizers, all looked up as globals.

    Args:
        buffer_capacity: Maximum number of stored transitions.
        batch_size: Number of transitions sampled per learning step.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of transitions ever recorded (not capped at capacity).
        self.buffer_counter = 0
        self.state_buffer = np.zeros((buffer_capacity, num_states))
        self.action_buffer = np.zeros((buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((buffer_capacity, 1))
        self.next_state_buffer = np.zeros((buffer_capacity, num_states))

    def record(self, obs_tuple):
        """Store one ``(state, action, reward, next_state)`` transition.

        Once the buffer is full the oldest slot is overwritten.
        """
        slot = self.buffer_counter % self.buffer_capacity
        prev_state, action, reward, next_state = obs_tuple
        self.state_buffer[slot] = prev_state
        self.action_buffer[slot] = action
        self.reward_buffer[slot] = reward
        self.next_state_buffer[slot] = next_state
        self.buffer_counter += 1

    @tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch):
        """Run one critic step followed by one actor step on a sampled batch."""
        # Critic: regress Q(s, a) onto r + gamma * Q_target(s', pi_target(s')).
        with tf.GradientTape() as critic_tape:
            next_actions = target_actor(next_state_batch, training=True)
            next_q = target_critic([next_state_batch, next_actions], training=True)
            td_target = reward_batch + gamma * next_q
            q_pred = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.reduce_mean(tf.square(td_target - q_pred))
        critic_vars = critic_model.trainable_variables
        critic_grad = critic_tape.gradient(critic_loss, critic_vars)
        critic_optimizer.apply_gradients(zip(critic_grad, critic_vars))

        # Actor: maximise the critic's value of the actor's own actions
        # (implemented as minimising the negated mean).
        with tf.GradientTape() as actor_tape:
            proposed_actions = actor_model(state_batch, training=True)
            proposed_q = critic_model([state_batch, proposed_actions], training=True)
            actor_loss = -tf.reduce_mean(proposed_q)
        actor_vars = actor_model.trainable_variables
        actor_grad = actor_tape.gradient(actor_loss, actor_vars)
        actor_optimizer.apply_gradients(zip(actor_grad, actor_vars))

    def learn(self):
        """Sample a batch (with replacement) from the filled slots and train on it."""
        filled = min(self.buffer_counter, self.buffer_capacity)
        picks = np.random.choice(filled, self.batch_size)

        states = tf.convert_to_tensor(self.state_buffer[picks])
        actions = tf.convert_to_tensor(self.action_buffer[picks])
        rewards = tf.convert_to_tensor(self.reward_buffer[picks])
        rewards = tf.cast(rewards, dtype=tf.float32)
        next_states = tf.convert_to_tensor(self.next_state_buffer[picks])

        self.update(states, actions, rewards, next_states)

@tf.function
def update_target(target_weights, weights, tau):
    """Soft-update target variables in place: target <- tau * source + (1 - tau) * target.

    Args:
        target_weights: Variables of the target network (assigned in place).
        weights: Matching variables of the online network.
        tau: Interpolation factor applied to the online weights.
    """
    for target_var, source_var in zip(target_weights, weights):
        blended = source_var * tau + target_var * (1 - tau)
        target_var.assign(blended)

def get_actor():
    """Build the actor network mapping a state to a bounded action.

    Reads the module-level ``num_states``, ``num_actions`` and ``upper_bound``.
    The final ``tanh`` layer is scaled by ``upper_bound``, so raw outputs lie in
    ``[-upper_bound, upper_bound]``.

    Returns:
        A ``tf.keras.Model`` taking ``(batch, num_states)`` inputs and producing
        ``(batch, num_actions)`` outputs.
    """
    # Small final-layer weights keep the initial tanh output away from saturation.
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    # One output unit per action dimension (previously hard-coded to 1), so the
    # actor agrees with the critic's action input and the replay buffer.
    outputs = layers.Dense(num_actions, activation="tanh", kernel_initializer=last_init)(out)
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def get_critic():
    """Build the critic network estimating Q(state, action).

    Reads the module-level ``num_states`` and ``num_actions``. State and action
    pass through separate dense branches before being concatenated.

    Returns:
        A ``tf.keras.Model`` taking ``[state, action]`` and producing one value
        per sample.
    """
    # Shapes must be tuples: ``(n)`` is just the int ``n``, which newer Keras
    # versions reject and which is inconsistent with ``get_actor``.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])
    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)
    model = tf.keras.Model([state_input, action_input], outputs)
    return model

def policy(state, noise_object):
    """Return a noisy, clipped action for ``state`` from the global ``actor_model``.

    Args:
        state: Batched state tensor passed straight to ``actor_model``.
        noise_object: Zero-argument callable returning exploration noise.

    Returns:
        A one-element list holding the squeezed action, clipped to the
        module-level ``[lower_bound, upper_bound]``.
    """
    deterministic = tf.squeeze(actor_model(state)).numpy()
    noisy = deterministic + noise_object()
    bounded = np.clip(noisy, lower_bound, upper_bound)
    return [np.squeeze(bounded)]

# Environment: build the task and read state/action sizes and action bounds from its spaces.
problem = "Pendulum-v1"
env = gym.make(problem)
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]
print(num_states, num_actions)
print(upper_bound, lower_bound)

# Hyper parameters, networks and optimizers. These are module-level names that
# Buffer.update, policy, get_actor and get_critic read as globals.
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 1
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# training loop
# NOTE(review): the unpacking below assumes reset() returns only the observation
# and step() returns a 4-tuple — confirm against the installed gym version.
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Add a batch axis before feeding the state to the actor.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        state, reward, done, info = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        # One gradient step and one soft target update per environment step.
        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Running mean over (at most) the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

### DDPG Custom Env

In [None]:
import numpy as np

class CustomRandomEnv:
    """Toy single-agent environment: a scalar state accumulates the clipped action.

    Each episode lasts ``episode_length`` steps. The reward at every step is the
    negative absolute value of the accumulated state.
    """

    def __init__(self):
        self.num_states = 1
        self.num_actions = 1
        self.upper_bound = 1.0
        self.lower_bound = 0.0
        self.episode_length = 3
        self.current_period = 0
        self.state = 0.0
        self.action_space = gym.spaces.Box(low=self.lower_bound, high=self.upper_bound, shape=(self.num_actions,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(self.num_states,), dtype=np.float32)

    def reset(self):
        """Start a new episode and return the initial observation, shape ``(1,)``."""
        self.current_period = 0
        self.state = 0.0
        return np.array([self.state])

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Args:
            action: A scalar, or any array-like whose first element is the
                action (e.g. the one-element list produced by ``policy``).

        Returns:
            Observation of shape ``(1,)``, scalar reward, ``done`` flag and an
            empty info dict.

        Raises:
            ValueError: If called after the episode has ended.
        """
        if self.current_period >= self.episode_length:
            raise ValueError("Episode has ended. Please reset the environment.")
        # Reduce the action to a plain float first. Adding an array to
        # ``self.state`` would turn the state into an array, so step() would
        # return (1, 1)-shaped observations and array rewards, unlike reset().
        scalar_action = float(np.asarray(action, dtype=float).reshape(-1)[0])
        self.state += float(np.clip(scalar_action, self.lower_bound, self.upper_bound))
        self.current_period += 1
        reward = -np.abs(self.state)
        done = self.current_period >= self.episode_length
        return np.array([self.state]), reward, done, {}

# Use the custom environment; sizes and bounds are re-read so the networks and
# buffer created below match it.
env = CustomRandomEnv()
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Hyper parameters, networks and optimizers (module-level names read as globals
# by Buffer.update, policy, get_actor and get_critic).
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 10
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# training loop
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Add a batch axis before feeding the state to the actor.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        state, reward, done, info = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        # One gradient step and one soft target update per environment step.
        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Running mean over (at most) the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

### Single DDPG (Monopsony)

In [None]:
import numpy as np
from functions import *
from itertools import count

def graph_period(db, rnd, period):
    """Plot the bids, asks and transaction prices recorded for one period.

    Args:
        db: Database object exposing ``get_period``, ``get_round`` and the
            ``ntokens``/``nbuyers``/``nsellers``/``nsteps`` attributes.
        rnd: Round index.
        period: Period index within the round.
    """
    # Fetch the period once instead of re-querying it for every column.
    period_data = db.get_period(rnd, period)
    period_bids = list(period_data.bids)
    period_asks = list(period_data.asks)
    period_prices = list(period_data.price)
    # Running count of steps whose price is positive.
    period_sales = np.cumsum(np.where(period_data.price > 0, 1, 0))

    round_row = db.get_round(rnd).iloc[0].tolist()
    (_, demand_schedule, supply_schedule, P_grid,
     redemption_values, token_costs, p_eqbm, q_eqbm) = round_row

    graph(demand_schedule, supply_schedule, P_grid, p_eqbm, q_eqbm,
          period_bids, period_asks, period_prices, period_sales,
          redemption_values, token_costs,
          db.ntokens, db.nbuyers, db.nsellers, db.nsteps)
    
def compute_demand_supply(redemption_values,token_costs,nbuyers,ntokens,granularity=100):
    """Tabulate demand and supply quantities on an evenly spaced price grid.

    Args:
        redemption_values: Buyer token values (any array-like).
        token_costs: Seller token costs (any array-like).
        nbuyers: Unused; kept for interface compatibility.
        ntokens: Unused; kept for interface compatibility.
        granularity: Number of grid points.

    Returns:
        ``(demand_schedule, supply_schedule, P_grid, min_price, max_price)``.
        ``demand_schedule[i]`` counts values at or above ``P_grid[i]`` and
        ``supply_schedule[i]`` counts costs at or below it.
    """
    values = np.asarray(redemption_values).ravel()
    costs = np.asarray(token_costs).ravel()
    max_price = np.max(redemption_values)
    min_price = np.min(token_costs)
    P_grid = np.linspace(min_price, max_price, granularity)

    # Compare every grid price against every value/cost at once.
    grid_column = P_grid[:, np.newaxis]
    demand_schedule = (grid_column <= values[np.newaxis, :]).sum(axis=1).astype('int')
    supply_schedule = (grid_column >= costs[np.newaxis, :]).sum(axis=1).astype('int')
    return demand_schedule, supply_schedule, P_grid, min_price, max_price

def equilibrium(demand_schedule,supply_schedule,P_grid):
    """Find the price and quantity where demand equals supply on the grid.

    Args:
        demand_schedule: Quantity demanded at each grid price.
        supply_schedule: Quantity supplied at each grid price.
        P_grid: Price grid.

    Returns:
        ``(p_eqbm, q_eqbm)``: the NaN-ignoring mean of all grid prices at which
        the schedules coincide, and the quantity at the last such price. Both
        are ``nan`` when the schedules never coincide.
    """
    prices = np.asarray(P_grid, dtype=float)
    n = len(prices)
    demand = np.asarray(demand_schedule)[:n]
    supply = np.asarray(supply_schedule)[:n]

    crossing = demand == supply
    if not np.any(crossing):
        # No crossing: return NaN explicitly instead of letting np.nanmean([])
        # emit a "Mean of empty slice" RuntimeWarning.
        return np.nan, np.nan
    return np.nanmean(prices[crossing]), demand[crossing][-1]

import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

def graph(demand_schedule, supply_schedule, P_grid, p_eqbm, q_eqbm,
          period_bids, period_asks, period_prices, period_sales,
          redemption_values, token_costs, ntokens, nbuyers, nsellers, nsteps):
    """Draw two figures for one period and save them as ``a.png`` and ``b.png``.

    The first figure overlays the demand/supply schedules with the quotes made
    at every step in which a sale occurred. The second shows all bids, asks and
    transaction prices over time. In both, ``period_bids[i][0]`` is drawn
    separately as the DRL agent's bid.

    Args:
        demand_schedule, supply_schedule, P_grid: Schedules and their price grid.
        p_eqbm, q_eqbm: Equilibrium price and quantity (``q_eqbm`` sets the x-limit).
        period_bids, period_asks: Per-step sequences of bids / asks.
        period_prices: Per-step transaction price.
        period_sales: Cumulative number of sales per step.
        redemption_values, token_costs, ntokens, nbuyers, nsellers: Unused;
            kept for interface compatibility.
        nsteps: Number of steps in the period.
    """
    bids_matrix = np.array(period_bids)
    maxprc, minprc = bids_matrix.max(), bids_matrix.min()

    # ---- Figure 1: schedules plus quotes at the steps where a sale happened.
    plt.plot(demand_schedule, P_grid, color='black', linestyle='--')
    plt.plot(supply_schedule, P_grid, color='black', linestyle='--')
    plt.plot(period_prices, color='green', linestyle='--', label='Transaction Price')

    labels_pending = True  # Attach legend labels only to the first scatter group.
    for i in range(nsteps):
        previous_sales = period_sales[i - 1] if i > 0 else 0
        if period_sales[i] - previous_sales != 1:
            continue  # No sale at this step.

        # Other buyers' bids are blue and sellers' asks are red, matching the
        # second figure. (Previously the asks were plotted under the "Bids"
        # label and the bids under "Asks".)
        other_bids = period_bids[i][1:]
        step_asks = period_asks[i]
        plt.scatter([i] * len(other_bids), other_bids, s=10, marker='o', c='blue',
                    label='Bids' if labels_pending else None)
        plt.scatter([i] * len(step_asks), step_asks, s=10, marker='o', c='red',
                    label='Asks' if labels_pending else None)
        plt.scatter(i, period_bids[i][0], s=30, marker='x', c='orange',
                    label='DRL Agent' if labels_pending else None)
        labels_pending = False

    plt.legend(loc='upper right')
    plt.xlabel('Quantity Sold')
    plt.ylabel('Offer Value')
    try:
        plt.xlim(0, q_eqbm + 2)
    except (TypeError, ValueError):
        # q_eqbm may be missing or NaN; keep the automatic x-limits then.
        pass
    plt.ylim(minprc, maxprc)
    plt.savefig('a.png')
    plt.show()

    # ---- Figure 2: every quote and transaction price over time.
    plt.plot(bids_matrix[:, 0], c='orange', linestyle='--', label='DRL Agent')
    plt.plot(bids_matrix[:, 1:], c='blue', linestyle='--')
    plt.plot(np.array(period_asks), c='red', linestyle='--')
    plt.scatter(range(nsteps), period_prices, c='g', label='Transaction Price')
    plt.xlabel('Time Step')
    plt.ylabel('Offer Value')
    plt.legend(loc='upper right')
    plt.savefig('b.png')
    plt.show()

class TradingEnv(gym.Env):
    """Gym-style market environment in which ``db.buyers[0]`` is the learning agent.

    The agent's action is a fraction in [0, 1] that is mapped to a bid between
    ``value * min_frac`` and ``value * max_frac``; all other buyers and sellers
    quote through their own ``bid``/``ask`` methods.

    NOTE(review): ``nperiods``, ``nsteps``, ``min_frac`` and ``max_frac`` are read
    from module-level globals; the ``nsteps`` and ``render_mode`` constructor
    arguments are accepted but never used.
    """

    def __init__(self, db, nsteps, render_mode=None):
        self.rnd = 0
        # Starts at -1 so the first reset() moves to period 0.
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.action_space = gym.spaces.Box(0, 1, (1,), dtype=np.float32)  # Continuous action space [0, 1]
        self.observation_space = gym.spaces.Box(-1, 200, (13,), dtype=np.float32)  # Continuous state space

    def reset(self):
        """Advance to the next period and return the initial 13-element observation."""
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        [buyer.next_token() for buyer in self.db.buyers]
        agent = self.db.buyers[0]
        # Fields that have no value before the first step are filled with -1
        # (or 0 for the timestep); only the agent's value and trade count are real.
        observation = np.array([0, -1, -1, -1, -1, -1, -1, -1, agent.value, -1, -1, -1, agent.num_tokens_traded],
                               dtype=np.float32)
        return observation  # Return continuous state

    def step(self, action):
        """Run one market step using the agent's bid fraction ``action[0]``.

        Returns:
            ``(observation, reward, terminated, truncated)`` — a 4-tuple with no
            info dict; ``truncated`` always equals ``terminated``.
        """
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action[0]
        # Convert action to bid
        # NOTE(review): next_token() was already called for every buyer above;
        # whether this second call for buyer 0 is a no-op depends on the helper.
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        # Linear interpolation between min_bid and max_bid, rounded to 2 decimals.
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # Simulate market
        bids = [buyer.bid(self.db) for buyer in self.db.buyers]
        # Replace buyer 0's own bid with the agent's bid.
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask,
                                                           current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask,
                     current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # Compute reward, new state
        # The agent is rewarded only when a sale happened and it held the current bid.
        reward = 0.0
        if sale == 1 and current_bid_idx == 0:
            reward = bprofit

        agent = self.db.buyers[0]
        observation = np.array([self.timestep + 1, current_ask, current_ask_idx, current_bid, current_bid_idx,
                                sale, price, buy, sell, agent.value, agent.step_profit,
                                agent.sale, agent.num_tokens_traded], dtype=np.float32)
        # NaN entries are replaced by the sentinel -1.
        idx = np.isnan(observation)
        observation[idx] = -1.0

        # Check termination
        self.timestep += 1
        # Compared against the module-level nsteps, not the constructor argument.
        terminated = self.timestep == nsteps
        truncated = terminated  # Truncated episodes are not used in DDPG

        return observation, reward, terminated, truncated

In [None]:
import numpy as np
from functions import *
from itertools import count
# Market configuration: one buyer (the DRL agent) facing six sellers.
buyer_strategies = ['Honest']
seller_strategies = ['Honest', 'Honest', 'Honest','Honest', 'Honest', 'Honest']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype = 10, 10, 8, 50, '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
# Bid range as fractions of the agent's token value (read as globals by TradingEnv.step).
min_frac = 0.01
max_frac = 1.0

# Use the custom environment. Pass the step count, not the previous `env`
# object, as the second argument.
env = TradingEnv(db, nsteps)
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Hyper parameters, networks and optimizers (module-level names read as globals
# by Buffer.update, policy, get_actor and get_critic).
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 100
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# training loop: each episode is one market period.
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Add a batch axis before feeding the state to the actor.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        # TradingEnv.step returns (observation, reward, terminated, truncated).
        state, reward, done, truncated = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        # One gradient step and one soft target update per environment step.
        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Running mean over (at most) the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print(f"Episode * {ep} * Avg Reward ==> {avg_reward}")
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

In [None]:
# Column sums of the first and last 50 logged steps grouped by current_bid_idx,
# followed by the plots for periods 1 and 99 of round 0.
db.step_data.head(50).groupby('current_bid_idx').sum()
db.step_data.tail(50).groupby('current_bid_idx').sum()
graph_period(env.db, 0, 1)
graph_period(env.db, 0, 99)

In [None]:
# Column sums of the last 50 logged steps, grouped by current_bid_idx.
db.step_data.tail(50).groupby('current_bid_idx').sum()

In [None]:
# Plot round 0, period 1.
graph_period(env.db, 0, 1)

In [None]:
# Plot round 0, period 99.
graph_period(env.db, 0, 99)

### Single DDPG (Monopoly)

In [None]:
import numpy as np
from functions import *
from itertools import count
# Market configuration: four buyers (buyer 0 is the DRL agent) facing one seller.
buyer_strategies = ['Honest','Random', 'Random', 'Random']
seller_strategies = ['Honest']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype = 10, 10, 8, 50, '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
# Bid range as fractions of the agent's token value (read as globals by TradingEnv.step).
min_frac = 0.01
max_frac = 1.0

# Use the custom environment. Pass the step count, not the previous `env`
# object, as the second argument.
env = TradingEnv(db, nsteps)
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Hyper parameters, networks and optimizers (module-level names read as globals
# by Buffer.update, policy, get_actor and get_critic).
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 100
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# training loop: each episode is one market period.
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Add a batch axis before feeding the state to the actor.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        # TradingEnv.step returns (observation, reward, terminated, truncated).
        state, reward, done, truncated = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        # One gradient step and one soft target update per environment step.
        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Running mean over (at most) the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print(f"Episode * {ep} * Avg Reward ==> {avg_reward}")
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

In [None]:
# Column sums of the first 500 logged steps, grouped by current_bid_idx.
db.step_data.head(500).groupby('current_bid_idx').sum()

In [None]:
# Column sums of the last 500 logged steps, grouped by current_bid_idx.
db.step_data.tail(500).groupby('current_bid_idx').sum()

In [None]:
# Show the last 50 logged steps.
db.step_data.tail(50)

In [None]:
# Plot round 0, period 1.
graph_period(env.db, 0, 1)

In [None]:
# Plot round 0, period 99.
graph_period(env.db, 0, 99)

### Single DDPG

In [None]:
import numpy as np
from functions import *
from itertools import count
# Market configuration: four buyers (buyer 0 is the DRL agent) facing four sellers.
buyer_strategies = ['Honest','Random', 'Random', 'Random']
seller_strategies = ['Random', 'Random', 'Random', 'Random']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype = 10, 10, 8, 50, '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
# Bid range as fractions of the agent's token value (read as globals by TradingEnv.step).
min_frac = 0.01
max_frac = 1.0

# Use the custom environment. Pass the step count, not the previous `env`
# object, as the second argument.
env = TradingEnv(db, nsteps)
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Hyper parameters, networks and optimizers (module-level names read as globals
# by Buffer.update, policy, get_actor and get_critic).
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 100
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# training loop: each episode is one market period.
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Add a batch axis before feeding the state to the actor.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        # TradingEnv.step returns (observation, reward, terminated, truncated).
        state, reward, done, truncated = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        # One gradient step and one soft target update per environment step.
        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Running mean over (at most) the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print(f"Episode * {ep} * Avg Reward ==> {avg_reward}")
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

In [None]:
# Column sums of the first 250 logged steps, grouped by current_bid_idx.
db.step_data.head(250).groupby('current_bid_idx').sum()

In [None]:
# Column sums of the last 250 logged steps, grouped by current_bid_idx.
db.step_data.tail(250).groupby('current_bid_idx').sum()

In [None]:
# Show the logged data for round 0, period 99.
db.get_period(0, 99)

In [None]:
# Plot round 0, period 1.
graph_period(env.db, 0, 1)

In [None]:
# Plot round 0, period 110.
# NOTE(review): the training cell above runs 100 episodes on a freshly created
# env, and each reset() advances the period by one from -1, so a single run
# only reaches period 99 — confirm that period 110 exists.
graph_period(env.db, 0, 110)

### Multi DDPG

In [148]:
import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt

# Define the environment for two agents
class CustomTwoAgentEnv:
    """Toy two-agent environment: each agent's scalar state accumulates its clipped action.

    Each episode lasts ``episode_length`` steps. Each agent's reward is the
    negative absolute value of its own accumulated state.
    """

    def __init__(self):
        self.num_states = 1
        self.num_actions = 1
        self.upper_bound = 1.0
        self.lower_bound = 0.0
        self.episode_length = 3
        self.current_period = 0
        self.state = [0.0, 0.0]  # Two agents have separate states
        self.action_space = gym.spaces.Box(low=self.lower_bound, high=self.upper_bound, shape=(self.num_actions,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(self.num_states,), dtype=np.float32)

    def reset(self):
        """Start a new episode and return one shape-``(1,)`` observation per agent."""
        self.current_period = 0
        self.state = [0.0, 0.0]  # Reset states for both agents
        return [np.array([agent_state]) for agent_state in self.state]

    def step(self, actions):
        """Apply one action per agent and return ``(observations, rewards, done, info)``.

        Args:
            actions: Sequence with one entry per agent; each entry is a scalar
                or an array-like whose first element is that agent's action.

        Returns:
            A list of shape-``(1,)`` observations, a list of scalar rewards, the
            ``done`` flag and an empty info dict.

        Raises:
            ValueError: If called after the episode has ended.
        """
        if self.current_period >= self.episode_length:
            raise ValueError("Episode has ended. Please reset the environment.")

        for agent_idx in range(len(self.state)):
            # Reduce each action to a plain float first. Adding an array would
            # turn the state into an array, so step() would return
            # (1, 1)-shaped observations and array rewards, unlike reset().
            scalar_action = float(np.asarray(actions[agent_idx], dtype=float).reshape(-1)[0])
            self.state[agent_idx] += float(np.clip(scalar_action, self.lower_bound, self.upper_bound))

        self.current_period += 1
        rewards = [-np.abs(agent_state) for agent_state in self.state]  # Separate rewards for each agent
        done = self.current_period >= self.episode_length
        observations = [np.array([agent_state]) for agent_state in self.state]
        return observations, rewards, done, {}

# Use the custom two-agent environment
env = CustomTwoAgentEnv()
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Separate online and target actor/critic networks for each of the two agents.
# NOTE(review): unlike the single-agent cells, the target networks are not
# initialised from the online weights and no optimizers are created here.

agent1_actor_model = get_actor()
agent1_critic_model = get_critic()
agent1_target_actor = get_actor()
agent1_target_critic = get_critic()
agent2_actor_model = get_actor()
agent2_critic_model = get_critic()
agent2_target_actor = get_actor()
agent2_target_critic = get_critic()

# Shared hyperparameters and a single replay buffer.

total_episodes = 10
gamma = 0.99
tau = 0.005
buffer = Buffer(50000, 64)
ep_reward_list, avg_reward_list = [], []

# Training loop for two agents independently
# NOTE(review): this cell does not run as written:
#   * agent1_policy / agent2_policy are not defined anywhere in this file
#     (the recorded cell output is "NameError: name 'agent1_policy' is not defined");
#   * Buffer.learn() takes no arguments, but is called with four models below;
#   * Buffer.record() stores into per-transition arrays sized from num_states /
#     num_actions, while lists holding both agents' values are passed here.
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = [0.0, 0.0]  # Separate rewards for each agent

    while True:
        # Add a batch axis to each agent's state.
        tf_prev_state = [tf.expand_dims(tf.convert_to_tensor(prev_state[0]), 0), tf.expand_dims(tf.convert_to_tensor(prev_state[1]), 0)]
        agent1_action = agent1_policy(tf_prev_state[0], ou_noise)
        agent2_action = agent2_policy(tf_prev_state[1], ou_noise)
        state, rewards, done, info = env.step([agent1_action, agent2_action])
        buffer.record((prev_state, [agent1_action, agent2_action], rewards, state))
        episodic_reward[0] += rewards[0]
        episodic_reward[1] += rewards[1]
        buffer.learn(agent1_critic_model, agent1_target_critic, agent2_critic_model, agent2_target_critic)
        buffer.learn(agent1_actor_model, agent1_target_actor, agent2_actor_model, agent2_target_actor)
        # Soft-update every target network towards its online counterpart.
        update_target(agent1_target_actor.variables, agent1_actor_model.variables, tau)
        update_target(agent1_target_critic.variables, agent1_critic_model.variables, tau)
        update_target(agent2_target_actor.variables, agent2_actor_model.variables, tau)
        update_target(agent2_target_critic.variables, agent2_critic_model.variables, tau)
        if done:
            break
        prev_state = state

    ep_reward_list.append(episodic_reward)
    # Mean of the two agents' rewards for this episode (not a running average).
    avg_reward = np.mean(episodic_reward)
    print(f"Episode {ep} - Avg Reward: Agent 1: {episodic_reward[0]}, Agent 2: {episodic_reward[1]}")
    avg_reward_list.append(avg_reward)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()


NameError: name 'agent1_policy' is not defined

In [146]:
class MultiAgentTradingEnv(gym.Env):
    """Gym-style market environment that treats every buyer in ``db`` as an agent.

    ``reset`` and ``step`` return one 13-element observation per buyer, and
    ``step`` returns one reward per buyer.

    NOTE(review): ``nperiods``, ``nsteps``, ``min_frac`` and ``max_frac`` are read
    from module-level globals; the ``nsteps`` and ``render_mode`` constructor
    arguments are accepted but never used.
    """

    def __init__(self, db, nsteps, render_mode=None):
        self.rnd = 0
        # Starts at -1 so the first reset() moves to period 0.
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.nagents = len(self.db.buyers)  # Number of agents
        self.action_space = gym.spaces.Box(0, 1, (self.nagents,), dtype=np.float32)  # Continuous action space [0, 1] for each agent
        self.observation_space = gym.spaces.Box(-1, 200, (self.nagents, 13), dtype=np.float32)  # Continuous state space for each agent

    def reset(self):
        """Advance to the next period and return the initial observation of every agent."""
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        [buyer.next_token() for buyer in self.db.buyers]
        
        observations = []
        for agent_idx in range(self.nagents):
            agent = self.db.buyers[agent_idx]
            # Fields with no value before the first step are filled with -1
            # (or 0 for the timestep).
            observation = np.array([0, -1, -1, -1, -1, -1, -1, -1, agent.value, -1, -1, -1, agent.num_tokens_traded],
                                   dtype=np.float32)
            observations.append(observation)
        return observations  # Return continuous states for each agent

    def step(self, actions):
        """Run one market step from the per-agent bid fractions in ``actions``.

        Returns:
            ``(observations, rewards, terminated, truncated)`` — a 4-tuple with
            no info dict; ``truncated`` always equals ``terminated``.
        """
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        
        rewards = [0.0] * self.nagents
        bids = [0.0] * self.nagents
        
        for agent_idx in range(self.nagents):
            bid_frac = actions[agent_idx]
            # NOTE(review): next_token() was already called for every buyer above.
            self.db.buyers[agent_idx].next_token()
            min_bid = self.db.buyers[agent_idx].value * min_frac
            max_bid = self.db.buyers[agent_idx].value * max_frac
            # Linear interpolation between min_bid and max_bid, rounded to 2 decimals.
            bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)
            bids[agent_idx] = bid

        # Simulate market
        # NOTE(review): only agent 0's computed bid is used; every other buyer
        # quotes through buyer.bid(), so bids[1:] computed above are discarded.
        bids_all = [bids[agent_idx] if agent_idx == 0 else buyer.bid(self.db) for agent_idx, buyer in enumerate(self.db.buyers)]
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        
        # NOTE(review): current_bid_ask is called once per agent with the same
        # arguments, so each tuple holds nagents results of the same call.
        current_asks, current_asks_idx, current_bids, current_bids_idx = zip(*[current_bid_ask(bids_all, asks) for _ in range(self.nagents)])
        
        for agent_idx in range(self.nagents):
            # NOTE(review): buy_sell and add_step therefore run nagents times per
            # step; confirm whether the trade is meant to execute that often.
            sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bids[agent_idx], current_bids_idx[agent_idx], current_asks[agent_idx],
                                                               current_asks_idx[agent_idx])
            step_data = [self.rnd, self.period, self.timestep, bids_all, asks, current_bids[agent_idx], current_bids_idx[agent_idx], current_asks[agent_idx],
                         current_asks_idx[agent_idx], buy, sell, price, sale, bprofit, sprofit]
            self.db.add_step(step_data)
            
            # Compute reward for the agent
            # NOTE(review): the winning index is compared with 0, not agent_idx.
            if sale == 1 and current_bids_idx[agent_idx] == 0:
                rewards[agent_idx] = bprofit

        observations = []
        for agent_idx in range(self.nagents):
            agent = self.db.buyers[agent_idx]
            # sale, price, buy and sell hold the values left by the last
            # iteration of the loop above.
            observation = np.array([self.timestep + 1, current_asks[agent_idx], current_asks_idx[agent_idx], current_bids[agent_idx], current_bids_idx[agent_idx],
                                    sale, price, buy, sell, agent.value, agent.step_profit, agent.sale, agent.num_tokens_traded],
                                   dtype=np.float32)
            # NaN entries are replaced by the sentinel -1.
            idx = np.isnan(observation)
            observation[idx] = -1.0
            observations.append(observation)

        # Check termination
        self.timestep += 1
        # Compared against the module-level nsteps, not the constructor argument.
        terminated = self.timestep == nsteps
        truncated = terminated  # Truncated episodes are not used in DDPG

        return observations, rewards, terminated, truncated


In [147]:
import numpy as np
import gym
from functions import *
from itertools import count
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Multi-agent experiment setup
# ---------------------------------------------------------------------------
# Number of agents; every per-agent list built below has this length.
num_agents = 2

# Strategy label for each buyer slot and each seller slot.
buyer_strategies = ['Honest'] * 2
seller_strategies = ['Honest'] * 6
nbuyers = len(buyer_strategies)
nsellers = len(seller_strategies)

# Game dimensions, plus the game-type code that gametype_to_ran expands
# into the four R values stored in the metadata list.
nrounds = 10
nperiods = 10
ntokens = 8
nsteps = 50
gametype = '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]

# One database per agent. `db` deliberately stays bound to the last database
# after the loop, exactly as before, in case later cells refer to it.
dbs = []
for _ in range(num_agents):
    db = Database(game_metadata, buyer_strategies, seller_strategies)
    dbs.append(db)

# One trading environment wrapped around each database.
envs = [TradingEnv(db, nsteps) for db in dbs]

# Scalar hyperparameters.
std_dev = 0.2           # std_deviation handed to every OUActionNoise
critic_lr = 0.002       # Adam learning rate for the critics
actor_lr = 0.001        # Adam learning rate for the actors
total_episodes = 100    # number of iterations of the outer training loop
gamma = 0.99            # multiplies the target critic value in Buffer.update
tau = 0.005             # passed to update_target for the target networks

# Per-agent exploration noise and networks. The creation order
# (actors, critics, target actors, target critics) is kept unchanged.
ou_noises = [
    OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
    for _ in range(num_agents)
]
actor_models = [get_actor() for _ in range(num_agents)]
critic_models = [get_critic() for _ in range(num_agents)]
target_actors = [get_actor() for _ in range(num_agents)]
target_critics = [get_critic() for _ in range(num_agents)]

# Start every target network from its online network's weights.
for i in range(num_agents):
    target_actors[i].set_weights(actor_models[i].get_weights())
    target_critics[i].set_weights(critic_models[i].get_weights())

# Per-agent optimizers, replay buffers and reward histories.
critic_optimizers = [tf.keras.optimizers.Adam(critic_lr) for _ in range(num_agents)]
actor_optimizers = [tf.keras.optimizers.Adam(actor_lr) for _ in range(num_agents)]
buffers = [Buffer(50000, 64) for _ in range(num_agents)]
ep_reward_lists = [[] for _ in range(num_agents)]
avg_reward_lists = [[] for _ in range(num_agents)]

# Training loop for each agent
# Runs `total_episodes` episodes. Each pass of the inner `while` loop:
#   1. asks `policy` for one action per agent,
#   2. steps EVERY environment with the full list of actions,
#   3. records one transition per agent in that agent's buffer,
#   4. calls `update_target` on that agent's target actor and critic.
# NOTE(review): no optimizer step or Buffer.update/learn call is visible in
# this loop, so the online networks do not appear to be trained here —
# confirm whether a learning call is missing.
for ep in range(total_episodes):
    # One reset result per environment.
    prev_states = [env.reset() for env in envs]
    episodic_rewards = [0.0] * num_agents

    while True:
        actions = []

        for agent_idx in range(num_agents):
            # Builds a batched tensor for every i in range(num_agents), but only
            # entry [agent_idx] is used on the next line.
            # NOTE(review): presumably prev_states[agent_idx] is a per-agent list
            # of observations — confirm against TradingEnv.reset.
            tf_prev_state = [tf.expand_dims(tf.convert_to_tensor(prev_states[agent_idx][i]), 0) for i in range(num_agents)]
            action = policy(tf_prev_state[agent_idx], ou_noises[agent_idx])
            # Only the first element of the policy output is kept.
            actions.append(action[0])

        # zip(*) transposes the per-env 4-tuples into four per-env tuples.
        # NOTE(review): the step method visible earlier in this file returns
        # (observations, rewards, terminated, truncated); if that is
        # TradingEnv.step, `infos` actually holds the truncated flags.
        states, rewards, dones, infos = zip(*[env.step(actions) for env in envs])

        for agent_idx in range(num_agents):
            # NOTE(review): states[agent_idx] / rewards[agent_idx] are the complete
            # outputs of environment `agent_idx`, not values selected for a single
            # agent inside that environment. If step returns per-agent collections,
            # these are collections rather than scalars, which may be related to
            # the ValueError reported for this cell — verify.
            buffers[agent_idx].record((prev_states[agent_idx], actions[agent_idx], rewards[agent_idx], states[agent_idx]))
            episodic_rewards[agent_idx] += rewards[agent_idx]

        for agent_idx in range(num_agents):
            # Move each target network toward its online network using tau.
            update_target(target_actors[agent_idx].variables, actor_models[agent_idx].variables, tau)
            update_target(target_critics[agent_idx].variables, critic_models[agent_idx].variables, tau)

        # The episode ends only once every environment reports done.
        if all(dones):
            break

        prev_states = states

    for agent_idx in range(num_agents):
        ep_reward_lists[agent_idx].append(episodic_rewards[agent_idx])
        # Running mean over (at most) the last 40 episodes.
        avg_reward = np.mean(ep_reward_lists[agent_idx][-40:])
        print(f"Agent {agent_idx} - Episode {ep} - Avg Reward ==> {avg_reward}")
        avg_reward_lists[agent_idx].append(avg_reward)

# Draw one running-average reward curve per agent on the current axes.
reward_axes = plt.gca()
for agent_idx in range(num_agents):
    reward_axes.plot(avg_reward_lists[agent_idx], label=f'Agent {agent_idx}')

# Axis titles and legend, then display the figure.
reward_axes.set_xlabel("Episode")
reward_axes.set_ylabel("Avg. Episodic Reward")
reward_axes.legend()
plt.show()


ValueError: can only convert an array of size 1 to a Python scalar

In [None]:
def graph_period(db, rnd, period):
    """Plot the bids, asks and prices of one period via ``graph``.

    Args:
        db: Object providing ``get_period(rnd, period)`` (a table with
            ``bids``, ``asks`` and ``price`` columns), ``get_round(rnd)``
            (a table whose first row holds the round-level data) and the
            attributes ``ntokens``, ``nbuyers``, ``nsellers``, ``nsteps``.
        rnd: Round identifier passed to ``get_period`` / ``get_round``.
        period: Period identifier passed to ``get_period``.
    """
    # Query the period once and reuse it instead of re-querying per column.
    period_data = db.get_period(rnd, period)
    period_bids = list(period_data.bids)
    period_asks = list(period_data.asks)
    period_prices = list(period_data.price)
    # Running count of steps whose price is positive (NaN compares as False).
    period_sales = np.cumsum(np.where(period_data.price > 0, 1, 0))

    # The first row of the round table carries the schedules and the
    # equilibrium; its first field is not used here.
    round_row = db.get_round(rnd).iloc[0].tolist()
    [_, demand_schedule, supply_schedule, P_grid, redemption_values, token_costs, p_eqbm, q_eqbm] = round_row

    graph(demand_schedule, supply_schedule, P_grid, p_eqbm, q_eqbm,
          period_bids, period_asks, period_prices, period_sales,
          redemption_values, token_costs,
          db.ntokens, db.nbuyers, db.nsellers, db.nsteps)
    
def compute_demand_supply(redemption_values,token_costs,nbuyers,ntokens,granularity=100):
    """Build step demand and supply schedules on an evenly spaced price grid.

    The grid runs from the smallest token cost to the largest redemption
    value with ``granularity`` points. For every grid price ``p``:

    * demand is the number of redemption values that are >= ``p``;
    * supply is the number of token costs that are <= ``p``.

    ``nbuyers`` and ``ntokens`` are accepted but not used.

    Returns:
        ``(demand_schedule, supply_schedule, P_grid, min_price, max_price)``
        where both schedules are integer arrays of length ``granularity``.
    """
    values = np.asarray(redemption_values)
    costs = np.asarray(token_costs)

    max_price = np.max(values)
    min_price = np.min(costs)
    P_grid = np.linspace(min_price, max_price, granularity)

    # Compare every grid price (rows) with every value/cost (columns) in one
    # broadcast, then count the matches along each row.
    price_column = P_grid[:, np.newaxis]
    demand_schedule = (price_column <= values.reshape(1, -1)).sum(axis=1).astype('int')
    supply_schedule = (price_column >= costs.reshape(1, -1)).sum(axis=1).astype('int')

    return demand_schedule, supply_schedule, P_grid, min_price, max_price

def equilibrium(demand_schedule,supply_schedule,P_grid):
    """Locate the grid prices at which demand equals supply.

    Args:
        demand_schedule: Quantity demanded at each price in ``P_grid``.
        supply_schedule: Quantity supplied at each price in ``P_grid``.
        P_grid: Sequence of candidate prices.

    Returns:
        ``(p_eqbm, q_eqbm)``: the NaN-ignoring mean of all grid prices where
        the two schedules are equal, and the quantity at the last such
        price. Both are ``nan`` when the schedules never coincide on the
        grid.
    """
    # Collect (price, quantity) for every grid point where the schedules meet.
    crossings = [
        (p, demand_schedule[i])
        for i, p in enumerate(P_grid)
        if demand_schedule[i] == supply_schedule[i]
    ]
    if not crossings:
        # Return NaN directly; np.nanmean([]) would emit a
        # "Mean of empty slice" RuntimeWarning to reach the same value.
        return np.nan, np.nan

    crossing_prices = [p for p, _ in crossings]
    q_eqbm = crossings[-1][1]
    return np.nanmean(crossing_prices), q_eqbm

import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

def graph(demand_schedule, supply_schedule, P_grid, p_eqbm, q_eqbm,
          period_bids, period_asks, period_prices, period_sales,
          redemption_values, token_costs, ntokens, nbuyers, nsellers, nsteps):
    """Draw, save and show two figures for one trading period.

    Figure 1 (saved as ``a.png``) overlays the demand and supply schedules,
    the transaction-price line and, for every step in which a sale happened,
    the bids and asks submitted at that step. Figure 2 (saved as ``b.png``)
    shows every bid and ask series against the time step together with the
    transaction prices.

    Args:
        demand_schedule, supply_schedule: Quantities plotted against ``P_grid``.
        P_grid: Prices used as the y-values of the two schedules.
        q_eqbm: Used only to set the x-limit of figure 1 (``q_eqbm + 2``).
        period_bids: One sequence of bids per step; index 0 is drawn as the
            'DRL Agent', the remaining entries as the other bids.
        period_asks: One sequence of asks per step.
        period_prices: One transaction price per step.
        period_sales: Running sale count per step; a step counts as a sale
            when the count rises by exactly one.
        nsteps: Number of steps to scan and plot.
        p_eqbm, redemption_values, token_costs, ntokens, nbuyers, nsellers:
            Accepted for interface compatibility; not used.
    """
    bids_by_step = np.array(period_bids)
    # The y-range of figure 1 is taken from the bids only.
    maxprc, minprc = bids_by_step.max(), bids_by_step.min()

    # ---- Figure 1: schedules plus the offers made at each sale ----
    plt.plot(demand_schedule, P_grid, color='black', linestyle='--')
    plt.plot(supply_schedule, P_grid, color='black', linestyle='--')
    plt.plot(period_prices, color='green', linestyle='--', label='Transaction Price')

    legend_pending = True  # attach legend labels to the first sale only
    for i in range(nsteps):
        sales_before = period_sales[i - 1] if i > 0 else 0
        if period_sales[i] - sales_before != 1:
            continue  # no sale at this step

        # Bids (minus the agent's own at index 0) are blue and asks are red,
        # matching the colours used in figure 2.
        other_bids = period_bids[i][1:]
        step_asks = period_asks[i]
        plt.scatter([i] * len(other_bids), other_bids, s=10, marker='o', c='blue',
                    label='Bids' if legend_pending else None)
        plt.scatter([i] * len(step_asks), step_asks, s=10, marker='o', c='red',
                    label='Asks' if legend_pending else None)
        plt.scatter(i, period_bids[i][0], s=30, marker='x', c='orange',
                    label='DRL Agent' if legend_pending else None)
        legend_pending = False

    plt.legend(loc='upper right')
    plt.xlabel('Quantity Sold')
    plt.ylabel('Offer Value')
    try:
        plt.xlim(0, q_eqbm + 2)
    except (TypeError, ValueError):
        # Best effort: a missing (None) or NaN equilibrium quantity cannot be
        # used as an axis limit, so keep the automatic x-range instead.
        pass
    plt.ylim(minprc, maxprc)
    plt.savefig('a.png')
    plt.show()

    # ---- Figure 2: every offer series over time ----
    plt.plot(bids_by_step[:, 0], c='orange', linestyle='--', label='DRL Agent')
    plt.plot(bids_by_step[:, 1:], c='blue', linestyle='--')
    plt.plot(np.array(period_asks), c='red', linestyle='--')
    plt.scatter(range(nsteps), period_prices, c='g', label='Transaction Price')
    plt.xlabel('Time Step')
    plt.ylabel('Offer Value')
    plt.legend(loc='upper right')
    plt.savefig('b.png')
    plt.show()