In [1]:
import pandas as pd

data = {
    "Action Space Type": ["Both", "Both", "Both", "Both", "Discrete", "Continuous", "Both", "Continuous", "Both", "Continuous", "Discrete"],
    "Key Innovation": [
        "Direct policy optimization with policy gradient. Introduced the idea of optimizing the policy directly using gradient ascent. Enabled learning in high-dimensional action spaces.",
        "Trust region optimization for stable learning. Introduced trust region methods to stabilize policy updates and prevent large policy changes that could lead to divergence.",
        "Clipped surrogate objective for stable learning. Addressed issues with trust region methods by using a clipped surrogate objective, ensuring monotonic improvement.",
        "Combines actor and critic for efficient training. Utilizes both value and policy networks to improve sample efficiency and convergence speed.",
        "Deep Q-network approximation of the Q-function. Introduced deep neural networks to approximate the Q-function, making it possible to handle high-dimensional state spaces.",
        "Continuous action space extension of DQN. Adapted DQN for continuous action spaces using actor-critic architecture and deterministic policy gradients.",
        "Asynchronous training of multiple agents. Parallelizes training by having multiple agents interact with their environments asynchronously, improving data efficiency.",
        "Entropy regularization for improved exploration. Encourages exploration by adding an entropy term to the objective function, balancing exploration and exploitation.",
        "Twin critics and delayed policy updates. Introduced twin Q-networks to improve stability and utilized delayed policy updates for better performance.",
        "Distributional value estimation with deterministic policy gradients. Estimated value distributions instead of single values and combined them with deterministic policy gradients for improved learning.",
        "Value iteration with Q-value updates. Introduced the concept of Q-values and iteratively updates Q-values using the Bellman equation for value estimation."
    ],
    "On - Off Policy": ["On", "On", "On", "On", "Off", "Off", "On", "Off", "On", "Off", "Off"],
    "Value / Policy Based": ["Policy", "Policy", "Policy", "Both", "Value", "Both", "Both", "Both", "Both", "Both", "Value"],
    "Year of Publication": [None, None, None, None, "2015", "2016", "2016", "2018", "2018", "2018", "1957"]
}

df = pd.DataFrame(data, index=["REINFORCE", "TRPO", "PPO", "A2C", "DQN", "DDPG", "A3C", "SAC", "TD3", "D4PG", "Q-Learning"])

df.head(11)

Unnamed: 0,Action Space Type,Key Innovation,On - Off Policy,Value / Policy Based,Year of Publication
REINFORCE,Both,Direct policy optimization with policy gradien...,On,Policy,
TRPO,Both,Trust region optimization for stable learning....,On,Policy,
PPO,Both,Clipped surrogate objective for stable learnin...,On,Policy,
A2C,Both,Combines actor and critic for efficient traini...,On,Both,
DQN,Discrete,Deep Q-network approximation of the Q-function...,Off,Value,2015.0
DDPG,Continuous,Continuous action space extension of DQN. Adap...,Off,Both,2016.0
A3C,Both,Asynchronous training of multiple agents. Para...,On,Both,2016.0
SAC,Continuous,Entropy regularization for improved exploratio...,Off,Both,2018.0
TD3,Both,Twin critics and delayed policy updates. Intro...,On,Both,2018.0
D4PG,Continuous,Distributional value estimation with determini...,Off,Both,2018.0


In [30]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *
from itertools import count
buyer_strategies = ['Honest', 'Random', 'Random', 'Random']
seller_strategies = ['Honest', 'Honest', 'Honest', 'Honest', 'Honest','Honest', 'Honest', 'Honest', 'Honest', 'Honest']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 10, 10, 8, 30, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0

In [31]:
class TradingEnv(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float)
        self.observation_space = spaces.Box(0,nsteps,(1,),dtype=np.float)

    def reset(self,seed=None):
        #self.db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        observation = np.array([0])
        return observation, {}

    def step(self, action, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.bid(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = 0.0
        if sale == 1 and current_bid_idx == 0:
            reward = bprofit
        observation = np.array([self.timestep + 1])

        # check termination
        self.timestep += 1
        if self.timestep == nsteps:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        infos = {"TimeLimit.truncated":True}
        truncated = False
        return observation, reward, terminated, truncated, infos

In [32]:
from stable_baselines3.common.env_checker import check_env
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
check_env(env)

In [33]:
rnd = 0
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
observation, info = env.reset()
for period in count():
    for timestep in count(): 
        action = env.action_space.sample()
        observation, reward, done, info, _ = env.step(action)
        #print(f"Rnd: {rnd}, Period: {period}, New State: {observation}, Action:{np.round(action,1)}, Reward: {np.round(reward,1)}, Period End: {done}")
        if done:
            # If the episode is done, reset the environment
            #print('done')
            observation, info = env.reset()
            break
    if period == nperiods:
        period = 0
        break
env.close()

In [34]:
db.step_data.head(1000).groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,166,435,1814.99,1157.7,157,33,29,1430.115,33,695.085,354.215
1,0,549,1857,4670.1,3891.7,565,66,51,2799.55,66,1016.35,594.35
2,0,496,1480,5376.2,3386.1,573,56,55,2648.8,56,1758.4,1149.5
3,0,439,1013,4896.8,2661.7,368,66,66,3065.6,66,1977.9,1223.8


In [35]:
# Define your environment and parameters (replace with your actual environment setup)
rnd = 0
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.5
eval_steps = 1000
training_step = 50000

In [36]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C, DQN, SAC
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.evaluation import evaluate_policy

### Continous Action Space

In [37]:
from stable_baselines3 import SAC, DDPG, TD3, A2C, PPO
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[128, 128], qf=[128, 128]))
model = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1,)
model.learn(50000, progress_bar = True)

LiveError: Only one live display may be active at once

In [None]:
db.step_data.head(16).groupby('current_bid_idx').sum()

In [38]:
db.step_data.tail(16).groupby('current_bid_idx').sum()

In [25]:
db.round_data.redemption_values.item()

array([[ 59.6,  48.9,  35.3,  18.7],
       [ 79. ,  66.7,  46.6,  34. ],
       [ 92.3,  60.3,  36.8,  24.1],
       [106.1,  79.4,  63.3,  35.9]])

In [27]:
db.step_data.head(16)

Unnamed: 0,rnd,period,step,bids,asks,current_bid,current_bid_idx,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
0,0,0,0,"[77.1, 66.7, 68.8, 60.4]","[85.7, 47.5, 34.1, 49.4, 40.3, 36.4, 59.8, 50....",77.1,0,27.5,9,True,True,52.3,1,7.3,24.8
1,0,0,1,"[41.28, 63.9, 76.3, 56.4]","[85.7, 47.5, 34.1, 49.4, 40.3, 36.4, 59.8, 50....",76.3,2,34.1,2,True,True,55.2,1,37.1,21.1
2,0,0,2,"[60.79, 70.7, 57.7, 55.4]","[85.7, 47.5, 69.5, 49.4, 40.3, 36.4, 59.8, 50....",70.7,1,36.4,5,True,True,53.55,1,25.45,17.15
3,0,0,3,"[37.68, 59.5, 59.0, 65.7]","[85.7, 47.5, 69.5, 49.4, 40.3, 83.3, 59.8, 50....",65.7,3,40.3,4,True,True,53.0,1,53.1,12.7
4,0,0,4,"[45.1, 35.3, 43.1, 64.3]","[85.7, 47.5, 69.5, 49.4, 80.9, 83.3, 59.8, 50....",64.3,3,45.8,9,True,True,55.05,1,24.35,9.25
5,0,0,5,"[69.72, 64.2, 44.1, 33.8]","[85.7, 47.5, 69.5, 49.4, 80.9, 83.3, 59.8, 50....",69.72,0,47.5,1,True,True,58.61,1,-9.71,11.11
6,0,0,6,"[12.93, 36.2, 38.5, 57.9]","[85.7, 79.5, 69.5, 49.4, 80.9, 83.3, 59.8, 50....",57.9,3,49.4,3,True,True,53.65,1,9.65,4.25
7,0,0,7,"[28.59, 57.1, 57.8, 31.2]","[85.7, 79.5, 69.5, 61.1, 80.9, 83.3, 59.8, 50....",57.8,2,50.0,7,True,True,53.9,1,6.4,3.9
8,0,0,8,"[19.5, 66.5, 32.0, 30.0]","[85.7, 79.5, 69.5, 61.1, 80.9, 83.3, 59.8, 62....",66.5,1,59.2,8,True,True,62.85,1,3.85,3.65
9,0,0,9,"[35.85, 39.7, 25.0, 25.3]","[85.7, 79.5, 69.5, 61.1, 80.9, 83.3, 59.8, 62....",39.7,1,59.8,6,False,False,,0,0.0,0.0


### Discrete Action Spaces

In [75]:
from stable_baselines3 import DQN
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
env.action_space = spaces.Discrete(31)
#policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = DQN("MlpPolicy", env, verbose=1,)
model.learn(50000, progress_bar = False)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.13e+03 |
|    exploration_rate | 0.988     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 169       |
|    time_elapsed     | 0         |
|    total_timesteps  | 64        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.58e+03 |
|    exploration_rate | 0.976     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 182       |
|    time_elapsed     | 0         |
|    total_timesteps  | 128       |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    e

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.63e+03 |
|    exploration_rate | 0.732     |
| time/               |           |
|    episodes         | 88        |
|    fps              | 171       |
|    time_elapsed     | 8         |
|    total_timesteps  | 1408      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.66e+03 |
|    exploration_rate | 0.72      |
| time/               |           |
|    episodes         | 92        |
|    fps              | 171       |
|    time_elapsed     | 8         |
|    total_timesteps  | 1472      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.68e+03 |
|    exploration_rate | 0.708     |
| time/               |     

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.77e+03 |
|    exploration_rate | 0.477     |
| time/               |           |
|    episodes         | 172       |
|    fps              | 165       |
|    time_elapsed     | 16        |
|    total_timesteps  | 2752      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.77e+03 |
|    exploration_rate | 0.465     |
| time/               |           |
|    episodes         | 176       |
|    fps              | 165       |
|    time_elapsed     | 17        |
|    total_timesteps  | 2816      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.78e+03 |
|    exploration_rate | 0.453     |
| time/               |     

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.79e+03 |
|    exploration_rate | 0.222     |
| time/               |           |
|    episodes         | 256       |
|    fps              | 166       |
|    time_elapsed     | 24        |
|    total_timesteps  | 4096      |
-----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 16       |
|    ep_rew_mean      | -2.8e+03 |
|    exploration_rate | 0.21     |
| time/               |          |
|    episodes         | 260      |
|    fps              | 165      |
|    time_elapsed     | 25       |
|    total_timesteps  | 4160     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 16       |
|    ep_rew_mean      | -2.8e+03 |
|    exploration_rate | 0.197    |
| time/               |          |
|    epis

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.77e+03 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 340       |
|    fps              | 163       |
|    time_elapsed     | 33        |
|    total_timesteps  | 5440      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.77e+03 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 344       |
|    fps              | 162       |
|    time_elapsed     | 33        |
|    total_timesteps  | 5504      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 16        |
|    ep_rew_mean      | -2.76e+03 |
|    exploration_rate | 0.05      |
| time/               |     

KeyboardInterrupt: 

In [39]:
db.step_data.head(100).groupby('current_bid_idx').sum()

In [40]:
db.step_data.tail(100).groupby('current_bid_idx').sum()

## ON POLICY

### DDPG - Deterministic Deep Policy Gradient

In [76]:
from stable_baselines3 import DDPG
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = DDPG("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1,)
model.learn(50000, progress_bar = True)

Output()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 16       |
|    ep_rew_mean     | 153      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 143      |
|    time_elapsed    | 0        |
|    total_timesteps | 64       |
---------------------------------


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [52]:
db.step_data.head(100).groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,45,94,1208.78,447.0,43,18,18,827.89,18,727.91,380.89
1,0,77,222,1645.4,819.4,114,19,18,871.9,19,667.9,449.0
2,0,72,166,1544.9,696.9,96,26,25,1122.3,26,736.1,425.4
3,0,70,244,1157.2,819.2,92,19,18,761.45,19,307.15,194.35


In [53]:
db.step_data.tail(100).groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,10440,114,1423.45,516.5,61,18,18,876.75,18,679.05,468.55
1,0,11931,163,1321.0,644.5,92,18,16,786.9,18,650.7,358.7
2,0,13416,166,1654.5,720.9,73,27,27,1187.7,27,730.5,466.8
3,0,13901,299,1147.9,927.3,106,19,18,747.1,19,321.5,144.7


### PPO - Proximal Policy Optimization

In [78]:
from stable_baselines3 import PPO
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1)
model.learn(50000, progress_bar = False)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

### A2C - Advantage Actor-Critic

In [None]:
# Create A2C model
from stable_baselines3.ppo.policies import MlpPolicy
a2c_model = A2C(MlpPolicy, env, verbose=0)

# Train the A2C agent for 10000 steps
a2c_model.learn(total_timesteps=training_step, progress_bar = True)

# Evaluate the trained A2C agent
mean_reward_a2c, std_reward_a2c = evaluate_policy(a2c_model, env, n_eval_episodes=eval_steps)
print(f"A2C mean_reward: {mean_reward_a2c:.2f} +/- {std_reward_a2c:.2f}")