In [5]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *
from itertools import count
buyer_strategies = ['Honest', 'Random']
seller_strategies = ['Random', 'Random', 'Random', 'Random', 'Random', 'Random']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 1, 100, 4, 6, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0

In [9]:
class TradingEnv(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float)
        self.observation_space = spaces.Box(0,nsteps,(1,),dtype=np.float)

    def reset(self,seed=None):
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        observation = np.array([0])
        return observation, None

    def step(self, action, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.ask(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = np.array([0])
        if sale == 1 and current_bid_idx == 0:
            reward = np.array([bprofit])
        observation = np.array([self.timestep + 1])

        # check termination
        self.timestep += 1
        if self.timestep == nsteps - 1:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        return observation, reward, terminated, {}, {}

In [10]:
env = TradingEnv(db, nsteps)
rnd = 0
observation, info = env.reset()
for period in count():
    for timestep in count(): 
        action = env.action_space.sample()
        observation, reward, done, info, _ = env.step(action)
        print(f"Rnd: {rnd}, Period: {period}, New State: {observation}, Action:{np.round(action,1)}, Reward: {np.round(reward,1)}, Period End: {done}")
        if done:
            # If the episode is done, reset the environment
            print('done')
            observation, info = env.reset()
            timestep += 1
            break
    if period == nperiods:
        break
env.close()

Rnd: 0, Period: 0, New State: [1], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 0, New State: [2], Action:[0.4], Reward: [0], Period End: False
Rnd: 0, Period: 0, New State: [3], Action:[0.2], Reward: [0], Period End: False
Rnd: 0, Period: 0, New State: [4], Action:[0.2], Reward: [0], Period End: False
Rnd: 0, Period: 0, New State: [5], Action:[0.8], Reward: [41.7], Period End: True
done
Rnd: 0, Period: 1, New State: [1], Action:[0.1], Reward: [0], Period End: False
Rnd: 0, Period: 1, New State: [2], Action:[1.], Reward: [0], Period End: False
Rnd: 0, Period: 1, New State: [3], Action:[0.8], Reward: [0], Period End: False
Rnd: 0, Period: 1, New State: [4], Action:[0.8], Reward: [39.1], Period End: False
Rnd: 0, Period: 1, New State: [5], Action:[0.4], Reward: [53.9], Period End: True
done
Rnd: 0, Period: 2, New State: [1], Action:[0.1], Reward: [0], Period End: False
Rnd: 0, Period: 2, New State: [2], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 2, New S

Rnd: 0, Period: 28, New State: [1], Action:[0.5], Reward: [0], Period End: False
Rnd: 0, Period: 28, New State: [2], Action:[0.5], Reward: [0], Period End: False
Rnd: 0, Period: 28, New State: [3], Action:[0.3], Reward: [0], Period End: False
Rnd: 0, Period: 28, New State: [4], Action:[0.6], Reward: [49.6], Period End: False
Rnd: 0, Period: 28, New State: [5], Action:[0.], Reward: [0], Period End: True
done
Rnd: 0, Period: 29, New State: [1], Action:[0.], Reward: [0], Period End: False
Rnd: 0, Period: 29, New State: [2], Action:[0.4], Reward: [0], Period End: False
Rnd: 0, Period: 29, New State: [3], Action:[0.5], Reward: [0], Period End: False
Rnd: 0, Period: 29, New State: [4], Action:[0.1], Reward: [0], Period End: False
Rnd: 0, Period: 29, New State: [5], Action:[0.8], Reward: [36.], Period End: True
done
Rnd: 0, Period: 30, New State: [1], Action:[0.9], Reward: [0], Period End: False
Rnd: 0, Period: 30, New State: [2], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 3

Rnd: 0, Period: 54, New State: [1], Action:[0.9], Reward: [0], Period End: False
Rnd: 0, Period: 54, New State: [2], Action:[1.], Reward: [32.1], Period End: False
Rnd: 0, Period: 54, New State: [3], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 54, New State: [4], Action:[1.], Reward: [0], Period End: False
Rnd: 0, Period: 54, New State: [5], Action:[0.5], Reward: [0], Period End: True
done
Rnd: 0, Period: 55, New State: [1], Action:[0.5], Reward: [0], Period End: False
Rnd: 0, Period: 55, New State: [2], Action:[0.5], Reward: [0], Period End: False
Rnd: 0, Period: 55, New State: [3], Action:[0.9], Reward: [0], Period End: False
Rnd: 0, Period: 55, New State: [4], Action:[0.3], Reward: [0], Period End: False
Rnd: 0, Period: 55, New State: [5], Action:[0.4], Reward: [59.], Period End: True
done
Rnd: 0, Period: 56, New State: [1], Action:[0.7], Reward: [0], Period End: False
Rnd: 0, Period: 56, New State: [2], Action:[0.7], Reward: [0], Period End: False
Rnd: 0, Period: 5

Rnd: 0, Period: 78, New State: [3], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 78, New State: [4], Action:[0.6], Reward: [52.8], Period End: False
Rnd: 0, Period: 78, New State: [5], Action:[0.6], Reward: [46.2], Period End: True
done
Rnd: 0, Period: 79, New State: [1], Action:[0.8], Reward: [0], Period End: False
Rnd: 0, Period: 79, New State: [2], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 79, New State: [3], Action:[0.2], Reward: [0], Period End: False
Rnd: 0, Period: 79, New State: [4], Action:[0.5], Reward: [53.5], Period End: False
Rnd: 0, Period: 79, New State: [5], Action:[0.9], Reward: [32.4], Period End: True
done
Rnd: 0, Period: 80, New State: [1], Action:[1.], Reward: [0], Period End: False
Rnd: 0, Period: 80, New State: [2], Action:[0.8], Reward: [0], Period End: False
Rnd: 0, Period: 80, New State: [3], Action:[0.6], Reward: [0], Period End: False
Rnd: 0, Period: 80, New State: [4], Action:[0.3], Reward: [0], Period End: False
Rnd: 0, P

### PPO + A2C

In [11]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.evaluation import evaluate_policy

# Define your environment and parameters (replace with your actual environment setup)
rnd = 0
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
eval_steps = 100
training_step = 10000

# Create PPO model
from stable_baselines3.ppo.policies import MlpPolicy
random_model = A2C(MlpPolicy, env, verbose=0)

# Evaluate the random policy agent
mean_reward_random, std_reward_random = evaluate_policy(random_model, env, n_eval_episodes=eval_steps)
print(f"Random Policy mean_reward: {mean_reward_random:.2f} +/- {std_reward_random:.2f}")

# Create PPO model
from stable_baselines3.ppo.policies import MlpPolicy
ppo_model = PPO(MlpPolicy, env, verbose=0)

# Train the PPO agent for 10000 steps
ppo_model.learn(total_timesteps=training_step, progress_bar = True)

# Evaluate the trained PPO agent
mean_reward_ppo, std_reward_ppo = evaluate_policy(ppo_model, env, n_eval_episodes=eval_steps)
print(f"PPO mean_reward: {mean_reward_ppo:.2f} +/- {std_reward_ppo:.2f}")

# Create A2C model
from stable_baselines3.ppo.policies import MlpPolicy
a2c_model = A2C(MlpPolicy, env, verbose=0)

# Train the A2C agent for 10000 steps
a2c_model.learn(total_timesteps=training_step)

# Evaluate the trained A2C agent
mean_reward_a2c, std_reward_a2c = evaluate_policy(a2c_model, env, n_eval_episodes=eval_steps)
print(f"A2C mean_reward: {mean_reward_a2c:.2f} +/- {std_reward_a2c:.2f}")

Output()

PPO mean_reward: 59.91 +/- 2.45
A2C mean_reward: 57.35 +/- 2.68


In [61]:
env.db.step_data.head(9000)

Unnamed: 0,rnd,period,step,bids,asks,current_bid,current_bid_idx,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
0,0,0,0,"[0.76, 121.2]","[44.5, 14.6, 19.3, 34.0, 34.0, 61.2]",121.20,1,14.6,1,True,True,67.90,1,23.30,55.60
1,0,0,1,"[0.76, 68.8]","[40.6, 27.9, 17.4, 37.0, 24.9, 80.1]",68.80,1,17.4,2,True,True,43.10,1,5.40,29.20
2,0,0,2,"[0.76, 50.3]","[49.5, 32.4, 41.1, 33.8, 35.7, 67.4]",50.30,1,32.4,1,True,True,41.35,1,1.95,15.35
3,0,0,3,"[0.76, 48.9]","[36.1, 52.4, 39.8, 29.4, 36.8, 56.3]",48.90,1,29.4,3,True,True,39.15,1,-0.35,10.45
4,0,0,4,"[0.76, nan]","[39.7, 40.3, 34.7, 52.3, 35.3, 57.6]",0.76,0,34.7,2,True,False,34.70,1,41.80,1.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,0,1800,0,"[0.76, 110.4]","[47.9, 17.5, 17.3, 30.1, 36.4, 78.3]",110.40,1,17.3,2,True,True,63.85,1,27.35,49.95
8996,0,1800,1,"[35.1, 58.9]","[38.9, 13.5, 37.0, 33.6, 27.7, 62.7]",58.90,1,13.5,1,True,True,36.20,1,12.30,23.90
8997,0,1800,2,"[0.76, 64.5]","[50.0, 34.5, 43.9, 42.9, 36.7, 68.3]",64.50,1,34.5,1,True,True,49.50,1,-6.20,23.50
8998,0,1800,3,"[6.92, 56.7]","[36.9, 53.4, 48.9, 32.6, 34.8, 74.4]",56.70,1,32.6,3,True,True,44.65,1,-5.85,15.95


In [55]:
env.db.step_data.shape

(21740, 15)