In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *

In [2]:
# Market participants: one buyer and eight sellers, all using the 'Honest' strategy.
buyer_strategies = ['Honest']
seller_strategies = ['Honest'] * 8
nbuyers = len(buyer_strategies)
nsellers = len(seller_strategies)

# Game dimensions and the game-type code that gametype_to_ran turns into R1..R4.
nrounds = 1
nperiods = 10000
ntokens = 10
nsteps = 10
gametype = '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]

# Build the database and initialise round 0; period 0 is the starting period.
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0

In [5]:
# Display the seller objects held by the database (the bare last expression is rendered as cell output).
db.sellers

[<functions.Honest at 0x11e836590>,
 <functions.Honest at 0x11e835540>,
 <functions.Honest at 0x11e835510>,
 <functions.Honest at 0x11e8355a0>,
 <functions.Honest at 0x11e8354b0>,
 <functions.Honest at 0x11e835750>,
 <functions.Honest at 0x11e8344f0>,
 <functions.Honest at 0x11e835570>]

In [15]:
class TradingEnv(gym.Env):
    """Gymnasium environment in which the agent bids as buyer 0 against the database's sellers.

    Action: a single number in [0, 1] that is mapped linearly onto a bid between
    1.4x and 0.6x of the buyer's current token value (action 0 -> 1.4 * value,
    action 1 -> 0.6 * value).
    Observation: a length-1 integer array holding the index of the next step
    within the period.
    Reward: the buyer's profit when a sale happens and buyer 0 holds the winning
    bid, otherwise 0.

    Args:
        db: Database object providing ``buyers``, ``sellers``, ``reset_period``
            and ``add_step``.
        nsteps: Number of steps per period; the episode terminates once the
            last step of the period has been played.
        render_mode: Stored for Gymnasium compatibility; no rendering is implemented.
        rnd: Round index passed to ``db.reset_period`` and written to each step record.
        period: Period index written to each step record.
    """

    def __init__(self, db, nsteps, render_mode=None, rnd=0, period=0):
        self.db = db
        self.nsteps = nsteps
        self.render_mode = render_mode
        self.rnd = rnd
        self.period = period
        # Internal step counter so that step() can be called with the action only.
        self.timestep = 0
        # np.float was removed in NumPy 1.24; float32 is the conventional Box dtype.
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=nsteps, shape=(1,), dtype=int)

    def reset(self, *, seed=None, options=None):
        """Start a new period and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)
        self.db.reset_period(self.rnd)
        self.timestep = 0
        observation = np.array([0], dtype=self.observation_space.dtype)
        # Gymnasium expects info to be a dict, never None.
        return observation, {}

    def step(self, action, timestep=None, seed=None, options=None):
        """Play one market step and return ``(obs, reward, terminated, truncated, info)``.

        Args:
            action: Array-like holding a single value in [0, 1].
            timestep: Index of the current step within the period. When omitted,
                the environment's own counter is used, so standard callers can
                pass ``action`` only.
            seed: Unused; kept so existing callers that pass it keep working.
            options: Unused; kept so existing callers that pass it keep working.
        """
        if timestep is None:
            timestep = self.timestep

        # Convert the action to a bid: 0 maps to the highest bid, 1 to the lowest.
        buyer = self.db.buyers[0]
        buyer.next_token()
        min_bid = buyer.value * 0.6
        max_bid = buyer.value * 1.4
        fraction = np.asarray(action).item()
        bid = min_bid * fraction + (1 - fraction) * max_bid

        # Simulate the market for this step and record it in the database.
        bids = [bid]
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        # Use the environment's own database rather than a notebook-level global.
        sale, price, bprofit, sprofit, buy, sell = buy_sell(
            self.db, current_bid, current_bid_idx, current_ask, current_ask_idx
        )
        step_data = [
            self.rnd, self.period, timestep, bids, asks,
            current_bid, current_bid_idx, current_ask, current_ask_idx,
            buy, sell, price, sale, bprofit, sprofit,
        ]
        self.db.add_step(step_data)

        # Reward is the buyer's profit only when a sale occurred on buyer 0's bid.
        reward = 0
        if (sale == 1) and (current_bid_idx == 0):
            reward = bprofit

        # Advance to the next state.
        self.timestep = timestep + 1
        observation = np.array([timestep + 1], dtype=self.observation_space.dtype)

        # The period ends once its last step has been played.
        terminated = timestep >= self.nsteps - 1
        return observation, reward, terminated, False, {}

In [16]:
# Smoke test: drive the environment with randomly sampled actions.
env = TradingEnv(db, nsteps)
observation, info = env.reset()
timestep = 0

for _ in range(30):
    action = env.action_space.sample()
    # step() returns (observation, reward, terminated, truncated, info) in that order.
    observation, reward, terminated, truncated, info = env.step(action, timestep)
    # An episode is over when it terminates or is truncated.
    done = terminated or truncated
    print(f"Rnd: {rnd}, Period: {period}, New State: {observation.item()}, Action:{np.round(action.item(),1)}, Reward: {np.round(reward,1)}, Period End: {done}")

    if done:
        # The episode is over: reset the environment and restart the step counter.
        print('done')
        observation, info = env.reset()
        timestep = 0
    else:
        timestep += 1

# Close the environment when done
env.close()



Rnd: 0, Period: 0, New State: 1, Action:0.0, Reward: 26.3, Period End: False
Rnd: 0, Period: 0, New State: 2, Action:0.8, Reward: 55.2, Period End: False
Rnd: 0, Period: 0, New State: 3, Action:0.3, Reward: 26.8, Period End: False
Rnd: 0, Period: 0, New State: 4, Action:0.7, Reward: 41.2, Period End: False
Rnd: 0, Period: 0, New State: 5, Action:0.2, Reward: 12.5, Period End: False
Rnd: 0, Period: 0, New State: 6, Action:0.6, Reward: 18.9, Period End: False
Rnd: 0, Period: 0, New State: 7, Action:1.0, Reward: 21.7, Period End: False
Rnd: 0, Period: 0, New State: 8, Action:0.6, Reward: 11.8, Period End: False
Rnd: 0, Period: 0, New State: 9, Action:1.0, Reward: 14.3, Period End: False
Rnd: 0, Period: 0, New State: 10, Action:0.9, Reward: 0, Period End: True
done
Rnd: 0, Period: 0, New State: 1, Action:0.6, Reward: 49.2, Period End: False
Rnd: 0, Period: 0, New State: 2, Action:0.4, Reward: 39.7, Period End: False
Rnd: 0, Period: 0, New State: 3, Action:0.0, Reward: 19.2, Period End: Fal

In [55]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
import os
import time
# Save logs (to visualise in TensorBoard) and models under per-run directories.
# The timestamp is taken once so that both directories carry the same run id.
run_id = time.time()
models_dir = f"models/Mountain-{run_id}"
logdir = f"logs/Mountain-{run_id}"

# exist_ok=True creates the directory if needed and is a no-op when it already exists.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

In [None]:
# Vectorised wrapper around a single MountainCarContinuous-v0 environment.
env = make_vec_env("MountainCarContinuous-v0", n_envs=1)

# Hyperparameters for the learning agent, collected in one mapping.
ppo_settings = dict(
    seed=0,
    batch_size=256,
    ent_coef=0.00429,
    learning_rate=7.77e-05,
    n_epochs=10,
    n_steps=8,
    gae_lambda=0.9,
    gamma=0.9999,
    clip_range=0.1,
    max_grad_norm=5,
    vf_coef=0.19,
    use_sde=True,
    policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
    verbose=1,
    tensorboard_log=logdir,
)

# The learning agent.
model = PPO(policy=MlpPolicy, env=env, **ppo_settings)

