In [47]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *

In [48]:
buyer_strategies = ['Honest']
seller_strategies = ['Honest','Honest','Honest','Honest','Honest','Honest','Honest','Honest']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 1, 10000, 10, 10, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0

In [49]:
class TradingEnv(gym.Env):
    def __init__(self, db):
        # Define your environment parameters here
        self.db = db
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Discrete(10)
        self.state = 0 

    def reset(self):
        # Reset the environment to its initial state and return the initial observation
        self.db.reset_period(rnd)
        self.state = 0
        observation = self.state  # Replace this with your actual observation
        reset_info = None  # Replace this with any reset information you want to return
        return observation, reset_info

    def step(self, action):
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value*0.1
        max_bid = self.db.buyers[0].value*1.9
        bid = min_bid * action.item() + (1-action.item())*max_bid
        
        # simulate market
        bids = [bid]
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks) 
        sale, price, bprofit, sprofit, buy, sell = buy_sell(db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [rnd,period,self.state,bids,asks,current_bid,current_bid_idx,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit]
        self.db.add_step(step_data)
        
        # compute reward, new state
        reward = 0
        if (sale == 1) and (current_bid_idx == 0):
            reward = bprofit
        new_state = self.state + 1
        
        # check termination 
        if self.state == nsteps-1:
            done = True
        else:
            done = False
        info = None
        self.state = new_state  # Update the current state
        reset_info = None
        return new_state, reward, done, info, reset_info 

In [54]:
# Create an instance of your custom environment
env = TradingEnv(db)

# Reset the environment to its initial state
observation = env.reset()

# Test some steps in the environment
for _ in range(30):  # You can adjust the number of steps
    # Replace this with your action selection logic
    action = env.action_space.sample()  # Random action for testing

    # Take a step in the environment
    new_observation, reward, done, info, reset_info = env.step(action)

    # Print the current observation, reward, and whether the episode is done
    print(f"Rnd: {rnd}, Period: {period}, State: {env.state}, Action:{np.round(action.item(),1)}, Reward: {np.round(reward,1)}, New State: {new_observation}, Period End: {done}")

    if done:
        # If the episode is done, reset the environment
        print('done')
        observation = env.reset()
        pass

# Close the environment when done
env.close()



Rnd: 0, Period: 0, State: 1, Action:0.1, Reward: 4.8, New State: 1, Period End: False
Rnd: 0, Period: 0, State: 2, Action:0.7, Reward: 50.0, New State: 2, Period End: False
Rnd: 0, Period: 0, State: 3, Action:0.6, Reward: 33.7, New State: 3, Period End: False
Rnd: 0, Period: 0, State: 4, Action:0.8, Reward: 40.8, New State: 4, Period End: False
Rnd: 0, Period: 0, State: 5, Action:0.8, Reward: 19.7, New State: 5, Period End: False
Rnd: 0, Period: 0, State: 6, Action:0.4, Reward: 5.2, New State: 6, Period End: False
Rnd: 0, Period: 0, State: 7, Action:0.7, Reward: 13.2, New State: 7, Period End: False
Rnd: 0, Period: 0, State: 8, Action:0.6, Reward: 7.0, New State: 8, Period End: False
Rnd: 0, Period: 0, State: 9, Action:0.1, Reward: -11.2, New State: 9, Period End: False
Rnd: 0, Period: 0, State: 10, Action:0.3, Reward: 0, New State: 10, Period End: True
done
Rnd: 0, Period: 0, State: 1, Action:0.8, Reward: 75.9, New State: 1, Period End: False
Rnd: 0, Period: 0, State: 2, Action:0.8, R

In [55]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
import os
import time
# Saving logs to visulise in Tensorboard, saving models
models_dir = f"models/Mountain-{time.time()}"
logdir = f"logs/Mountain-{time.time()}"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)

In [None]:
# Parallel environments
env = make_vec_env("MountainCarContinuous-v0", n_envs=1)

# The learning agent and hyperparameters
model = PPO(
    policy=MlpPolicy,
    env=env,
    seed=0,
    batch_size=256,
    ent_coef=0.00429,
    learning_rate=7.77e-05,
    n_epochs=10,
    n_steps=8,
    gae_lambda=0.9,
    gamma=0.9999,
    clip_range=0.1,
    max_grad_norm =5,
    vf_coef=0.19,
    use_sde=True,
    policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
    verbose=1,
    tensorboard_log=logdir
    )

