In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *
from itertools import count
buyer_strategies = ['Honest', 'Random', 'Random']
seller_strategies = ['Random', 'Random', 'Random']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 10, 10, 3, 12, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0

In [2]:
class TradingEnv(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float32)
        self.observation_space = spaces.Box(-1,200,(13,),dtype=np.float32)

    def reset(self,seed=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        #self.db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        self.db.buyers[0].next_token()
        agent = self.db.buyers[0]
        observation = np.array([0,-1,-1,-1,-1,-1,-1,-1,agent.value,-1,-1,-1,agent.num_tokens_traded], dtype = np.float32)
        return observation, {}

    def step(self, action, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.bid(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = 0.0
        if sale == 1 and current_bid_idx == 0:
            reward = bprofit
            
        agent = self.db.buyers[0]
        observation = np.array([self.timestep + 1, current_ask, current_ask_idx, current_bid, current_bid_idx,
                                sale, price, buy, sell, agent.value, agent.step_profit,
                                agent.sale, agent.num_tokens_traded],dtype = np.float32)
        idx = np.isnan(observation)
        observation[idx] = -1.0
        # check termination
        self.timestep += 1
        if self.timestep == nsteps:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        infos = {"TimeLimit.truncated":True}
        truncated = False
        return observation, reward, terminated, truncated, infos

In [3]:
from stable_baselines3.common.env_checker import check_env
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
check_env(env)

2023-09-28 15:51:21.415023: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-09-28 15:51:21.415069: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-09-28 15:51:21.415099: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-28 15:51:21.425977: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
rnd = 0
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
observation, info = env.reset()
for period in count():
    for timestep in count(): 
        action = env.action_space.sample()
        observation, reward, done, info, _ = env.step(action)
        print(f"Rnd: {rnd}, Period: {period}, New State: {observation}, Action:{np.round(action,1)}, Reward: {np.round(reward,1)}, Period End: {done}")
        if done:
            # If the episode is done, reset the environment
            #print('done')
            observation, info = env.reset()
            break
    if period == nperiods:
        period = 0
        break
env.close()

Rnd: 0, Period: 0, New State: [  1.   18.1   2.   93.7   1.    1.   55.9   1.    1.  103.9   0.    1.
   0. ], Action:[0.], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  2.   49.8   1.   71.8   1.    1.   60.8   1.    1.  103.9   0.    1.
   0. ], Action:[0.7], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  3.    54.3    0.    67.2    2.     1.    60.75   1.     1.   103.9
   0.     1.     0.  ], Action:[0.5], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  4.   54.2   1.   34.    2.    0.   -1.    0.    0.  103.9   0.    1.
   0. ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  5.   60.1   0.   43.3   2.    0.   -1.    0.    0.  103.9   0.    1.
   0. ], Action:[0.4], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  6.   59.8   0.   39.8   2.    0.   -1.    0.    0.  103.9   0.    1.
   0. ], Action:[0.3], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [  7.   54.8   0.   46

In [5]:
db.step_data.head(1000).groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,bids,asks,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,117,93,"[90.76, 20.3, 48.4, 36.58, 22.7, 34.0, 92.62, ...","[59.5, 66.0, 81.2, 93.2, 71.0, 81.2, 50.4, 47....",1638.09,1396.2,16,20,19,1417.32,22,513.18,401.42
1,0,110,24,"[5.67, 93.7, 40.7, 71.64, 71.8, 64.0, 48.6, 69...","[38.9, 47.6, 18.1, 53.2, 49.8, 92.7, 54.0, 43....",1638.4,848.7,23,22,22,1243.55,22,745.25,563.85
2,0,433,609,"[52.68, 22.9, 67.2, 22.43, 13.8, 34.0, 37.71, ...","[54.3, 67.7, 117.0, 56.5, 54.2, 104.2, 60.1, 7...",3333.0,6465.7,81,11,6,599.15,11,140.05,135.45


In [6]:
# Define your environment and parameters (replace with your actual environment setup)
rnd = 0
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.5
eval_steps = 1000
training_step = 50000

In [7]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C, DQN, SAC
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.evaluation import evaluate_policy

In [14]:
env.db.round_data.redemption_values

0    [[93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24...
Name: redemption_values, dtype: object

### Continous Action Space

In [11]:
from stable_baselines3 import SAC, DDPG, TD3, A2C, PPO
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
db.round_data.token_costs = [np.array([
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2],
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2],
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2]])]
db.round_data.redemption_values = [np.array([
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6],
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6],
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6]])]
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[128, 128], qf=[128, 128]))
model = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1,)
model.learn(50000, progress_bar = False)

KeyboardInterrupt: 

In [15]:
model.save("trained_sac_model")

In [None]:
model.learn(50000, progress_bar = False)

In [None]:
db.step_data.head(60).groupby('current_bid_idx').sum()

In [None]:
db.step_data.tail(60).groupby('current_bid_idx').sum()

In [None]:
db.round_data.redemption_values.item()

In [None]:
db.step_data.tail(60)

In [None]:
graph_period(env.db, 0, 2236)

### Play Game

In [37]:
### from stable_baselines3 import SAC
loaded_model = SAC.load("trained_sac_model")

# Assuming you have a trained agent loaded as 'loaded_model'
db2 = Database(game_metadata, buyer_strategies, seller_strategies)
db2.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
db2.round_data.token_costs = [np.array([
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2],
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2],
    [48.7, 62.4, 64.4, 69.7, 73.9, 88.0, 89.9, 100.2]])]
db2.round_data.redemption_values = [np.array([
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6],
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6],
    [93.4, 75.3, 72.4, 64.0, 60.9, 59.6, 48.7, 24.6]])]
env2 = TradingEnv2(db2, nsteps)

obs, _ = env2.reset()  # Reset the environment to start a new game
done = False
user_id = 0
if user_id == 0:
    print('You are seller')
else:
    print('You are buyer')
while not done:
    user_input = input("Your offer: ")
    user_input = float(user_input)
    action, _ = loaded_model.predict(obs, deterministic=True)  # Get the agent's action
    obs, reward, done, _, _ = env2.step(action, user_input, user_id)  # Take the action and observe the next state

    if user_id == 0:
        print(
    if db2.sellers[0].sale    



Your bid:  50


[0.51949584]


Your bid:  50


[0.6275569]


Your bid:  50


[0.61427706]


Your bid:  50


[0.62133455]


Your bid:  50


[0.5805974]


Your bid:  50


[0.43714607]


Your bid:  50


[0.46290636]


Your bid:  50


[0.5477744]


Your bid:  50


[0.47734815]


Your bid:  50


[0.424411]


Your bid:  50


[0.42635298]


Your bid:  50


[0.40141302]


In [52]:
env2 = TradingEnv2(db2, nsteps)
obs, _ = env2.reset()  # Reset the environment to start a new game


In [53]:
db2.sellers[0].value

nan

In [38]:
db2.step_data.groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,bids,asks,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0,5,"[73.23, 50.0, 68.0, 69.67, 50.0, 45.2, 67.75, ...","[71.5, 56.5, 50.2, 65.4, 87.1, 88.3, 77.2, 91....",210.65,192.8,2,2,3,197.0,3,44.1,37.2
1,0,0,51,"[nan, 50.0, nan, nan, 50.0, nan, nan, 50.0, na...","[79.1, 80.2, nan, nan, 81.6, nan, nan, 76.5, n...",300.0,475.5,5,2,0,143.7,2,25.0,16.9
2,0,0,10,"[71.16, 50.0, 72.5, nan, 50.0, 70.3, nan, 50.0...","[67.1, 56.8, 82.5, 92.8, 84.5, 74.7, 90.5, 86....",209.3,208.3,5,2,3,203.65,3,37.45,28.15


In [39]:
db2.step_data.groupby('current_ask_idx').sum()

Unnamed: 0_level_0,rnd,period,step,bids,asks,current_bid,current_bid_idx,current_ask,buy,sell,price,sale,bprofit,sprofit
current_ask_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0,11,"[69.67, 50.0, 45.2, 67.75, 50.0, 57.1, nan, 50...","[65.4, 87.1, 88.3, 77.2, 91.8, 87.2, 79.1, 80....",187.42,1,221.7,2,2,214.385,3,26.715,38.885
1,0,0,46,"[71.16, 50.0, 72.5, nan, 50.0, nan, nan, 50.0,...","[67.1, 56.8, 82.5, nan, 81.6, nan, nan, 76.5, ...",322.5,7,453.2,2,1,129.25,2,39.45,18.15
2,0,0,9,"[73.23, 50.0, 68.0, nan, 50.0, 70.3, nan, 50.0...","[71.5, 56.5, 50.2, 92.8, 84.5, 74.7, 90.5, 86....",210.03,4,201.7,2,3,200.715,3,40.385,25.215


In [23]:
env2.db.step_data

Unnamed: 0,rnd,period,step,bids,asks,current_bid,current_bid_idx,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit


In [24]:
class TradingEnv2(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float32)
        self.observation_space = spaces.Box(-1,200,(13,),dtype=np.float32)

    def reset(self,seed=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        #self.db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        self.db.buyers[0].next_token()
        agent = self.db.buyers[0]
        observation = np.array([0,-1,-1,-1,-1,-1,-1,-1,agent.value,-1,-1,-1,agent.num_tokens_traded], dtype = np.float32)
        return observation, {}

    def step(self, action, user_action, user_id, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.bid(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]

        if user_id == 0:
            asks[0] = user_action
        elif user_id == 1:
            bids[1] = user_action
        else:
            pass
            
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = 0.0
        if sale == 1 and current_bid_idx == 0:
            reward = bprofit
            
        agent = self.db.buyers[0]
        observation = np.array([self.timestep + 1, current_ask, current_ask_idx, current_bid, current_bid_idx,
                                sale, price, buy, sell, agent.value, agent.step_profit,
                                agent.sale, agent.num_tokens_traded],dtype = np.float32)
        idx = np.isnan(observation)
        observation[idx] = -1.0
        # check termination
        self.timestep += 1
        if self.timestep == nsteps:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        infos = {"TimeLimit.truncated":True}
        truncated = False
        return observation, reward, terminated, truncated, infos