In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *
from itertools import count
buyer_strategies = ['Honest', 'Random', 'Random']
seller_strategies = ['Random', 'Random', 'Random']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 10, 10, 8, 50, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0

In [2]:
class TradingEnv(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.nperiods = nperiods
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float)
        self.observation_space = spaces.Box(-1,200,(13,),dtype=np.float32)

    def reset(self,seed=None):
        #self.db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        self.db.buyers[0].next_token()
        agent = self.db.buyers[0]
        observation = np.array([0,-1,-1,-1,-1,-1,-1,-1,agent.value,-1,-1,-1,agent.num_tokens_traded], dtype = np.float32)
        return observation, {}

    def step(self, action, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.bid(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = 0.0
        if sale == 1 and current_bid_idx == 0:
            reward = bprofit
            
        agent = self.db.buyers[0]
        observation = np.array([self.timestep + 1, current_ask, current_ask_idx, current_bid, current_bid_idx,
                                sale, price, buy, sell, agent.value, agent.step_profit,
                                agent.sale, agent.num_tokens_traded],dtype = np.float32)
        idx = np.isnan(observation)
        observation[idx] = -1.0
        # check termination
        self.timestep += 1
        if self.timestep == nsteps:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        infos = {"TimeLimit.truncated":True}
        truncated = False
        return observation, reward, terminated, truncated, infos

In [3]:
from stable_baselines3.common.env_checker import check_env
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
check_env(env)

In [5]:
rnd = 0
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
observation, info = env.reset()
for period in count():
    for timestep in count(): 
        action = env.action_space.sample()
        observation, reward, done, info, _ = env.step(action)
        print(f"Rnd: {rnd}, Period: {period}, New State: {observation}, Action:{np.round(action,1)}, Reward: {np.round(reward,1)}, Period End: {done}")
        if done:
            # If the episode is done, reset the environment
            #print('done')
            observation, info = env.reset()
            break
    if period == nperiods:
        period = 0
        break
env.close()

Rnd: 0, Period: 0, New State: [ 1.    31.2    1.    83.93   0.     1.    57.565  1.     1.    87.6
 30.035  1.     1.   ], Action:[1.], Reward: 30.0, Period End: False
Rnd: 0, Period: 0, New State: [ 2.    30.5    0.    78.5    1.     1.    54.5    1.     1.    73.1
 30.035  1.     1.   ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [ 3.    33.     1.    94.8    2.     1.    63.9    1.     1.    73.1
 30.035  1.     1.   ], Action:[0.8], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [ 4.    40.3    2.    73.5    1.     1.    56.9    1.     1.    73.1
 30.035  1.     1.   ], Action:[0.], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [ 5.    44.6    2.    75.8    2.     1.    60.2    1.     1.    73.1
 30.035  1.     1.   ], Action:[0.4], Reward: 0.0, Period End: False
Rnd: 0, Period: 0, New State: [ 6.    39.7    2.    75.3    1.     1.    57.5    1.     1.    73.1
 30.035  1.     1.   ], Action:[0.3], Reward: 0.0, Period End:

Rnd: 0, Period: 1, New State: [41.  63.4  1.  33.   2.   0.  -1.   0.   0.  53.4  3.   0.   4. ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [42.  60.9  1.  53.4  2.   0.  -1.   0.   0.  53.4  3.   0.   4. ], Action:[0.9], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [43.  63.9  1.  43.2  2.   0.  -1.   0.   0.  53.4  3.   0.   4. ], Action:[0.4], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [44.  69.3  0.  43.4  2.   0.  -1.   0.   0.  53.4  3.   0.   4. ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [45.   69.6   1.   51.58  0.    0.   -1.    0.    0.   53.4   3.    0.
  4.  ], Action:[1.], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [46.  60.5  1.  39.4  2.   0.  -1.   0.   0.  53.4  3.   0.   4. ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 1, New State: [47.   77.8   1.   38.71  0.    0.   -1.    0.    0.   53.4   3.    0.
  4.  ], Action:[0.7], Reward:

Rnd: 0, Period: 3, New State: [27.   68.4   0.   42.3   2.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [28.   68.9   0.   26.8   1.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[0.1], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [29.   74.3   2.   50.94  0.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[1.], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [30.   69.1   1.   43.3   2.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[0.8], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [31.   67.1   2.   42.3   2.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[0.5], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [32.   63.5   2.   38.4   2.    0.   -1.    0.    0.   53.4   1.55  0.
  4.  ], Action:[0.1], Reward: 0.0, Period End: False
Rnd: 0, Period: 3, New State: [33.   71.4   1.   41.4   2.    0.   -1. 

Rnd: 0, Period: 5, New State: [ 4.    34.3    2.    79.9    1.     1.    57.1    1.     1.    73.1
 43.125  1.     1.   ], Action:[0.8], Reward: 0.0, Period End: False
Rnd: 0, Period: 5, New State: [ 5.    38.6    1.    71.8    1.     1.    55.2    1.     1.    73.1
 43.125  1.     1.   ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 5, New State: [ 6.    38.4    2.    52.7    2.     1.    45.55   1.     1.    73.1
 43.125  1.     1.   ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 5, New State: [ 7.   46.8   1.   63.48  0.    1.   55.14  1.    1.   73.1  17.96  1.
  2.  ], Action:[0.9], Reward: 18.0, Period End: False
Rnd: 0, Period: 5, New State: [ 8.   45.5   0.   47.7   2.    1.   46.6   1.    1.   65.1  17.96  1.
  2.  ], Action:[0.4], Reward: 0.0, Period End: False
Rnd: 0, Period: 5, New State: [ 9.   46.4   2.   46.3   1.    1.   46.35  1.    1.   65.1  17.96  1.
  2.  ], Action:[0.7], Reward: 0.0, Period End: False
Rnd: 0, Period: 5, New State: 

Rnd: 0, Period: 6, New State: [48.  72.6  2.  51.6  0.   0.  -1.   0.   0.  53.4  0.4  0.   4. ], Action:[1.], Reward: 0.0, Period End: False
Rnd: 0, Period: 6, New State: [49.  68.2  0.  38.7  1.   0.  -1.   0.   0.  53.4  0.4  0.   4. ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 6, New State: [50.  79.1  2.  39.8  0.   0.  -1.   0.   0.  53.4  0.4  0.   4. ], Action:[0.7], Reward: 0.0, Period End: True
Rnd: 0, Period: 7, New State: [ 1.  23.9  1.  98.1  2.   1.  61.   1.   1.  87.6  0.4  0.   0. ], Action:[1.], Reward: 0.0, Period End: False
Rnd: 0, Period: 7, New State: [ 2.  35.5  1.  78.5  2.   1.  57.   1.   1.  87.6  0.4  0.   0. ], Action:[0.7], Reward: 0.0, Period End: False
Rnd: 0, Period: 7, New State: [ 3.   31.4   0.   78.1   1.    1.   54.75  1.    1.   87.6   0.4   0.
  0.  ], Action:[0.], Reward: 0.0, Period End: False
Rnd: 0, Period: 7, New State: [ 4.   43.3   0.   84.06  0.    1.   63.68  1.    1.   87.6  23.92  1.
  1.  ], Action:[1.], Reward: 23.

Rnd: 0, Period: 8, New State: [21.   65.    2.   49.    0.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.9], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [22.   79.5   2.   31.9   1.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [23.   68.8   2.   32.38  0.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [24.   63.7   2.   40.3   2.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.6], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [25.   64.5   1.   39.    2.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.1], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [26.   63.4   2.   33.4   2.    0.   -1.    0.    0.   53.4   1.44  0.
  4.  ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 8, New State: [27.   75.    0.   38.6   2.    0.   -1.

Rnd: 0, Period: 9, New State: [45.   62.4   1.   52.6   2.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[0.7], Reward: 0.0, Period End: False
Rnd: 0, Period: 9, New State: [46.   73.3   0.   30.2   1.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[0.5], Reward: 0.0, Period End: False
Rnd: 0, Period: 9, New State: [47.   62.1   1.   52.1   2.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[1.], Reward: 0.0, Period End: False
Rnd: 0, Period: 9, New State: [48.   76.5   2.   43.2   2.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[0.], Reward: 0.0, Period End: False
Rnd: 0, Period: 9, New State: [49.   63.8   1.   34.8   1.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[0.2], Reward: 0.0, Period End: False
Rnd: 0, Period: 9, New State: [50.   65.1   2.   39.1   2.    0.   -1.    0.    0.   53.4   3.73  0.
  4.  ], Action:[0.6], Reward: 0.0, Period End: True
Rnd: 0, Period: 10, New State: [ 1.   29.2   1.   70.3   1.    1.   49.75

In [6]:
db.step_data.head(1000).groupby('current_bid_idx').sum()

Unnamed: 0_level_0,rnd,period,step,current_bid,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit
current_bid_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,816,4035,8201.29,10723.9,217,35,38,2479.67,44,674.03,471.97
1,0,651,2597,5343.9,7157.6,140,44,41,2233.95,44,1117.75,696.55
2,0,1283,6843,11814.6,16869.4,301,49,48,2645.6,51,1101.1,859.2


In [7]:
# Define your environment and parameters (replace with your actual environment setup)
rnd = 0
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.5
eval_steps = 1000
training_step = 50000

In [8]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C, DQN, SAC
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.evaluation import evaluate_policy

### Continous Action Space

In [9]:
from stable_baselines3 import SAC, DDPG, TD3, A2C, PPO
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[128, 128], qf=[128, 128]))
model = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1,)
model.learn(50000, progress_bar = True)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -29      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 28       |
|    time_elapsed    | 14       |
|    total_timesteps | 400      |
| train/             |          |
|    actor_loss      | -0.249   |
|    critic_loss     | 6.65     |
|    ent_coef        | 0.915    |
|    ent_coef_loss   | -0.147   |
|    learning_rate   | 0.0003   |
|    n_updates       | 299      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -24.1    |
| time/              |          |
|    episodes        | 12       |
|    fps             | 23       |
|    time_elapsed    | 25       |
|    total_timesteps | 600      |
| train/             |          |
|    actor_loss      | 0.643    |
|    critic_loss     | 8.26     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 5.06     |
| time/              |          |
|    episodes        | 68       |
|    fps             | 32       |
|    time_elapsed    | 104      |
|    total_timesteps | 3400     |
| train/             |          |
|    actor_loss      | 4.4      |
|    critic_loss     | 1.69     |
|    ent_coef        | 0.432    |
|    ent_coef_loss   | -0.655   |
|    learning_rate   | 0.0003   |
|    n_updates       | 3299     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 5.56     |
| time/              |          |
|    episodes        | 72       |
|    fps             | 33       |
|    time_elapsed    | 108      |
|    total_timesteps | 3600     |
| train/             |          |
|    actor_loss      | 4.27     |
|    critic_loss     | 1.91     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 17.3     |
| time/              |          |
|    episodes        | 128      |
|    fps             | 34       |
|    time_elapsed    | 184      |
|    total_timesteps | 6400     |
| train/             |          |
|    actor_loss      | 5.19     |
|    critic_loss     | 2.98     |
|    ent_coef        | 0.198    |
|    ent_coef_loss   | -0.799   |
|    learning_rate   | 0.0003   |
|    n_updates       | 6299     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 18.1     |
| time/              |          |
|    episodes        | 132      |
|    fps             | 34       |
|    time_elapsed    | 191      |
|    total_timesteps | 6600     |
| train/             |          |
|    actor_loss      | 4.78     |
|    critic_loss     | 2.14     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 25.8     |
| time/              |          |
|    episodes        | 188      |
|    fps             | 33       |
|    time_elapsed    | 277      |
|    total_timesteps | 9400     |
| train/             |          |
|    actor_loss      | 3.19     |
|    critic_loss     | 1.96     |
|    ent_coef        | 0.083    |
|    ent_coef_loss   | -1.1     |
|    learning_rate   | 0.0003   |
|    n_updates       | 9299     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 27       |
| time/              |          |
|    episodes        | 192      |
|    fps             | 34       |
|    time_elapsed    | 282      |
|    total_timesteps | 9600     |
| train/             |          |
|    actor_loss      | 3.82     |
|    critic_loss     | 1.64     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 33.2     |
| time/              |          |
|    episodes        | 248      |
|    fps             | 36       |
|    time_elapsed    | 341      |
|    total_timesteps | 12400    |
| train/             |          |
|    actor_loss      | 0.975    |
|    critic_loss     | 1.28     |
|    ent_coef        | 0.0384   |
|    ent_coef_loss   | -1.69    |
|    learning_rate   | 0.0003   |
|    n_updates       | 12299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 33.6     |
| time/              |          |
|    episodes        | 252      |
|    fps             | 36       |
|    time_elapsed    | 345      |
|    total_timesteps | 12600    |
| train/             |          |
|    actor_loss      | 0.775    |
|    critic_loss     | 1.47     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 35.6     |
| time/              |          |
|    episodes        | 308      |
|    fps             | 37       |
|    time_elapsed    | 408      |
|    total_timesteps | 15400    |
| train/             |          |
|    actor_loss      | -0.357   |
|    critic_loss     | 0.997    |
|    ent_coef        | 0.0195   |
|    ent_coef_loss   | -1.06    |
|    learning_rate   | 0.0003   |
|    n_updates       | 15299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 35.7     |
| time/              |          |
|    episodes        | 312      |
|    fps             | 37       |
|    time_elapsed    | 412      |
|    total_timesteps | 15600    |
| train/             |          |
|    actor_loss      | -0.778   |
|    critic_loss     | 1.46     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 38.1     |
| time/              |          |
|    episodes        | 368      |
|    fps             | 38       |
|    time_elapsed    | 472      |
|    total_timesteps | 18400    |
| train/             |          |
|    actor_loss      | -0.224   |
|    critic_loss     | 0.35     |
|    ent_coef        | 0.0131   |
|    ent_coef_loss   | -0.684   |
|    learning_rate   | 0.0003   |
|    n_updates       | 18299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 38.3     |
| time/              |          |
|    episodes        | 372      |
|    fps             | 39       |
|    time_elapsed    | 476      |
|    total_timesteps | 18600    |
| train/             |          |
|    actor_loss      | -0.934   |
|    critic_loss     | 1.42     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.7     |
| time/              |          |
|    episodes        | 428      |
|    fps             | 39       |
|    time_elapsed    | 536      |
|    total_timesteps | 21400    |
| train/             |          |
|    actor_loss      | -1.21    |
|    critic_loss     | 1.3      |
|    ent_coef        | 0.0121   |
|    ent_coef_loss   | -0.379   |
|    learning_rate   | 0.0003   |
|    n_updates       | 21299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.6     |
| time/              |          |
|    episodes        | 432      |
|    fps             | 39       |
|    time_elapsed    | 540      |
|    total_timesteps | 21600    |
| train/             |          |
|    actor_loss      | -1.49    |
|    critic_loss     | 1.25     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.3     |
| time/              |          |
|    episodes        | 488      |
|    fps             | 40       |
|    time_elapsed    | 602      |
|    total_timesteps | 24400    |
| train/             |          |
|    actor_loss      | -1.71    |
|    critic_loss     | 0.769    |
|    ent_coef        | 0.0123   |
|    ent_coef_loss   | -0.0184  |
|    learning_rate   | 0.0003   |
|    n_updates       | 24299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.4     |
| time/              |          |
|    episodes        | 492      |
|    fps             | 40       |
|    time_elapsed    | 606      |
|    total_timesteps | 24600    |
| train/             |          |
|    actor_loss      | -1.15    |
|    critic_loss     | 0.741    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.4     |
| time/              |          |
|    episodes        | 548      |
|    fps             | 40       |
|    time_elapsed    | 671      |
|    total_timesteps | 27400    |
| train/             |          |
|    actor_loss      | -2.8     |
|    critic_loss     | 1.3      |
|    ent_coef        | 0.014    |
|    ent_coef_loss   | -0.0385  |
|    learning_rate   | 0.0003   |
|    n_updates       | 27299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.7     |
| time/              |          |
|    episodes        | 552      |
|    fps             | 40       |
|    time_elapsed    | 676      |
|    total_timesteps | 27600    |
| train/             |          |
|    actor_loss      | -1.6     |
|    critic_loss     | 0.478    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 40.4     |
| time/              |          |
|    episodes        | 608      |
|    fps             | 40       |
|    time_elapsed    | 746      |
|    total_timesteps | 30400    |
| train/             |          |
|    actor_loss      | -2.8     |
|    critic_loss     | 0.757    |
|    ent_coef        | 0.0144   |
|    ent_coef_loss   | 0.252    |
|    learning_rate   | 0.0003   |
|    n_updates       | 30299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 40.5     |
| time/              |          |
|    episodes        | 612      |
|    fps             | 40       |
|    time_elapsed    | 750      |
|    total_timesteps | 30600    |
| train/             |          |
|    actor_loss      | -2.68    |
|    critic_loss     | 1.23     |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41.1     |
| time/              |          |
|    episodes        | 668      |
|    fps             | 40       |
|    time_elapsed    | 819      |
|    total_timesteps | 33400    |
| train/             |          |
|    actor_loss      | -2.33    |
|    critic_loss     | 0.909    |
|    ent_coef        | 0.0151   |
|    ent_coef_loss   | 0.25     |
|    learning_rate   | 0.0003   |
|    n_updates       | 33299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41.3     |
| time/              |          |
|    episodes        | 672      |
|    fps             | 40       |
|    time_elapsed    | 824      |
|    total_timesteps | 33600    |
| train/             |          |
|    actor_loss      | -1.85    |
|    critic_loss     | 0.801    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41.1     |
| time/              |          |
|    episodes        | 728      |
|    fps             | 40       |
|    time_elapsed    | 897      |
|    total_timesteps | 36400    |
| train/             |          |
|    actor_loss      | -2.11    |
|    critic_loss     | 0.583    |
|    ent_coef        | 0.015    |
|    ent_coef_loss   | -0.34    |
|    learning_rate   | 0.0003   |
|    n_updates       | 36299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41.2     |
| time/              |          |
|    episodes        | 732      |
|    fps             | 40       |
|    time_elapsed    | 902      |
|    total_timesteps | 36600    |
| train/             |          |
|    actor_loss      | -1.53    |
|    critic_loss     | 0.936    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 39.9     |
| time/              |          |
|    episodes        | 788      |
|    fps             | 40       |
|    time_elapsed    | 977      |
|    total_timesteps | 39400    |
| train/             |          |
|    actor_loss      | -1.91    |
|    critic_loss     | 0.739    |
|    ent_coef        | 0.0152   |
|    ent_coef_loss   | 0.261    |
|    learning_rate   | 0.0003   |
|    n_updates       | 39299    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 40.1     |
| time/              |          |
|    episodes        | 792      |
|    fps             | 40       |
|    time_elapsed    | 983      |
|    total_timesteps | 39600    |
| train/             |          |
|    actor_loss      | -1.73    |
|    critic_loss     | 0.772    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 40.6     |
| time/              |          |
|    episodes        | 856      |
|    fps             | 39       |
|    time_elapsed    | 1075     |
|    total_timesteps | 42800    |
| train/             |          |
|    actor_loss      | -2.4     |
|    critic_loss     | 0.538    |
|    ent_coef        | 0.0148   |
|    ent_coef_loss   | -0.691   |
|    learning_rate   | 0.0003   |
|    n_updates       | 42699    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 40.5     |
| time/              |          |
|    episodes        | 860      |
|    fps             | 39       |
|    time_elapsed    | 1080     |
|    total_timesteps | 43000    |
| train/             |          |
|    actor_loss      | -1.78    |
|    critic_loss     | 0.759    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41       |
| time/              |          |
|    episodes        | 920      |
|    fps             | 39       |
|    time_elapsed    | 1163     |
|    total_timesteps | 46000    |
| train/             |          |
|    actor_loss      | -2.21    |
|    critic_loss     | 0.803    |
|    ent_coef        | 0.0136   |
|    ent_coef_loss   | 0.313    |
|    learning_rate   | 0.0003   |
|    n_updates       | 45899    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 41       |
| time/              |          |
|    episodes        | 924      |
|    fps             | 39       |
|    time_elapsed    | 1169     |
|    total_timesteps | 46200    |
| train/             |          |
|    actor_loss      | -2.59    |
|    critic_loss     | 0.887    |
|    ent_coef 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 42.1     |
| time/              |          |
|    episodes        | 980      |
|    fps             | 39       |
|    time_elapsed    | 1248     |
|    total_timesteps | 49000    |
| train/             |          |
|    actor_loss      | -1.75    |
|    critic_loss     | 0.53     |
|    ent_coef        | 0.012    |
|    ent_coef_loss   | -0.134   |
|    learning_rate   | 0.0003   |
|    n_updates       | 48899    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 42.2     |
| time/              |          |
|    episodes        | 988      |
|    fps             | 39       |
|    time_elapsed    | 1260     |
|    total_timesteps | 49400    |
| train/             |          |
|    actor_loss      | -2.24    |
|    critic_loss     | 0.71     |
|    ent_coef 

<stable_baselines3.sac.sac.SAC at 0x108a137c0>

In [None]:
model.learn(50000, progress_bar = False)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 123      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 61       |
|    time_elapsed    | 1        |
|    total_timesteps | 120      |
| train/             |          |
|    actor_loss      | -61.4    |
|    critic_loss     | 2.13     |
|    ent_coef        | 0.225    |
|    ent_coef_loss   | 0.0238   |
|    learning_rate   | 0.0003   |
|    n_updates       | 66929    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 177      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 44       |
|    time_elapsed    | 5        |
|    total_timesteps | 240      |
| train/             |          |
|    actor_loss      | -52.6    |
|    critic_loss     | 1.66     |
|    ent_coef 

In [None]:
db.step_data.head(60).groupby('current_bid_idx').sum()

In [None]:
db.step_data.tail(60).groupby('current_bid_idx').sum()

In [None]:
db.round_data.redemption_values.item()

In [None]:
db.step_data.tail(60)

In [None]:
graph_period(env.db, 0, 2236)

### Discrete Action Spaces

In [None]:
from stable_baselines3 import DQN
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
env.action_space = spaces.Discrete(51)
#policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = DQN("MlpPolicy", env, verbose=1,)
model.learn(50000, progress_bar = False)

In [None]:
db.step_data.head(100).groupby('current_bid_idx').sum()

In [None]:
db.step_data.tail(100).groupby('current_bid_idx').sum()

## ON POLICY

### DDPG - Deterministic Deep Policy Gradient

In [None]:
from stable_baselines3 import DDPG
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = DDPG("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1,)
model.learn(50000, progress_bar = True)

In [None]:
db.step_data.head(100).groupby('current_bid_idx').sum()

In [None]:
db.step_data.tail(100).groupby('current_bid_idx').sum()

### PPO - Proximal Policy Optimization

In [None]:
from stable_baselines3 import PPO
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1)
model.learn(50000, progress_bar = False)

### A2C - Advantage Actor-Critic

In [None]:
# Create A2C model
from stable_baselines3.ppo.policies import MlpPolicy
a2c_model = A2C(MlpPolicy, env, verbose=0)

# Train the A2C agent for 10000 steps
a2c_model.learn(total_timesteps=training_step, progress_bar = True)

# Evaluate the trained A2C agent
mean_reward_a2c, std_reward_a2c = evaluate_policy(a2c_model, env, n_eval_episodes=eval_steps)
print(f"A2C mean_reward: {mean_reward_a2c:.2f} +/- {std_reward_a2c:.2f}")