In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from functions import *

In [19]:
class TradingEnv(gym.Env):
    """Single-buyer trading environment driven by a market database object.

    The agent controls the bid of ``db.buyers[0]``; every seller in
    ``db.sellers`` produces an ask through ``seller.ask(db)``.  One call to
    :meth:`step` simulates one bid/ask step, and an episode lasts one period
    of ``nsteps`` steps.

    Parameters
    ----------
    db
        Market database.  The environment calls ``reset_period``,
        ``add_step`` and reads ``buyers`` / ``sellers`` on it.
    nsteps : int, optional
        Number of steps per period.  When omitted, the notebook-level global
        ``nsteps`` is used (the behaviour of the original implementation).

    Notes
    -----
    NOTE(review): observations are returned as the raw integer step counter,
    not as a ``float32`` array of shape ``(1,)`` as ``observation_space``
    declares, and :meth:`step` returns
    ``(obs, reward, done, info, reset_info)`` rather than Gymnasium's
    ``(obs, reward, terminated, truncated, info)``.  Both are kept unchanged
    so existing callers keep working, but they need to be aligned before this
    env is handed to a Gymnasium-based trainer.
    """

    def __init__(self, db, nsteps=None):
        self.db = db
        self.rnd = 0      # current round index, passed to db.reset_period
        self.period = 0   # incremented on every reset()
        self.nsteps = nsteps
        # Action in [0, 1] interpolates between the maximum and minimum bid.
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=10, shape=(1,), dtype=np.float32)
        self.state = 0    # index of the current step within the period

    def _episode_length(self):
        """Return the number of steps per period (explicit value or the global)."""
        if self.nsteps is not None:
            return self.nsteps
        # Backward-compatible fallback: the notebook defines `nsteps` globally.
        return nsteps

    def reset(self, *, seed=None, options=None):
        """Start a new period and return ``(observation, reset_info)``.

        ``seed`` and ``options`` are accepted for Gymnasium API compatibility;
        ``seed`` is forwarded to ``gym.Env.reset`` and ``options`` is unused.
        """
        super().reset(seed=seed)
        self.period += 1
        self.db.reset_period(self.rnd)
        self.state = 0
        observation = self.state
        reset_info = None
        return observation, reset_info

    def step(self, action):
        """Simulate one market step for the given action.

        Parameters
        ----------
        action
            Object exposing ``.item()`` that yields a scalar in ``[0, 1]``
            (e.g. a sample from ``action_space``).  ``1`` maps to the minimum
            bid (10% of the buyer's value) and ``0`` to the maximum (190%).

        Returns
        -------
        tuple
            ``(new_state, reward, done, info, reset_info)`` where ``reward``
            is the buyer's profit if a sale happened and buyer 0 held the
            current bid, else ``0``; ``done`` is ``True`` on the last step of
            the period; ``info`` and ``reset_info`` are ``None``.
        """
        # Convert the action into a bid for the buyer's next token.
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value*0.1
        max_bid = self.db.buyers[0].value*1.9
        weight = action.item()
        bid = min_bid * weight + (1-weight)*max_bid

        # Simulate the market.
        bids = [bid]
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        # Fixed: use this env's own database rather than the notebook global `db`.
        sale, price, bprofit, sprofit, buy, sell = buy_sell(
            self.db, current_bid, current_bid_idx, current_ask, current_ask_idx
        )
        step_data = [self.rnd,self.period,self.state,bids,asks,current_bid,current_bid_idx,current_ask,current_ask_idx,buy,sell,price,sale,bprofit,sprofit]
        self.db.add_step(step_data)

        # Reward: the buyer's profit, only when the agent's bid traded.
        reward = 0
        if (sale == 1) and (current_bid_idx == 0):
            reward = bprofit
        new_state = self.state + 1

        # Termination: the period ends after the last step.
        done = False
        if self.state == self._episode_length()-1:
            new_state = 0
            # Fixed: the original wrote `done == True`, a no-op comparison,
            # so the episode never terminated.
            done = True
            self.db.reset_period(self.rnd)
        info = None

        self.state = new_state
        reset_info = None
        return new_state, reward, done, info,reset_info 

In [23]:
# --- Market configuration ---------------------------------------------------
# One learning buyer against eight sellers.
buyer_strategies = ['Honest']
seller_strategies = ['Honest'] * 8
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype = 1, 10000, 10, 10, '1234'
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]

# --- Database and round setup -----------------------------------------------
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0

# --- Environment smoke test -------------------------------------------------
env = TradingEnv(db)

# reset() returns an (observation, reset_info) pair; unpack it so that
# `observation` holds the observation itself rather than the whole tuple.
observation, reset_info = env.reset()

# Drive the environment with random actions for a few steps.
for _ in range(20):
    action = env.action_space.sample()

    new_observation, reward, done, info, reset_info = env.step(action)

    # Note: env.state is read after the step, so it equals new_observation.
    print(f"Rnd: {env.rnd}, Period: {env.period}, State: {env.state}, 'Action':{action}, Reward: {reward}, New State: {new_observation}, Done: {done}")

    if done:
        # Start the next period once the current one has finished.
        observation, reset_info = env.reset()

env.close()


Rnd: 0, Period: 1, State: 1, 'Action':[0.24356583], Reward: 13.913632530570027, New State: 1, Done: False
Rnd: 0, Period: 1, State: 2, 'Action':[0.8241091], Reward: 44.1563435268402, New State: 2, Done: False
Rnd: 0, Period: 1, State: 3, 'Action':[0.59829557], Reward: 22.13640349626541, New State: 3, Done: False
Rnd: 0, Period: 1, State: 4, 'Action':[0.05320682], Reward: -5.808864063769583, New State: 4, Done: False
Rnd: 0, Period: 1, State: 5, 'Action':[0.66940695], Reward: 21.158066145777703, New State: 5, Done: False
Rnd: 0, Period: 1, State: 6, 'Action':[0.69720453], Reward: 20.354732521772387, New State: 6, Done: False
Rnd: 0, Period: 1, State: 7, 'Action':[0.81869787], Reward: 23.500000000000004, New State: 7, Done: False
Rnd: 0, Period: 1, State: 8, 'Action':[0.8030623], Reward: 19.5, New State: 8, Done: False
Rnd: 0, Period: 1, State: 9, 'Action':[0.8464067], Reward: 13.200000000000003, New State: 9, Done: False
Rnd: 0, Period: 1, State: 0, 'Action':[0.21017773], Reward: -10.74

In [49]:
import gymnasium as gym
from stable_baselines3 import A2C
# Train an A2C agent on CartPole, then roll the trained policy out.
env = gym.make("CartPole-v1")
# learn() returns the model itself, so construction and training can be chained.
model = A2C("MlpPolicy", env, verbose=1).learn(total_timesteps=10_000)

# Evaluate through the vectorized env the model wrapped around `env`.
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    # Greedy (deterministic) action from the trained policy.
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    print(_state, action, obs, reward)
    # No explicit reset is issued on `done`: the VecEnv resets automatically.
    # Rendering (vec_env.render("human")) is intentionally left disabled.

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 46.8     |
|    ep_rew_mean        | 46.8     |
| time/                 |          |
|    fps                | 593      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.677   |
|    explained_variance | 0.375    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.59     |
|    value_loss         | 7.54     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 49.7     |
|    ep_rew_mean        | 49.7     |
| time/                 |          |
|    fps                | 568      |
|    iterations         | 200      |
|    time_elapsed 

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 72.1      |
|    ep_rew_mean        | 72.1      |
| time/                 |           |
|    fps                | 596       |
|    iterations         | 1400      |
|    time_elapsed       | 11        |
|    total_timesteps    | 7000      |
| train/                |           |
|    entropy_loss       | -0.157    |
|    explained_variance | -1.87e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1399      |
|    policy_loss        | -68.9     |
|    value_loss         | 4.22e+03  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 74.1      |
|    ep_rew_mean        | 74.1      |
| time/                 |           |
|    fps                | 593       |
|    iterations         | 1500      |
|    time_elapsed       | 12        |
|    total_timesteps    | 7500      |
| train/    

None [1] [[ 0.39567783  0.41507563  0.09400804 -0.25227627]] [1.]
None [0] [[0.40397936 0.21874581 0.08896252 0.0685158 ]] [1.]
None [1] [[ 0.40835428  0.41248715  0.09033283 -0.19482645]] [1.]
None [0] [[0.416604   0.21619694 0.0864363  0.12493011]] [1.]
None [1] [[ 0.42092794  0.40998107  0.08893491 -0.13928027]] [1.]
None [1] [[ 0.42912757  0.6037241   0.0861493  -0.4026339 ]] [1.]
None [0] [[ 0.44120204  0.4074927   0.07809662 -0.08408178]] [1.]
None [1] [[ 0.4493519   0.6014133   0.07641499 -0.35113907]] [1.]
None [0] [[ 0.46138018  0.40529257  0.0693922  -0.03537224]] [1.]
None [1] [[ 0.46948603  0.5993543   0.06868476 -0.3053794 ]] [1.]
None [0] [[0.48147312 0.40332425 0.06257717 0.00814986]] [1.]
None [1] [[ 0.4895396   0.59749556  0.06274017 -0.2641513 ]] [1.]
None [0] [[0.5014895  0.4015368  0.05745714 0.04764199]] [1.]
None [1] [[ 0.50952023  0.5957898   0.05840998 -0.22637357]] [1.]
None [1] [[ 0.52143604  0.7900304   0.05388251 -0.5000749 ]] [1.]
None [0] [[ 0.53723663  0.

None [1] [[ 0.51042646  0.7887823   0.0968321  -0.47422048]] [1.]
None [0] [[ 0.5262021   0.5924358   0.08734769 -0.15265626]] [1.]
None [1] [[ 0.53805083  0.7862055   0.08429457 -0.41655514]] [1.]
None [0] [[ 0.5537749   0.5899964   0.07596347 -0.09853323]] [1.]
None [1] [[ 0.5655748   0.78395206  0.0739928  -0.36631596]] [1.]
None [1] [[ 0.5812539   0.97794884  0.06666648 -0.6347808 ]] [1.]
None [0] [[ 0.60081285  0.78196347  0.05397086 -0.3218704 ]] [1.]
None [1] [[ 0.61645216  0.97627693  0.04753346 -0.5970565 ]] [1.]
None [0] [[ 0.6359777   0.7805232   0.03559233 -0.28978795]] [1.]
None [1] [[ 0.65158814  0.97512     0.02979657 -0.5710364 ]] [1.]
None [0] [[ 0.67109054  0.77959317  0.01837584 -0.2691174 ]] [1.]
None [1] [[ 0.6866824   0.97444814  0.01299349 -0.5559483 ]] [1.]
None [0] [[ 0.70617133  0.7791462   0.00187452 -0.25920013]] [1.]
None [1] [[ 0.7217543   0.9742413  -0.00330948 -0.5512912 ]] [1.]
None [0] [[ 0.74123913  0.779166   -0.0143353  -0.25965282]] [1.]
None [1] [

None [1] [[ 0.28524485  0.7708638   0.01054477 -0.4273353 ]] [1.]
None [0] [[ 0.30066213  0.57559407  0.00199806 -0.13134693]] [1.]
None [1] [[ 3.1217399e-01  7.7068734e-01 -6.2887440e-04 -4.2339882e-01]] [1.]
None [0] [[ 0.32758775  0.5755743  -0.00909685 -0.13091423]] [1.]
None [1] [[ 0.33909923  0.7708253  -0.01171514 -0.42645314]] [1.]
None [0] [[ 0.35451573  0.5758713  -0.0202442  -0.13748628]] [1.]
None [1] [[ 0.36603317  0.77127725 -0.02299392 -0.43648654]] [1.]
None [0] [[ 0.38145873  0.5764882  -0.03172366 -0.15113986]] [1.]
None [1] [[ 0.39298847  0.7720497  -0.03474645 -0.4536597 ]] [1.]
None [0] [[ 0.40842947  0.5774359  -0.04381965 -0.17212857]] [1.]
None [0] [[ 0.4199782   0.38296762 -0.04726222  0.10641498]] [1.]
None [1] [[ 0.42763755  0.57873386 -0.04513392 -0.20079643]] [1.]
None [0] [[ 0.43921223  0.3842855  -0.04914984  0.07731422]] [1.]
None [1] [[ 0.44689792  0.58007634 -0.04760356 -0.23046161]] [1.]
None [0] [[ 0.45849946  0.3856658  -0.05221279  0.04683368]] [1.

None [0] [[ 0.598315   -0.72655493  0.03355746  0.5124406 ]] [1.]
None [1] [[ 0.5837839  -0.5319213   0.04380627  0.2305186 ]] [1.]
None [0] [[ 0.5731455  -0.727641    0.04841664  0.53669137]] [1.]
None [1] [[ 0.5585927  -0.53323203  0.05915047  0.2596486 ]] [1.]
None [1] [[ 0.54792804 -0.3390022   0.06434345 -0.01380654]] [1.]
None [0] [[ 0.541148   -0.53498507  0.06406731  0.29846337]] [1.]
None [1] [[ 0.5304483  -0.34083208  0.07003658  0.02665431]] [1.]
None [0] [[ 0.52363163 -0.5368849   0.07056966  0.34058648]] [1.]
None [1] [[ 0.512894   -0.34283426  0.07738139  0.0709653 ]] [1.]
None [0] [[ 0.5060373  -0.5389754   0.0788007   0.38702404]] [1.]
None [1] [[ 0.49525777 -0.34505534  0.08654118  0.12019   ]] [1.]
None [0] [[ 0.48835665 -0.54130375  0.08894499  0.4388723 ]] [1.]
None [1] [[ 0.47753057 -0.34754583  0.09772243  0.17549972]] [1.]
None [1] [[ 0.47057965 -0.15394837  0.10123242 -0.08482629]] [1.]
None [0] [[ 0.4675007  -0.35036474  0.0995359   0.23800144]] [1.]
None [1] [

None [0] [[ 0.8203886   0.76042956 -0.02232812 -0.19728075]] [1.]
None [1] [[ 0.8355972   0.95586365 -0.02627374 -0.49692273]] [1.]
None [0] [[ 0.85471445  0.76112187 -0.03621219 -0.21263441]] [1.]
None [1] [[ 0.8699369   0.95674235 -0.04046488 -0.5165168 ]] [1.]
None [0] [[ 0.88907176  0.7622128  -0.05079522 -0.23685487]] [1.]
None [1] [[ 0.904316    0.9580223  -0.05553231 -0.5451176 ]] [1.]
None [0] [[ 0.92347646  0.76372284 -0.06643467 -0.27043578]] [1.]
None [1] [[ 0.9387509   0.9597268  -0.07184339 -0.5833106 ]] [1.]
None [0] [[ 0.95794547  0.76568097 -0.08350959 -0.3140965 ]] [1.]
None [1] [[ 0.9732591   0.9618871  -0.08979152 -0.63190335]] [1.]
None [0] [[ 0.9924968   0.76812506 -0.10242959 -0.3687944 ]] [1.]
None [0] [[ 1.0078593   0.57459635 -0.10980548 -0.1100845 ]] [1.]
None [1] [[ 1.0193512   0.7711065  -0.11200717 -0.43529242]] [1.]
None [0] [[ 1.0347733   0.5777336  -0.12071302 -0.17991196]] [1.]
None [1] [[ 1.0463281   0.7743575  -0.12431125 -0.50810516]] [1.]
None [0] [

In [47]:
import gym
from gym import spaces
import numpy as np

class CustomEnv(gym.Env):
    """Toy environment: a scalar state pushed around by a continuous action.

    The state starts at 0, each step adds the action value to it, and the
    result is clipped to ``[0, 4]``.  The episode is flagged done (with a
    reward of 1.0) when the state equals 4.

    NOTE(review): ``observation_space`` is declared ``Discrete(5)``, but the
    state becomes a float as soon as a float action is added to it -- confirm
    whether the state should be rounded or the space should be a ``Box``.
    """

    def __init__(self):
        self.state = 0
        # Five discrete states are declared; one continuous action in [-1, 1].
        self.observation_space = spaces.Discrete(5)
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

    def reset(self):
        """Put the environment back in state 0 and return that state."""
        self.state = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``action`` must expose ``.item()`` returning a scalar; that scalar is
        added to the current state before clipping to ``[0, 4]``.
        """
        shifted = self.state + action.item()
        self.state = np.clip(shifted, 0, 4)

        # The goal test drives both the termination flag and the reward.
        at_goal = self.state == 4
        reward = 1.0 if at_goal else 0.0

        # No extra diagnostics: the info dictionary is always empty.
        return self.state, reward, at_goal, {}

# Smoke test for CustomEnv: run 20 random-action steps and print each result.
# The loop neither stops nor resets when `done` becomes True.
if __name__ == "__main__":
    env = CustomEnv()
    obs = env.reset()
    for _ in range(20):
        action = env.action_space.sample()  # random action drawn from the declared action space
        # step() returns the 4-tuple (state, reward, done, info).
        obs, reward, done, info = env.step(action)
        print(f"State: {obs}, Reward: {reward}, Done: {done}")


State: 0.9845199584960938, Reward: 0.0, Done: False
State: 0.6282570064067841, Reward: 0.0, Done: False
State: 1.2579917013645172, Reward: 0.0, Done: False
State: 0.57527294754982, Reward: 0.0, Done: False
State: 0.09608867764472961, Reward: 0.0, Done: False
State: 0.7159344255924225, Reward: 0.0, Done: False
State: 0.9144601970911026, Reward: 0.0, Done: False
State: 0.10994555056095123, Reward: 0.0, Done: False
State: 0.176019087433815, Reward: 0.0, Done: False
State: 0.0, Reward: 0.0, Done: False
State: 0.0, Reward: 0.0, Done: False
State: 0.0, Reward: 0.0, Done: False
State: 0.0, Reward: 0.0, Done: False
State: 0.09415487200021744, Reward: 0.0, Done: False
State: 0.0, Reward: 0.0, Done: False
State: 0.46615830063819885, Reward: 0.0, Done: False
State: 1.3211254179477692, Reward: 0.0, Done: False
State: 1.4971325099468231, Reward: 0.0, Done: False
State: 0.8313176929950714, Reward: 0.0, Done: False
State: 1.059971734881401, Reward: 0.0, Done: False
