<a href="https://colab.research.google.com/github/rawatpranjal/double-auctions/blob/main/code/3_deep_reinforcement_learning/07_PPO_A2C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
%cd /content/double-auctions/code/3_deep_reinforcement_learning
from functions import *
from itertools import count
buyer_strategies = ['Honest', 'Random']
seller_strategies = ['Random', 'Random', 'Random', 'Random', 'Random', 'Random']
nbuyers, nsellers = len(buyer_strategies), len(seller_strategies)
nrounds, nperiods, ntokens, nsteps, gametype, nbuyers, nsellers = 1, 100, 4, 6, '1234', len(buyer_strategies), len(seller_strategies)
R1, R2, R3, R4 = gametype_to_ran(gametype)
game_metadata = [nrounds, nperiods, ntokens, nbuyers, nsellers, nsteps, R1, R2, R3, R4]
db = Database(game_metadata, buyer_strategies, seller_strategies)
rnd = 0
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0

/content/double-auctions/code/3_deep_reinforcement_learning


In [5]:
class TradingEnv(gym.Env):
    def __init__(self, db, nsteps, render_mode = None):
        self.rnd = 0
        self.period = -1
        self.db = db
        self.action_space = spaces.Box(0,1,(1,),dtype=np.float)
        self.observation_space = spaces.Box(0,nsteps,(1,),dtype=np.float)

    def reset(self,seed=None):
        self.db.reset_period(self.rnd)
        self.timestep = 0
        self.period += 1
        observation = np.array([0])
        return observation, None

    def step(self, action, seed=None, options=None):
        [buyer.next_token() for buyer in self.db.buyers]
        [seller.next_token() for seller in self.db.sellers]
        bid_frac = action.item()
        # convert action to bid
        self.db.buyers[0].next_token()
        min_bid = self.db.buyers[0].value * min_frac
        max_bid = self.db.buyers[0].value * max_frac
        bid = np.round(max_bid * bid_frac + (1 - bid_frac) * min_bid, 2)

        # simulate market
        bids = [buyer.ask(self.db) for buyer in self.db.buyers]
        bids[0] = bid
        asks = [seller.ask(self.db) for seller in self.db.sellers]
        current_ask, current_ask_idx, current_bid, current_bid_idx = current_bid_ask(bids, asks)
        sale, price, bprofit, sprofit, buy, sell = buy_sell(self.db, current_bid, current_bid_idx, current_ask, current_ask_idx)
        step_data = [self.rnd, self.period, self.timestep, bids, asks, current_bid, current_bid_idx, current_ask, current_ask_idx, buy, sell, price, sale, bprofit, sprofit]
        self.db.add_step(step_data)

        # compute reward, new state
        reward = np.array([0])
        if sale == 1 and current_bid_idx == 0:
            reward = np.array([bprofit])
        observation = np.array([self.timestep + 1])

        # check termination
        self.timestep += 1
        if self.timestep == nsteps - 1:
            terminated = True
            self.timestep = 0
        else:
            terminated = False
        return observation, reward, terminated, {}, {}

In [None]:
env = TradingEnv(db, nsteps)
rnd = 0
observation, info = env.reset()
for period in count():
    for timestep in count():
        action = env.action_space.sample()
        observation, reward, done, info, _ = env.step(action)
        print(f"Rnd: {rnd}, Period: {period}, New State: {observation}, Action:{np.round(action,1)}, Reward: {np.round(reward,1)}, Period End: {done}")
        if done:
            # If the episode is done, reset the environment
            print('done')
            observation, info = env.reset()
            timestep += 1
            break
    if period == nperiods:
        break
env.close()

### PPO + A2C

In [11]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.evaluation import evaluate_policy

# Define your environment and parameters (replace with your actual environment setup)
rnd = 0
period = 0
num_states = nsteps
min_frac = 0.01
max_frac = 1.0
db = Database(game_metadata, buyer_strategies, seller_strategies)
db.reset_round(rnd, ntokens, nbuyers, nsellers, R1, R2, R3, R4)
env = TradingEnv(db, nsteps)
eval_steps = 100
training_step = 10000

# Create PPO model
from stable_baselines3.ppo.policies import MlpPolicy
random_model = A2C(MlpPolicy, env, verbose=0)

# Evaluate the random policy agent
mean_reward_random, std_reward_random = evaluate_policy(random_model, env, n_eval_episodes=eval_steps)
print(f"Random Policy mean_reward: {mean_reward_random:.2f} +/- {std_reward_random:.2f}")

# Create PPO model
from stable_baselines3.ppo.policies import MlpPolicy
ppo_model = PPO(MlpPolicy, env, verbose=0, device="cuda")

# Train the PPO agent for 10000 steps
ppo_model.learn(total_timesteps=training_step, progress_bar = True)

# Evaluate the trained PPO agent
mean_reward_ppo, std_reward_ppo = evaluate_policy(ppo_model, env, n_eval_episodes=eval_steps)
print(f"PPO mean_reward: {mean_reward_ppo:.2f} +/- {std_reward_ppo:.2f}")

# Create A2C model
from stable_baselines3.ppo.policies import MlpPolicy
a2c_model = A2C(MlpPolicy, env, verbose=0, device="cuda")

# Train the A2C agent for 10000 steps
a2c_model.learn(total_timesteps=training_step)

# Evaluate the trained A2C agent
mean_reward_a2c, std_reward_a2c = evaluate_policy(a2c_model, env, n_eval_episodes=eval_steps)
print(f"A2C mean_reward: {mean_reward_a2c:.2f} +/- {std_reward_a2c:.2f}")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.action_space = spaces.Box(0,1,(1,),dtype=np.float)


Output()

Random Policy mean_reward: 0.00 +/- 0.00


PPO mean_reward: 86.20 +/- 2.74
A2C mean_reward: 75.35 +/- 2.42
