## Edgeworth Cycling Multi Agent Environment.

In [1]:
!pip install ray[rllib]==2.7.0

Defaulting to user installation because normal site-packages is not writeable
Collecting ray[rllib]==2.7.0
  Downloading ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl (62.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: ray
  Attempting uninstall: ray
    Found existing installation: ray 2.11.0
    Uninstalling ray-2.11.0:
      Successfully uninstalled ray-2.11.0
Successfully installed ray-2.7.0


In [2]:
import random
import gymnasium as gym
from gymnasium import spaces, vector
import numpy as np
from ray import tune
from scipy.stats import binom, nbinom, beta, poisson, gamma, norm, geom
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from functools import partial



In [3]:
'''
This code is a slightly simplified version of the environment found in the paper:
Oligopoly competition in fixed cost environments
https://www.sciencedirect.com/science/article/pii/S0167718703001577
'''

import numpy as np
import gymnasium as gym
from functools import partial
from gymnasium import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv

def calculate_sales(offers, intercept, slope):
    total_sold = 0
    sales = {}

    offers = sorted([(offer[0], offer[1], aid) for aid, offer in offers.items()])
    i = 0
    while i < len(offers):
        # Find all offers with the same price at the current index
        current_price = offers[i][0]
        next_i = i+1
        while next_i < len(offers) and offers[next_i][0] == current_price:
            next_i += 1
        n_equal = next_i - i

        # Calculate remaining demand at this price
        demand = (intercept - current_price) / slope - total_sold
        demand = max(0,demand)
        for k in range(i, next_i):
            demand_share = demand / n_equal

            quantity = offers[k][1]
            firm_id = offers[k][2]
            actual_sold = min(quantity, demand_share)
            sales[firm_id] = actual_sold
            total_sold += actual_sold
            demand -= actual_sold

            n_equal -= 1
        i = next_i
    return sales


class OligopolyMarket(MultiAgentEnv):
    def __init__(self, seed=None):
        super().__init__()
        self.n_firms = 5
        self._agent_ids = [f'firm_{i}' for i in range(self.n_firms)]

        self.history_length = 5
        self.price_history = []
        self.intercept = 17
        self.slope = 0.01
        self.n_periods = 80

        # Actions are tuples of (price, quantity) for each firm
        self.ind_action_space = spaces.Tuple((
            spaces.Box(low=0,high=self.intercept),
            spaces.Box(low=0,high=300)
        ))

        self.ind_observation_space = spaces.Box(
            low=0,
            high=self.intercept,
            shape=(self.history_length,)
        )
        self.action_space = gym.spaces.Dict(self.make_agent_dictionary(self.ind_action_space))
        self.observation_space = gym.spaces.Dict(self.make_agent_dictionary(self.ind_observation_space))

        self.costs = [4.0]*100 + [4.5]*100 + [5.0]*100
        self.sales_calc = partial(calculate_sales, intercept=self.intercept, slope=self.slope)

    def make_agent_dictionary(self, res):
        return {agent_id: res for agent_id in self._agent_ids}

    def get_observation(self):
        price_history = list(reversed(self.price_history[-self.history_length:]))
        price_history = np.array(price_history, dtype=np.float32)
        return self.make_agent_dictionary(price_history)

    def reset(self, seed=None, options=None):
        self.price_history = [0]*self.history_length
        self.period = 0
        return self.get_observation(), {}

    def step(self, actions):
        self.period += 1
        prices = {aid: np.round(action[0][0],2) for aid,action in actions.items()}
        quantities = {aid: action[1][0] for aid,action in actions.items()}
        offers = {aid: (prices[aid],quantities[aid]) for aid in self._agent_ids}
        sales = self.sales_calc(offers)
        profits = {}
        for aid in self._agent_ids:
            q = sales[aid]
            p = prices[aid]
            costs = sum(self.costs[:int(q)])
            # Add costs in event they sold fraction of a unit
            if q < len(self.costs):
                costs += (q - int(q))*self.costs[int(q)]
            profits[aid] = p*q - costs
        tot_sales = sum(sales.values())
        if tot_sales > 0:
            avg_price = sum(sales[aid]*prices[aid] for aid in sales.keys())/sum(sales.values())
        else:
            avg_price = self.intercept
        self.price_history.append(avg_price)

        rewards = self.get_rewards(profits)
        done = self.period == self.n_periods
        dones = self.make_agent_dictionary(done)
        dones['__all__'] = done
        truncated = dones
        info = {}
        return self.get_observation(), rewards, dones, truncated, info

    def get_evaluation_reward(self, profits):
        return profits

    def get_rewards(self, profits):
        return self.get_evaluation_reward(profits)

In [4]:
# Test Code

offers = {
    'firm1': (12, 50),
    'firm2': (11, 400),
    'firm3': (13, 100)
}

intercept = 17
slope = 0.01

result = calculate_sales(offers, intercept, slope)
print(result)


{'firm2': 400, 'firm1': 50, 'firm3': 0.0}


In [5]:
env = OligopolyMarket()

In [6]:
env.observation_space.sample()

OrderedDict([('firm_0',
              array([ 0.45141014,  0.16443944, 14.493833  , 16.606543  ,  1.2780769 ],
                    dtype=float32)),
             ('firm_1',
              array([12.923358, 12.776037, 13.560927, 10.538004, 13.373651],
                    dtype=float32)),
             ('firm_2',
              array([ 0.48759073, 14.794913  , 14.728542  , 16.857182  ,  9.074473  ],
                    dtype=float32)),
             ('firm_3',
              array([ 3.0088277,  8.1761265,  3.7867527, 11.710337 ,  9.020187 ],
                    dtype=float32)),
             ('firm_4',
              array([11.312584 , 11.576518 ,  4.051092 ,  3.8614366,  4.92761  ],
                    dtype=float32))])

In [7]:
env.reset()

({'firm_0': array([0., 0., 0., 0., 0.], dtype=float32),
  'firm_1': array([0., 0., 0., 0., 0.], dtype=float32),
  'firm_2': array([0., 0., 0., 0., 0.], dtype=float32),
  'firm_3': array([0., 0., 0., 0., 0.], dtype=float32),
  'firm_4': array([0., 0., 0., 0., 0.], dtype=float32)},
 {})

In [8]:
env.observation_space.contains(env.reset()[0])

True

In [9]:
obs, _,_,_,_ = env.step(env.action_space.sample())

In [10]:
env.observation_space.contains(obs)

True

In [11]:
env.reset()
done = False
rewards = []
while not done:
    obs, reward, done, _, _ = env.step(env.action_space.sample())
    done = done['__all__']
    rewards.append(reward)

In [12]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [13]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=4)

2024-05-12 20:54:19,568	INFO worker.py:1642 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.7.0




## Asymetric Policys

In [14]:
agent_ids = OligopolyMarket()._agent_ids
sym_policies = {agent_id: f"policy_firm_0" for agent_id in agent_ids}  # Symmetric
asym_policies = {agent_id: f"policy_{agent_id}" for agent_id in agent_ids}  # Asymmetric
exploit_policies = asym_policies.copy()
#exploit_policies[list(exploit_policies.keys())[-1]] = "policy_firm_ex"

In [15]:
asym_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_1',
 'firm_2': 'policy_firm_2',
 'firm_3': 'policy_firm_3',
 'firm_4': 'policy_firm_4'}

In [None]:
def policy_mapping_fn(agent_id, episode, worker, *, policies=None, **kwargs):
    return policies[agent_id]


policies = asym_policies
policy_mapping = partial(policy_mapping_fn, policies=policies)

config = (
        PPOConfig()
        .environment(OligopolyMarket)
        .framework('torch')
        .training(train_batch_size=1024, gamma=0.99)
        .resources(num_gpus=0)
        .rollouts(num_rollout_workers=3)
        .multi_agent(policies=list(policies.values()),
                     policy_mapping_fn=policy_mapping)
    )

stop = {"timesteps_total": 100000}

# THE WAY OF CODING IS A BIT DIFFERENT HERE FROM THE SINGLE AGENT ONE.
tuner = tune.Tuner(
        "PPO",
        param_space=config.to_dict(),
        run_config=air.RunConfig(stop=stop),
    )


tuner.fit()

0,1
Current time:,2024-05-12 20:55:26
Running for:,00:00:55.64
Memory:,7.9/125.7 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_OligopolyMarket_807e6_00000,RUNNING,192.168.1.114:8832,5,42.0197,5120,49988,91295.6,6751.18,80




 **path=/root/ray_results/PPO_2024-05-11_13-40-02/PPO_OligopolyMarket_f84b3_00000_0_2024-05-11_13-40-03/checkpoint_000000**

In [None]:
from ray.rllib.algorithms.algorithm import Algorithm

In [None]:
algo_asym = Algorithm.from_checkpoint("/root/ray_results/PPO_2024-05-11_13-40-02/PPO_OligopolyMarket_f84b3_00000_0_2024-05-11_13-40-03/checkpoint_000000")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2m[36m(pid=120909)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=120909)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=120909)[0m   _, shape = ModelCatalog.get_action_shape(
[2m[36m(RolloutWorker pid=120909)[0m   self._preproces

In [None]:
algo_asym.save("asym_pricing_model_checkpoint")

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=asym_pricing_model_checkpoint), metrics={})

##Symetric Policys##

In [None]:
agent_ids = OligopolyMarket()._agent_ids
sym_policies = {agent_id: f"policy_firm_0" for agent_id in agent_ids}  # Symmetric
asym_policies = {agent_id: f"policy_{agent_id}" for agent_id in agent_ids}  # Asymmetric
exploit_policies = sym_policies.copy()
#exploit_policies[list(exploit_policies.keys())[-1]] = "policy_firm_ex"

In [None]:
sym_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_0',
 'firm_2': 'policy_firm_0',
 'firm_3': 'policy_firm_0',
 'firm_4': 'policy_firm_0'}

In [None]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=4)

  self.pid = _posixsubprocess.fork_exec(
2024-05-11 14:33:29,398	INFO worker.py:1642 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.7.0


In [None]:
def policy_mapping_fn(agent_id, episode, worker, *, policies=None, **kwargs):
    return policies[agent_id]


policies = sym_policies
policy_mapping = partial(policy_mapping_fn, policies=policies)

config = (
        PPOConfig()
        .environment(OligopolyMarket)
        .framework('torch')
        .training(train_batch_size=1024, gamma=0.99)
        .resources(num_gpus=0)
        .rollouts(num_rollout_workers=3)
        .multi_agent(policies=list(policies.values()),
                     policy_mapping_fn=policy_mapping)
    )

stop = {"timesteps_total": 100000}

# THE WAY OF CODING IS A BIT DIFFERENT HERE FROM THE SINGLE AGENT ONE.
tuner = tune.Tuner(
        "PPO",
        param_space=config.to_dict(),
        run_config=air.RunConfig(stop=stop),
    )


tuner.fit()

2024-05-11 14:33:35,495	INFO tune.py:654 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+------------------------------------------------------------+
| Configuration for experiment     PPO_2024-05-11_14-33-35   |
+------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator     |
| Scheduler                        FIFOScheduler             |
| Number of trials                 1                         |
+------------------------------------------------------------+

View detailed results here: /root/ray_results/PPO_2024-05-11_14-33-35
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/PPO_2024-05-11_14-33-35`

Trial status: 1 PENDING
Current time: 2024-05-11 14:33:35. Total running time: 0s
Logical resource usage: 0/4 CPUs, 0/0 GPUs
+--------------------------------------------+
| Trial name                        status   |
+--------------------------------------------+
| PPO_OligopolyMarket_72fd0_00000   PENDING  |
+--------------------------------------------+


[2m[36m(pid=121598)[0m   if (distutils.version.LooseVersion(tf.__version__) <


Trial status: 1 PENDING
Current time: 2024-05-11 14:34:05. Total running time: 30s
Logical resource usage: 4.0/4 CPUs, 0/0 GPUs
+--------------------------------------------+
| Trial name                        status   |
+--------------------------------------------+
| PPO_OligopolyMarket_72fd0_00000   PENDING  |
+--------------------------------------------+


[2m[36m(pid=121673)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=121672)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=121673)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=121673)[0m   _, shape = ModelCatalog.get_action_shape(
[2m[36m(RolloutWorker pid=121673)[0m   self._preprocessor = get_preprocessor(obs_space)(



Trial PPO_OligopolyMarket_72fd0_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_OligopolyMarket_72fd0_00000 config                              |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config/type        StochasticSampling |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                          True |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _enable_learner_api                                                  True |
| _enable_rl_module_api                                                True |
| _fake_gpus                                                          False |
| _is_atari                                                                

[2m[36m(PPO pid=121598)[0m Trainable.setup took 27.443 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(PPO pid=121598)[0m Install gputil for GPU system monitoring.
[2m[36m(pid=121717)[0m   if (distutils.version.LooseVersion(tf.__version__) <



Trial status: 1 RUNNING
Current time: 2024-05-11 14:34:35. Total running time: 1min 0s
Logical resource usage: 4.0/4 CPUs, 0/0 GPUs
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                        status       iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| PPO_OligopolyMarket_72fd0_00000   RUNNING         1            20.8916   1024      82296                 100013                61101.3                   80                     12 |
+--------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(PPO pid=121598)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-05-11_14-33-35/PPO_OligopolyMarket_72fd0_00000_0_2024-05-11_14-33-35/checkpoint_000000)
[2m[36m(PPO pid=121598)[0m   prep = cls(observation_space, options)[32m [repeated 3x across cluster][0m
[2m[36m(RolloutWorker pid=121717)[0m   _, shape = ModelCatalog.get_action_shape([32m [repeated 3x across cluster][0m
[2m[36m(RolloutWorker pid=121717)[0m   self._preprocessor = get_preprocessor(obs_space)([32m [repeated 3x across cluster][0m





ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'__all__': {'num_agent_steps_trained': 128.0, 'num_env_steps_trained': 5120.0, 'total_loss': 9.968995764255524}, 'policy_firm_0': {'total_loss': 9.968995764255524, 'policy_loss': 0.003862616914945344, 'vf_loss': 9.958984487851462, 'vf_loss_unclipped': 296778487.87333333, 'vf_explained_var': -2.2485852241516114e-06, 'entropy': -1.848565861483415, 'mean_kl_loss': 0.00910908493668103, 'default_optimizer_lr': 5.000000000000001e-05, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 0.675000011920929}}, 'num_env_steps_sampled': 100352, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 501760, 'num_agent_steps_trained': 0}, 'sampler_results': {'episode_reward_max': 215479.6006716448, 'episode_reward_min': 207010.75716089614, 'episode_reward_mean': 210840.76620346867, 'episode_len_mean': 80.0, 'episode_media': {}, 'episodes_this_iter': 12, 'policy_reward_min': {'policy_firm_0': 

**path=/root/ray_results/PPO_2024-05-11_14-33-35/PPO_OligopolyMarket_72fd0_00000_0_2024-05-11_14-33-35/checkpoint_000000**

In [None]:
algo_sym = Algorithm.from_checkpoint("/root/ray_results/PPO_2024-05-11_14-33-35/PPO_OligopolyMarket_72fd0_00000_0_2024-05-11_14-33-35/checkpoint_000000")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2m[36m(pid=130963)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=130963)[0m   prep = cls(observation_space, options)
[2m[36m(pid=130987)[0m   if (distutils.version.LooseVersion(tf.__version__) <[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorke

In [None]:
algo_sym.save("sym_pricing_model_checkpoint")

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=sym_pricing_model_checkpoint), metrics={})

##Policy For Firm_x

In [None]:
agent_ids = OligopolyMarket()._agent_ids
firm_x_policies = {agent_id: f"policy_firm_0" for agent_id in agent_ids}  # Symmetric
asym_policies = {agent_id: f"policy_{agent_id}" for agent_id in agent_ids}  # Asymmetric
exploit_policies = firm_x_policies.copy()
exploit_policies[list(exploit_policies.keys())[-1]] = "policy_firm_ex"

In [None]:
exploit_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_0',
 'firm_2': 'policy_firm_0',
 'firm_3': 'policy_firm_0',
 'firm_4': 'policy_firm_ex'}

In [None]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=4)

  self.pid = _posixsubprocess.fork_exec(
2024-05-11 15:35:08,936	INFO worker.py:1642 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.7.0


In [None]:
def policy_mapping_fn(agent_id, episode, worker, *, policies=None, **kwargs):
    return policies[agent_id]


policies = exploit_policies
policy_mapping = partial(policy_mapping_fn, policies=policies)

config = (
        PPOConfig()
        .environment(OligopolyMarket)
        .framework('torch')
        .training(train_batch_size=1024, gamma=0.99)
        .resources(num_gpus=0)
        .rollouts(num_rollout_workers=3)
        .multi_agent(policies=list(policies.values()),
                     policy_mapping_fn=policy_mapping)
    )

stop = {"timesteps_total": 100000}

# THE WAY OF CODING IS A BIT DIFFERENT HERE FROM THE SINGLE AGENT ONE.
tuner = tune.Tuner(
        "PPO",
        param_space=config.to_dict(),
        run_config=air.RunConfig(stop=stop),
    )


tuner.fit()

2024-05-11 15:35:18,417	INFO tune.py:654 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+------------------------------------------------------------+
| Configuration for experiment     PPO_2024-05-11_15-35-18   |
+------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator     |
| Scheduler                        FIFOScheduler             |
| Number of trials                 1                         |
+------------------------------------------------------------+

View detailed results here: /root/ray_results/PPO_2024-05-11_15-35-18
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/PPO_2024-05-11_15-35-18`

Trial status: 1 PENDING
Current time: 2024-05-11 15:35:18. Total running time: 0s
Logical resource usage: 0/4 CPUs, 0/0 GPUs
+--------------------------------------------+
| Trial name                        status   |
+--------------------------------------------+
| PPO_OligopolyMarket_121a4_00000   PENDING  |
+--------------------------------------------+


[2m[36m(pid=136723)[0m   if (distutils.version.LooseVersion(tf.__version__) <


Trial status: 1 PENDING
Current time: 2024-05-11 15:35:48. Total running time: 30s
Logical resource usage: 4.0/4 CPUs, 0/0 GPUs
+--------------------------------------------+
| Trial name                        status   |
+--------------------------------------------+
| PPO_OligopolyMarket_121a4_00000   PENDING  |
+--------------------------------------------+


[2m[36m(pid=136796)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=136795)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=136796)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=136796)[0m   _, shape = ModelCatalog.get_action_shape(
[2m[36m(RolloutWorker pid=136796)[0m   self._preprocessor = get_preprocessor(obs_space)(



Trial PPO_OligopolyMarket_121a4_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_OligopolyMarket_121a4_00000 config                              |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config/type        StochasticSampling |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                          True |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _enable_learner_api                                                  True |
| _enable_rl_module_api                                                True |
| _fake_gpus                                                          False |
| _is_atari                                                                

[2m[36m(PPO pid=136723)[0m Trainable.setup took 25.809 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(PPO pid=136723)[0m Install gputil for GPU system monitoring.



Trial status: 1 RUNNING
Current time: 2024-05-11 15:36:18. Total running time: 1min 0s
Logical resource usage: 4.0/4 CPUs, 0/0 GPUs
+--------------------------------------------+
| Trial name                        status   |
+--------------------------------------------+
| PPO_OligopolyMarket_121a4_00000   RUNNING  |
+--------------------------------------------+
Trial status: 1 RUNNING
Current time: 2024-05-11 15:36:48. Total running time: 1min 30s
Logical resource usage: 4.0/4 CPUs, 0/0 GPUs
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                        status       iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter |
+------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(PPO pid=136723)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-05-11_15-35-18/PPO_OligopolyMarket_121a4_00000_0_2024-05-11_15-35-18/checkpoint_000000)
[2m[36m(pid=136827)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(PPO pid=136723)[0m   prep = cls(observation_space, options)[32m [repeated 3x across cluster][0m
[2m[36m(PPO pid=136723)[0m   _, shape = ModelCatalog.get_action_shape([32m [repeated 3x across cluster][0m
[2m[36m(PPO pid=136723)[0m   self._preprocessor = get_preprocessor(obs_space)([32m [repeated 3x across cluster][0m





ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'__all__': {'num_agent_steps_trained': 256.0, 'num_env_steps_trained': 4096.0, 'total_loss': 19.72504282196363}, 'policy_firm_0': {'total_loss': 19.72504282196363, 'policy_loss': 0.0020325454126577823, 'vf_loss': 9.936523481210072, 'vf_loss_unclipped': 374327724.0, 'vf_explained_var': -1.679609219233195e-06, 'entropy': 0.13192211257410236, 'mean_kl_loss': 0.008923660598657079, 'default_optimizer_lr': 5.0000000000000016e-05, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 0.45000001788139343}, 'policy_firm_ex': {'total_loss': 9.782471082607906, 'policy_loss': -0.020538019388914107, 'vf_loss': 9.794921969374021, 'vf_loss_unclipped': 91016179.35833333, 'vf_explained_var': -3.38988999525706e-06, 'entropy': 0.4793036703020334, 'mean_kl_loss': 0.010649360318529944, 'default_optimizer_lr': 5.0000000000000016e-05, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 0.7593

In [None]:
algo_firm_x = Algorithm.from_checkpoint("/root/ray_results/PPO_2024-05-11_15-35-18/PPO_OligopolyMarket_121a4_00000_0_2024-05-11_15-35-18/checkpoint_000000")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2m[36m(pid=149821)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(RolloutWorker pid=149821)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=149821)[0m   _, shape = ModelCatalog.get_action_shape(
[2m[36m(RolloutWorker pid=149821)[0m   self._preproces

In [None]:
algo_firm_x.save("firm_x_pricing_model_checkpoint")

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=firm_x_pricing_model_checkpoint), metrics={})

##Chech Which Model Works Best

> Add blockquote



In [None]:
strats = ['sym','sym','sym','sym','firm_x']
MAX_PERIODS = 100  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[7868805.483309196,
 7701378.075780671,
 7981175.060317067,
 7866457.872416034,
 7647893.961516673]

In [None]:
sym_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_0',
 'firm_2': 'policy_firm_0',
 'firm_3': 'policy_firm_0',
 'firm_4': 'policy_firm_0'}

In [None]:
asym_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_1',
 'firm_2': 'policy_firm_2',
 'firm_3': 'policy_firm_3',
 'firm_4': 'policy_firm_4'}

In [None]:
exploit_policies

{'firm_0': 'policy_firm_0',
 'firm_1': 'policy_firm_0',
 'firm_2': 'policy_firm_0',
 'firm_3': 'policy_firm_0',
 'firm_4': 'policy_firm_ex'}

In [None]:
strats = ['asym','asym','asym','asym','firm_x']
MAX_PERIODS = 100  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[8088503.470230487,
 7884724.666012604,
 8219393.389706542,
 8115864.116610322,
 7772624.991377154]

In [None]:
strats = ['sym','sym','sym','sym','sym']
MAX_PERIODS = 100  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[8386529.467970644,
 8196877.75645042,
 8525609.49879098,
 8404620.980341293,
 8055432.451647385]

In [None]:
strats = ['sym','asym','sym','asym','sym']
MAX_PERIODS = 100  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[9732092.177677829,
 9477876.355834665,
 10000360.248714957,
 9841854.452414248,
 9320810.835670931]

In [None]:
strats = ['asym','sym','asym','sym','asym']
MAX_PERIODS = 100  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[9282728.236783892,
 9282265.762449646,
 9527306.937681038,
 9549422.083357979,
 8851212.082166031]

In [None]:
strats = ['sym','sym','asym','sym','sym']
MAX_PERIODS = 1000  # max number of steps per episode
num_simulations = 1  # Number of simulations to run

env = OligopolyMarket()
data = []
reward_history = []
for i in range(num_simulations):
    obs, info = env.reset()
    terminated = {}
    terminated["__all__"] = False
    steps = 0

    # Run an episode
    while not terminated["__all__"]:
        actions = {}
        for i, agent in enumerate(agent_ids):
            if strats[i] == 'asym':
              algo = algo_asym
              policies = asym_policies
            elif strats[i] == 'sym':
              algo = algo_sym
              policies = sym_policies
            elif strats[i] == 'firm_x':
              algo = algo_firm_x
              policies = exploit_policies
            a = algo.compute_single_action(
                observation=obs[agent],
                policy_id=policies[agent],
            )
            actions[agent] = a
        prev_obs = obs
        obs, reward, terminated, truncated, info = env.step(actions)
        steps += 1
        rewards.append(reward)

[sum(rew[agent] for rew in rewards) for agent in env._agent_ids]

[10252252.75328505,
 9892921.008326327,
 10374647.888193736,
 10364021.929887593,
 9813267.192438247]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(CustomModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # Input size to be determined based on observation space
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)  # Output size to be determined based on action space

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Define the input size and output size of the neural network based on the observation space and action space
input_size = 5  # Assuming observation space size is 5 for Firm_0
output_size = 2  # Assuming action space size is 2 for Firm_0 (Price and Quantity)

# Instantiate the custom neural network model for Firm_0
custom_model = CustomModel(input_size, output_size)

In [None]:
# Get the weights of the first layer
weights_fc1 = custom_model.fc1.weight
print("Weights of the first layer:")
print(weights_fc1)

# Get the weights of the second layer
weights_fc2 = custom_model.fc2.weight
print("Weights of the second layer:")
print(weights_fc2)

# Get the weights of the third layer (output layer)
weights_fc3 = custom_model.fc3.weight
print("Weights of the third layer (output layer):")
print(weights_fc3)

Weights of the first layer:
Parameter containing:
tensor([[ 0.1888, -0.3615,  0.3525,  0.0848, -0.1149],
        [ 0.0977, -0.3887,  0.0709,  0.0625, -0.2448],
        [ 0.3810, -0.0479, -0.3925, -0.2081, -0.3874],
        [ 0.3313, -0.2189,  0.1930,  0.2802,  0.2810],
        [ 0.0347, -0.0509,  0.1860, -0.1898,  0.1588],
        [ 0.0037, -0.2909,  0.4277,  0.4041, -0.0946],
        [ 0.2006, -0.1853, -0.4374, -0.2616,  0.4161],
        [-0.1428,  0.4285, -0.2577,  0.0018, -0.3143],
        [ 0.1251,  0.1332,  0.1848,  0.0267, -0.0420],
        [ 0.0283,  0.0672,  0.3787, -0.2943,  0.2258],
        [ 0.4148,  0.4444, -0.2638,  0.1947, -0.0497],
        [-0.2538,  0.0138, -0.1237,  0.2796, -0.0213],
        [-0.1229, -0.4378, -0.1529, -0.1545, -0.2967],
        [ 0.1381, -0.3336, -0.3458, -0.2506,  0.0820],
        [-0.3352, -0.1835, -0.3241, -0.2992, -0.0797],
        [ 0.3133, -0.0153,  0.1979, -0.3646, -0.4390],
        [ 0.1828, -0.4321,  0.0152, -0.0560, -0.3344],
        [ 0.008

##sym_model_pricing = Algorithm.from_checkpoint("/root/ray_results/PPO_2024-05-10_18-30-07/PPO_OligopolyMarket_53cca_00000_0_2024-05-10_18-30-07/checkpoint_000000")