### Crude Evaluation for Sanity Check!

In [None]:
from copy import deepcopy
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from env.energy_trading import EnergyTradingEnv

from torch import Tensor
from tensordict import TensorDict
from benchmarl.experiment import Experiment

In [None]:
# Checkpoint file of the experiment run that is evaluated in this notebook
base_exp_path = "results/mappo_simple_p2p_mlp__35a1cc94_25_08_21-22_11_46/checkpoints/checkpoint_131072.pt"
# Rebuild the BenchMARL experiment (policy, losses, buffers, task) from that file
base_exp = Experiment.reload_from_file(base_exp_path)

# Base environment, constructed from the task config stored with the experiment
# (rendering is disabled)
config = base_exp.task.config
trading_env = EnergyTradingEnv(config, render_mode=None)

# Agent IDs: members of the 'agents' group in the experiment's group map
agents = base_exp.group_map['agents']

#### Select Policy for the Actors

In [None]:
# Policy-driven actor for experiments where every agent lives in one 'agents' group
def _trade_forward(obs):
    """Run the reloaded policy once and return one action per agent.

    Args:
        obs: Mapping from agent id to that agent's observation.

    Returns:
        Dict mapping each agent id of the 'agents' group to its action,
        taken row by row from the policy output (as NumPy values).
    """
    group_ids = base_exp.group_map['agents']

    # Stack the observations in group order so that row i belongs to group_ids[i]
    obs_rows = [obs[aid] for aid in group_ids]
    policy_in = TensorDict(agents=TensorDict(observation=Tensor(obs_rows)))

    policy_out = base_exp.policy.forward(policy_in)
    action_rows = policy_out['agents']['action'].detach().cpu().numpy()

    action_dict = {}
    for row_idx, aid in enumerate(group_ids):
        action_dict[aid] = action_rows[row_idx]
    return action_dict

In [None]:
# Policy-driven actor for experiments where each agent is its own group
def _trade_forward(obs):
    """Run the reloaded policy once and return one action per agent.

    Each agent's observation is wrapped in a batch of size one under the
    agent's own key, and the matching action is unwrapped again afterwards.

    Args:
        obs: Mapping from agent id to that agent's observation.

    Returns:
        Dict mapping each id in ``agents`` to its action (as NumPy values).
    """
    per_agent_input = {}
    for aid in agents:
        per_agent_input[aid] = TensorDict(observation=Tensor([obs[aid]]))

    policy_out = base_exp.policy.forward(TensorDict(per_agent_input))

    action_dict = {}
    for aid in agents:
        batched_action = policy_out[aid]['action'].detach().cpu().numpy()
        # Drop the size-one batch dimension added above
        action_dict[aid] = batched_action[0]
    return action_dict

In [None]:
# Heuristic lookup table: time-of-use price (written as text) -> action index
pi = {"0.08":2,
      "0.13":1,
      "0.18":0}


def tou_to_action(tou, tol=1e-6):
    """Return the heuristic action for a time-of-use price.

    The price is compared numerically against the keys of ``pi`` instead of
    through its string form, so a value that went through a float32 round
    trip (e.g. 0.07999999821186066) still resolves to the "0.08" entry.

    Args:
        tou: Time-of-use price (Python float, NumPy scalar or numeric string).
        tol: Absolute tolerance used when matching against the table keys.

    Returns:
        The action stored in ``pi`` for the matching price.

    Raises:
        KeyError: If no table key lies within ``tol`` of ``tou``.
    """
    price = float(tou)
    for key, action in pi.items():
        if abs(float(key) - price) <= tol:
            return action
    raise KeyError(str(tou))


# Baseline heuristic, no RL
def _trade_forward(obs):
    """Give every agent the same price-based action from the ``pi`` table.

    Args:
        obs: Mapping from agent id to that agent's observation.

    Returns:
        Dict mapping each agent id of the 'agents' group to the table action.
    """
    # NOTE(review): assumes the second-to-last observation entry of
    # "consumer_1" is the ToU price, and that it is the same for every agent
    # -- confirm against the environment's observation layout.
    action = tou_to_action(obs["consumer_1"][-2])
    action_dict = {aid: action for aid in base_exp.group_map['agents']}

    return action_dict

#### Plot Functions

In [None]:
def plot_daily_average(x, agents, y_axis_label, plot_std=True):
    """Plot the per-timestep mean over days for each agent.

    Args:
        x: Mapping from agent id to a list of per-day series; mean and
            standard deviation are taken along the first (day) axis.
        agents: Agent ids to draw, one line each.
        y_axis_label: Label for the y axis.
        plot_std: When true, shade a band of one standard deviation around
            each mean line.
    """
    # All statistics are computed up front, before anything is drawn
    stacked = {aid: np.array(x[aid]) for aid in agents}
    day_mean = {aid: np.mean(arr, axis=0) for aid, arr in stacked.items()}
    day_std = {aid: np.std(arr, axis=0) for aid, arr in stacked.items()}

    for aid in agents:
        center = day_mean[aid]
        plt.plot(center, label=aid)

        if not plot_std:
            continue

        spread = day_std[aid]
        plt.fill_between(
            range(len(center)),
            center - spread,
            center + spread,
            alpha=0.2,
        )

    # One tick per step of an episode, labelled as a full hour
    tick_positions = range(config['eps_len'])
    tick_labels = [f"{h}:00" for h in tick_positions]

    plt.xlabel('Time of Day')
    plt.ylabel(y_axis_label)
    plt.xticks(tick_positions, tick_labels, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

#### Rollout on Entire Dataset, Analyze Results

In [None]:
# Initialize logs for all agents and days
# Layout: logs[key][agent_id][day] is a list that receives one entry per step.
log_keys = ['soc_action_log', 'price_action_log', 'soc_log', 'demand_log', 'p2p_log', 'p2p_price_log', 'reward_log']
logs = {key: {aid: [[] for _ in range(trading_env.n_days-1)] for aid in agents} for key in log_keys}

# NOTE(review): the last day (n_days-1) is left out of the rollout -- reason not
# visible here, confirm it is intentional.
for day in tqdm(range(trading_env.n_days-1), desc="Trading Days"):
    
    # Start the episode on the requested day; the same seed is used for every day
    obs, infos = trading_env.reset(seed=42, options={"day": day})

    # Step environment for an episode
    for _ in range(trading_env.eps_len):

        # Environment actor
        actions = _trade_forward(obs)

        # Log the observation the action was based on, i.e. BEFORE stepping
        for aid in agents:
            
            # Observation entries 0 and 1 are logged as demand and SoC respectively
            logs['demand_log'][aid][day].append(obs[aid][0].item())
            logs['soc_log'][aid][day].append(obs[aid][1].item())
            
            # The whole action is stored as the SoC action
            logs['soc_action_log'][aid][day].append(actions[aid])
            # Disabled alternative for two-component actions (SoC at index 1):
            # logs['soc_action_log'][aid][day].append(actions[aid][1].item())
            
            # Disabled: price component at index 0 ('price_action_log' stays empty)
            # logs['price_action_log'][aid][day].append(actions[aid][0].item())

        obs, rewards, terminations, truncations, infos = trading_env.step(actions)

        # Disabled: re-running the double auction to recover the trades
        # matches, trades, open_book = trading_env._run_double_auction(trading_env.orderbook)

        # Log the outcome of the step, i.e. AFTER stepping
        for aid in agents:
            # Disabled: trade logging ('p2p_log' and 'p2p_price_log' stay empty)
            # logs['p2p_log'][aid][day].append(trades[aid]["qnt"])
            # logs['p2p_price_log'][aid][day].append(trades[aid]["price"])
            logs['reward_log'][aid][day].append(rewards[aid])

In [None]:
# Average reward: sum over the steps of each day, mean over days per agent,
# then mean over agents
reward_np = {}
for aid in agents:
    daily_returns = np.sum(logs['reward_log'][aid], axis=-1)
    reward_np[aid] = np.mean(daily_returns)

per_agent_means = list(reward_np.values())
total_reward = np.mean(per_agent_means)
print(f"Total reward: {total_reward:.2f}")

In [None]:
# Plotting: per-agent mean of the logged actions over the day, without the std band
# NOTE(review): the plotted series is 'soc_action_log' (actions) while the axis
# label reads 'State of Charge' -- confirm the label matches the intended series.
plot_daily_average(logs['soc_action_log'], agents, 'State of Charge', plot_std=False)

#### Deep Dive: Specific Days or Agents

In [None]:
# Replay one day step by step and print what a single agent observes, does and earns
day = 324
agent = "consumer_3"

obs, infos = trading_env.reset(seed=42, options={"day": day})

# Step environment for an episode.
# The step counter gets a real name because it is printed below; rebinding `_`
# at notebook top level would also stop IPython from updating its last-output
# variable.
for hour in range(trading_env.eps_len):

    print(f"\nHour: {hour:02}")
    # Observation the upcoming action is based on (printed before stepping)
    print(f"Observation: {obs[agent]}")

    # Environment actor
    actions = _trade_forward(obs)

    obs, rewards, terminations, truncations, infos = trading_env.step(actions)

    print(f"Action: {actions[agent]}")
    print(f"Reward: {rewards[agent]}")

In [None]:
# Plotting
# Work on a deep copy so that trimming one agent's days leaves `logs` untouched
x = deepcopy(logs['soc_action_log'])
# Keep only the first 31 logged days for this agent
# NOTE(review): labelled as July 2010 -- presumably the dataset starts on 1 July 2010; confirm
x["consumer_3"] = x["consumer_3"][:31] # July 2010
plot_daily_average(x, ["consumer_3"], 'State of Charge')

#### Critic Evaluation: Q and V Functions

Currently only tested with MADDPG in a single-agent environment where the SoC action is the only action.

In [None]:
# Hand-crafted state/action pair used to probe the critic
ToU = 0.08
FiT = 0.04
t = 0
load = 0
soc = 0
soc_action = 0

# Observation layout used here: [load, soc, ToU, FiT, sin(2*pi*t/24), cos(2*pi*t/24)]
# NOTE(review): assumes this matches the observation layout the critic was trained on -- confirm
obs = {"consumer_1": [load, soc, ToU, FiT, np.sin(2*np.pi*t/24), np.cos(2*np.pi*t/24)]}
action = {"consumer_1": [soc_action]}

# One row per agent of the 'agents' group, in group order
stacked_obs = Tensor([obs[aid] for aid in base_exp.group_map['agents']])
stacked_action = Tensor([action[aid] for aid in base_exp.group_map['agents']])
obs_tdict = TensorDict(agents=TensorDict(observation=stacked_obs,action=stacked_action))

# Check if Q value makes sense?
# Runs the first value network of the 'agents' loss and prints its
# 'state_action_value' output; .item() requires that output to hold a single element
print(base_exp.losses["agents"].value_network[0].forward(obs_tdict)['agents']['state_action_value'].cpu().item())

In [None]:
# Get some samples from replay buffer, sanity check
# The sample size is defined once so the draw and the print loop cannot drift apart
n_samples = 256
x = base_exp.replay_buffers['agents'].sample(n_samples)

# Fields of the sampled transitions at the current step
observations = x['agents']['observation']
actions = x['agents']['action']
episode_rewards = x['agents']['episode_reward']
rewards = x['next']['agents']['reward']
param = x['agents']['param']

# Fields stored under 'next', i.e. the outcome of the transition
next_obs = x['next']['agents']['observation']
next_episode_rewards = x['next']['agents']['episode_reward']
terminated = x['next']['agents']['done']

for i in range(n_samples):

    # Index 0 along the second dimension selects the first entry of each
    # sample -- presumably the first agent; confirm against the buffer layout
    obs = observations[i][0]
    next_obs_val = next_obs[i][0]
    terminated_val = terminated[i][0]
    reward = rewards[i][0]
    action = actions[i][0]
    episode_reward = episode_rewards[i][0]
    next_episode_reward = next_episode_rewards[i][0]
    param_val = param[i][0]

    print(f"Observation: {obs}\nAction: {action}\nReward: {reward}\nEpisode Reward: {episode_reward}\nNext Observation: {next_obs_val}\nNext Episode Reward: {next_episode_reward}\nTerminated: {terminated_val}\nParam: {param_val}\n")