In [None]:
# Make the parent directory (relative to the kernel's working directory)
# importable so the `src` package used below can be resolved.
import sys

# Guarded so that re-running this cell does not keep appending duplicate
# '../' entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
import numpy as np
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule
import torch
from ray.rllib.core.columns import Columns

from src.agent.battery import Battery
from src.agent.der import DERAgent
from src.environment.train import RLAlgorithm, RLTrainer, TrainingMode
from src.grid.base import GridTopology
from src.grid.network import GridNetwork
from src.market.matching import ClearingMechanism, MarketConfig
from src.profile.der import DERProfileHandler
from src.profile.dso import DSOProfileHandler
from src.market.dso import DSOAgent
from src.environment.inference import RLInference
from src.environment.io import EnvConfigHandler
from src.root import __main__

# 🚀 Beginner's Guide: Local Energy Market Environment

This notebook provides a step-by-step guide to setting up and using the Local Energy Market (LEM) environment for reinforcement learning. You'll learn how to:

1. **Configure the environment** - Set up market parameters, agents, and grid network
2. **Train RL agents** - Train agents using different algorithms and training modes
3. **Run inference** - Evaluate trained agents in the environment

Each section builds on the previous one, so follow along sequentially.

---

## 📋 Table of Contents

1. [Initial Setup](#1-initial-setup)
   - [Market Configuration](#11-market-configuration-marketconfig)
   - [DER Profile Handler](#12-der-profile-handler-derprofilehandler)
   - [DSO Profile Handler](#13-dso-profile-handler-dsoprofilehandler)
   - [DER + Battery](#14-der--battery-deragent--battery)
   - [Grid Network](#15-grid-network-gridnetwork)
   - [DSO Agent](#16-dso-dsoagent)
   - [Environment Configuration](#17-environment-configuration)
2. [RL Training](#2-rl-training-rltrainer)
3. [Inference](#3-inference-rlinference)

---

## 1. 🛠️ Initial Setup

In this section, we'll configure all the necessary components for the Local Energy Market environment. This includes setting up the market rules, creating agents with energy profiles, configuring the grid network, and establishing the DSO (Distribution System Operator).

In [None]:
# ============================================================================
# Configuration Parameters
# ============================================================================
# Every value a reader may want to tune lives in this cell. All filesystem
# locations are derived from STORAGE_PATH, so pointing the notebook at another
# machine or output folder only requires changing that single constant.

# Environment Configuration
STEPS = 24              # Number of timesteps per episode (e.g., 24 = 24 hours)
NUM_AGENTS = 3          # Number of DER agents participating in the market
SEED = 42               # Random seed for reproducibility

# Training Configuration
ITERS_TRAIN = 3         # Number of training iterations (for quick testing; use more in production)
TUNE_SAMPLES = 1        # Number of hyperparameter tuning samples
ALGO = "sac"            # RL algorithm: "ppo", "appo", or "sac"
MODE = "ctce"           # Training mode: "ctce" (centralized training, centralized execution),
                        #                "ctde" (centralized training, decentralized execution),
                        #                "dtde" (decentralized training, decentralized execution)

# NOTE: machine-specific absolute path -- change this to a directory that
# exists on your system before running the notebook.
STORAGE_PATH = "/Users/nasalazar/Documentos/GitHub/simulations/phd/downloads"

# Identifiers of the previously trained run that the restore / continue /
# inference cells refer to.
RUN_NAME = "lem_ctce_sac_06September1341"
CHECKPOINT_SUBPATH = "SAC_GroupedLEM_166ac_00000_0_2025-09-06_13-41-23/checkpoint_000002"

# Checkpoint Restoration (for continuing training)
EXPERIMENT_PATH = f"{STORAGE_PATH}/TRAIN/{RUN_NAME}"
CHECKPOINT_PATH_TRAIN = f"{EXPERIMENT_PATH}/{CHECKPOINT_SUBPATH}"
EMBEDDINGS_DIM = 128    # Dimension of agent embeddings (for CTDE/DTDE modes)

# Inference Configuration
ITERS_INFERENCE = 3     # Number of inference episodes to run
EXPLORATION = False     # Whether to use exploration during inference (False = use best policy)
CHECKPOINT_PATH_INFERENCE = f"{STORAGE_PATH}/INFERENCE/{RUN_NAME}/{CHECKPOINT_SUBPATH}"

## 1.1. 💰 Market Configuration (`MarketConfig`)

The `MarketConfig` defines the rules and constraints of the local energy market. This is the foundation of your market setup.

**Key Components:**
- **Price bounds** (min/max prices) - Define the price range for energy trading
- **Quantity bounds** (min/max trade quantities) - Set limits on trade sizes
- **Price clearing mechanism** - Determines how market price is calculated
- **Blockchain settings** - Configures decentralized validation (optional)
- **Partner preference** - Enables strategic trading relationships

**Available Clearing Mechanisms:**
- `AVERAGE`: Average of matched bid/ask prices
- `FIRST_PRICE`: Price of first matched order
- `LAST_PRICE`: Price of last matched order
- `HIGHEST_BID`: Highest bid price
- `LOWEST_ASK`: Lowest ask price

In [None]:
# Market rules shared by every participant. The clearing mechanism can be any
# of: AVERAGE (average of matched bid/ask prices), FIRST_PRICE (price of first
# matched order), LAST_PRICE (price of last matched order), HIGHEST_BID or
# LOWEST_ASK.
market_config = MarketConfig(
    # --- Price bounds, per unit of energy ---
    min_price=0.0,
    max_price=100.0,
    # --- Trade quantity bounds ---
    min_quantity=0.0,
    max_quantity=100.0,
    # --- How the market clearing price is determined ---
    price_mechanism=ClearingMechanism.AVERAGE,
    # --- Blockchain validation: proof-of-work difficulty, no visualization ---
    blockchain_difficulty=2,
    visualize_blockchain=False,
    # --- Let agents prefer trading with specific partners ---
    enable_partner_preference=True,
)

## 1.2. 📊 DER Profile Handler (`DERProfileHandler`)

The `DERProfileHandler` generates energy profiles (generation and demand) for DER agents.

**Capabilities:**
- ✅ Generate random profiles based on capacity constraints
- ✅ Load profiles from CSV files (if file paths are provided)
- ✅ Ensure profiles respect market quantity bounds

**Profile Types:**
- **Generation Profile**: Energy the agent produces (e.g., solar panels, wind turbines)
- **Demand Profile**: Energy the agent consumes (e.g., household/business load)

In [None]:
# Source of generation/demand profiles for the DER agents. Quantity bounds are
# taken from the market configuration. Both CSV paths are None, which makes
# the handler generate random profiles instead of loading them from file.
der_profile_handler = DERProfileHandler(
    seed=SEED,
    min_quantity=market_config.min_quantity,
    max_quantity=market_config.max_quantity,
    generation_file_path=None,
    demand_file_path=None,
)

## 1.3. 🏢 DSO Profile Handler (`DSOProfileHandler`)

The `DSOProfileHandler` generates price profiles for the Distribution System Operator (DSO).

**DSO Price Types:**
- **Feed-in tariff** 💰: Price the DSO pays agents for excess energy they generate
- **Utility price** 💵: Price agents pay the DSO when buying energy from the grid

**Profile Sources:**
- Can be loaded from CSV files (real-world data)
- Can be generated randomly (for simulation/testing)

The DSO acts as a fallback market when local peer-to-peer trading cannot satisfy all energy needs.

In [None]:
# Source of feed-in tariff and utility price profiles for the DSO. Price bounds
# are taken from the market configuration. Both CSV paths are None, which
# makes the handler generate random prices instead of loading them from file.
dso_profile_handler = DSOProfileHandler(
    seed=SEED,
    min_price=market_config.min_price,
    max_price=market_config.max_price,
    feed_in_tariff_file_path=None,
    utility_price_file_path=None,
)

## 1.4. 👥 DER + Battery (`DERAgent` + `Battery`)

Each DER agent represents a participant in the local energy market.

**Agent Components:**
- **Generation profile** ☀️: Energy the agent produces (e.g., from solar panels, wind turbines)
- **Demand profile** 🏠: Energy the agent consumes (e.g., household/business load)
- **Battery** 🔋: Optional energy storage system for time-shifting energy

**Battery Capabilities:**
- **Charge**: Store excess energy when generation exceeds demand
- **Discharge**: Release stored energy when demand exceeds generation
- **Time-shifting**: Enable coordination across different time periods

We create multiple agents with randomized capacities and profiles to simulate a diverse, realistic market environment. 

In [None]:
# Build NUM_AGENTS DER agents, each with a randomly sized DER, a battery half
# that size, and generation/demand profiles obtained from the profile handler.
#
# A dedicated generator seeded with SEED is created inside this cell so the
# random capacities and the constant/variable choice are identical after a
# kernel restart and after re-running the cell. The previous version drew from
# NumPy's unseeded global state and therefore produced different agents on
# every run.
agent_rng = np.random.default_rng(SEED)

agents = []

for i in range(NUM_AGENTS):
    # Random capacity in [100, 200) for diversity in the market; cast to a
    # plain int so downstream code receives the same type as before.
    der_capacity = int(agent_rng.integers(100, 200))   # DER capacity (e.g., solar panel kW)
    battery_capacity = der_capacity // 2               # Battery capacity (half of DER capacity)

    # Generate energy profiles (generation and demand) for this agent
    generation, demand = der_profile_handler.get_energy_profiles(
        steps=STEPS,                                   # Number of timesteps
        capacity=der_capacity + battery_capacity,      # Total capacity for profile scaling
        constant=bool(agent_rng.choice([True, False])) # Whether profile is constant or variable
    )

    # Create battery storage system
    battery = Battery(
        nominal_capacity=battery_capacity,             # Maximum energy storage capacity
        min_soc=0.0,                                   # Minimum state of charge (0 = empty)
        max_soc=1.0,                                   # Maximum state of charge (1 = full)
        charge_efficiency=0.95,                        # Efficiency when charging (95% = 5% loss)
        discharge_efficiency=0.95                      # Efficiency when discharging (95% = 5% loss)
    )

    # Create DER agent with generation, demand, and battery
    agent = DERAgent(
        id=f"agent_{i}",                               # Unique agent identifier
        capacity=der_capacity,                         # Maximum generation capacity
        battery=battery,                               # Battery storage system
        node_id=None,                                  # Grid node ID (None = auto-assign)
        generation_profile=generation,                 # Time series of energy generation
        demand_profile=demand                          # Time series of energy demand
    )

    agents.append(agent)

## 1.5. 🔌 Grid Network (`GridNetwork`)

The `GridNetwork` represents the physical electrical grid infrastructure.

**Grid Functions:**
- **Topology definition** 🗺️: Network structure of the grid (how agents are connected)
- **Distance calculations** 📏: Calculates distances between agents for transmission loss
- **Constraint validation** ⚠️: Validates grid constraints (capacity, voltage, etc.)

**Available Topologies:**
- **IEEE34**: Standard IEEE test feeder (34 nodes)
- **MESH**: Fully connected network
- **RADIAL**: Tree-like structure
- **RING**: Circular connection pattern
- Custom topologies for specific scenarios

The grid topology affects transmission losses and trading opportunities between agents.

In [None]:
# The grid is sized to the combined capacity of every DER agent.
capacity = sum(der.capacity for der in agents)

# IEEE34 standard test feeder with 34 nodes; SEED drives grid node assignment.
grid_network = GridNetwork(
    seed=SEED,
    capacity=capacity,
    num_nodes=34,
    topology=GridTopology.IEEE34,
)

## 1.6. 🏢 DSO (`DSOAgent`)

The Distribution System Operator (DSO) agent manages the grid and provides fallback trading.

**DSO Responsibilities:**
- **Feed-in tariff** 💰: Buys excess energy from agents at a fixed rate
- **Utility price** 💵: Sells energy to agents when local market cannot satisfy demand
- **Grid management** ⚡: Maintains grid balance and handles unmatched orders

**DSO Role:**
The DSO acts as a safety net for the local energy market, ensuring that:
- Agents can always sell excess energy (even if no local buyers exist)
- Agents can always buy energy (even if no local sellers exist)
- Grid stability is maintained

In [None]:
# One price series per episode step for each DSO price type:
#   fit     -> feed-in tariff (what the DSO pays for excess energy)
#   utility -> utility price  (what agents pay for grid energy)
fit, utility = dso_profile_handler.get_price_profiles(steps=STEPS)

# The DSO also receives the grid network for distance/constraint calculations.
dso = DSOAgent(
    id="dso",
    grid_network=grid_network,
    feed_in_tariff=fit,
    utility_price=utility,
)

## 1.7. ⚙️ Environment Configuration

Now we combine all components into a complete environment configuration dictionary that will be used by the RL training and inference systems.

**Configuration Components:**
- ✅ Market rules and constraints
- ✅ Agent setup with profiles and batteries
- ✅ Grid network topology
- ✅ DSO price profiles
- ✅ Environment behavior settings

This configuration dictionary will be passed to the RL trainer and inference systems.

In [None]:
# Single dictionary handed to both the trainer and the inference runner.
env_config = {
    # Episode length (timesteps per episode)
    "max_steps": STEPS,

    # Components built in the previous cells
    "agents": agents,
    "market_config": market_config,
    "grid_network": grid_network,
    "dso": dso,
    "der_profile_handler": der_profile_handler,
    "dso_profile_handler": dso_profile_handler,

    # Behaviour switches
    "enable_reset_dso_profiles": False,   # keep DSO profiles across env resets
    "enable_asynchronous_order": True,    # agents submit orders asynchronously

    # Numeric settings
    "max_error": 0.3,                     # maximum allowed error in market clearing
    "num_anchor": 4,                      # number of anchor points for reputation system
    "seed": SEED,                         # environment random seed
}

In [None]:
# Optional step: persist the environment configuration under STORAGE_PATH with
# the name "env_config2" so it can be reused later.
EnvConfigHandler.save(env_config, STORAGE_PATH, "env_config2")

# 2. 🎓 RL Training (`RLTrainer`)

In this section, we'll set up and run reinforcement learning training. The `RLTrainer` supports multiple algorithms and training modes.

## 📚 Training Modes Explained

**CTCE** (Centralized Training, Centralized Execution):
- Single shared policy across all agents
- Centralized execution
- Best for: Homogeneous agents, simpler coordination

**CTDE** (Centralized Training, Decentralized Execution):
- Centralized training (shared experience)
- Each agent has its own policy
- Best for: Heterogeneous agents, independent decision-making

**DTDE** (Decentralized Training, Decentralized Execution):
- Fully decentralized training
- Each agent trains independently
- Best for: Privacy-preserving scenarios, realistic deployment

## 🧠 Available Algorithms

- **PPO** (Proximal Policy Optimization): Stable, sample-efficient
- **APPO** (Asynchronous PPO): Faster training with parallel workers
- **SAC** (Soft Actor-Critic): Off-policy, good for continuous actions

## 2.1. Configuration

In [None]:
# Create RL trainer with specified algorithm and training mode.
#
# ALGO and MODE are plain strings from the configuration cell. They are
# resolved through lookup tables and validated up front, so a typo fails here
# with a clear message instead of silently passing None into RLTrainer.
ALGORITHMS = {
    "ppo": RLAlgorithm.PPO,
    "appo": RLAlgorithm.APPO,
    "sac": RLAlgorithm.SAC,
}
TRAINING_MODES = {
    "ctce": TrainingMode.CTCE,
    "ctde": TrainingMode.CTDE,
    "dtde": TrainingMode.DTDE,
}

if ALGO not in ALGORITHMS:
    raise ValueError(f"Unknown ALGO {ALGO!r}; expected one of {sorted(ALGORITHMS)}")
if MODE not in TRAINING_MODES:
    raise ValueError(f"Unknown MODE {MODE!r}; expected one of {sorted(TRAINING_MODES)}")

trainer = RLTrainer(
    env_config=env_config,                    # Environment configuration from section 1
    algorithm=ALGORITHMS[ALGO],               # RL algorithm: PPO, APPO, or SAC
    training=TRAINING_MODES[MODE],            # Training mode: CTCE, CTDE, or DTDE
    iters=ITERS_TRAIN,                        # Number of training iterations
    tune_samples=TUNE_SAMPLES,                # Number of hyperparameter tuning samples
    checkpoint_freq=2,                        # Save checkpoint every N iterations
    evaluation_interval=1,                    # Run evaluation every N iterations
    evaluation_duration=3,                    # Number of episodes per evaluation
    cpus=1,                                   # Number of CPU cores to use
    gpus=0,                                   # Number of GPUs to use (0 = CPU only)
    storage_path=STORAGE_PATH                 # Path where checkpoints and results are saved
)

In [None]:
# Smoke test: push three batches of random actions through the environment to
# confirm it is wired up correctly before starting a training run.
for step in range(3):
    actions = {}
    for agent_id in trainer.env.agents:
        # Use the per-agent `action_spaces` mapping when the env exposes one
        # (DTDE mode); otherwise fall back to indexing the single
        # `action_space` (CTCE/CTDE mode).
        spaces_by_agent = getattr(trainer.env, 'action_spaces', None)
        if spaces_by_agent is None:
            spaces_by_agent = trainer.env.action_space
        action_space = spaces_by_agent[agent_id]
        actions[agent_id] = action_space.sample()

    # Advance the environment by one step and report the rewards
    obs, rewards, terminated, truncated, info = trainer.env.step(actions)
    print(f"Step {step + 1} | Reward: {rewards}")

## 2.2. 🚀 Training

Start the training process. This will:
1. ✅ Initialize the RL algorithm with the specified configuration
2. 🎓 Train agents for the specified number of iterations
3. 💾 Save checkpoints periodically (for recovery and analysis)
4. 📊 Run evaluations to track performance

**⚠️ Note**: Uncomment the line below to start training. Training may take a while depending on:
- Number of training iterations
- System resources (CPU/GPU)
- Number of agents
- Environment complexity

**💡 Tip**: Start with a small number of iterations (e.g., 3-5) to test your setup before running longer training sessions.

In [None]:
# Start training (uncomment to run)
# results, metrics = trainer.train()

## 2.3. 🔄 Restore Experiment

If you want to continue working with a previously trained experiment, you can restore it here.

**Use Cases:**
- 📊 Analyze results from a previous training run
- 🎓 Continue training from where you left off
- 🔍 Inspect model checkpoints
- 📈 Compare different training configurations

**Requirements:**
- Experiment path must point to a valid training directory
- Environment configuration should match the original training setup

In [None]:
# Restore a previously trained experiment (uncomment to use)
# trainer.restore_experiment(
#     experiment_path=EXPERIMENT_PATH,      # Path to the experiment directory
#     embeddings_dim=EMBEDDINGS_DIM,        # Dimension of agent embeddings (for CTDE/DTDE)
# )

## 2.4. ➕ Continue Training a Checkpoint

Continue training from a specific checkpoint. This is useful when you want to:
- 🎯 Fine-tune a trained model
- 🔄 Continue training that was interrupted
- 📈 Train for additional iterations
- 🔬 Experiment with extended training

**When to Use:**
- Your training was interrupted and you want to resume
- You want to fine-tune a model with more training
- You're experimenting with different training durations

In [None]:
# Continue training from a checkpoint (uncomment to use)
# trainer.train_checkpoint(
#     checkpoint_path=CHECKPOINT_PATH_TRAIN,  # Path to the checkpoint file
#     iters=3,                                 # Additional training iterations
#     embeddings_dim=EMBEDDINGS_DIM            # Dimension of agent embeddings (for CTDE/DTDE)
# )

# 3. 🔬 Inference (`RLInference`)

After training, use `RLInference` to evaluate trained agents in the environment.

**Inference Features:**
- 🎯 **Deterministic Evaluation**: Runs the trained policy without exploration (default)
- 🎲 **Exploration Mode**: Can enable exploration for testing robustness
- 📊 **Performance Metrics**: Collects statistics on agent performance
- 💾 **Results Saving**: Automatically saves inference results to storage path

**What Happens During Inference:**
1. Loads the trained model from checkpoint
2. Runs episodes in the environment
3. Collects rewards, actions, and market statistics
4. Saves results for analysis

**💡 Tip**: Set `exploration=False` to evaluate the best learned policy, or `exploration=True` to test agent robustness.

In [None]:
# Inference runner built from a trained checkpoint. The environment
# configuration must match the one used for training; EXPLORATION=False means
# the best learned policy is used. Results are saved under STORAGE_PATH.
model = RLInference(
    checkpoint_path=CHECKPOINT_PATH_INFERENCE,
    env_config=env_config,
    exploration=EXPLORATION,
    storage_path=STORAGE_PATH,
)

In [None]:
# Run inference for ITERS_INFERENCE episodes using the `model` created in the
# previous cell (which loads CHECKPOINT_PATH_INFERENCE).
# This evaluates the trained agents and saves results to the storage path.
model.inference(ITERS_INFERENCE)

---

## 📝 Summary & Next Steps

### ✅ What You've Learned

This notebook demonstrated the complete workflow for:

1. **⚙️ Configuration**: Setting up market, agents, grid, and DSO
2. **🎓 Training**: Training RL agents using different algorithms and modes
3. **🔬 Inference**: Evaluating trained agents in the environment

### 🚀 Next Steps

**Experimentation:**
- 🔧 Experiment with different market configurations and agent setups
- 🧠 Try different RL algorithms (PPO, APPO, SAC) and training modes (CTCE, CTDE, DTDE)
- 📊 Analyze the saved results from training and inference
- 📁 Check the `downloads/` directory for saved checkpoints and metrics

**Advanced Learning:**
- 📚 For more advanced examples, see the `cases/` directory for case study notebooks
- 📖 Explore case studies on market mechanisms, agent heterogeneity, DSO intervention, and more

### 📊 Key Takeaways

- **Market Configuration** defines trading rules and constraints
- **Agent Profiles** determine energy generation and demand patterns
- **Batteries** enable time-shifting and improved coordination
- **Grid Network** affects transmission losses and trading opportunities
- **DSO** provides fallback trading when local market cannot satisfy needs
- **Training Modes** (CTCE, CTDE, DTDE) offer different coordination paradigms
- **RL Algorithms** (PPO, APPO, SAC) have different strengths and use cases

---

**🎯 Ready to explore more? Check out the case studies in the `cases/` directory!**
