In [None]:
from datetime import timedelta, datetime
from pathlib import Path
import pickle
import random

# CommonPower imports
from commonpower.control.controllers import *
# from commonpower.control.wrappers import *
from commonpower.control.pytupli_interface import *
from commonpower.control.configs.algorithms import *
from commonpower.control.runners import DeploymentRunner, SingleAgentTrainer
from commonpower.control.wrappers import *
from commonpower.control.safety_layer.penalties import *
from commonpower.control.safety_layer.safety_layers import *
from commonpower.control.logging_utils.loggers import *
from commonpower.core import ModelHistory, System
from commonpower.data_forecasting.base import DataProvider, Forecaster
from commonpower.data_forecasting.data_sources import *
from commonpower.data_forecasting.forecasters import *
from commonpower.modeling.param_initialization import *
from commonpower.models.buses import *
from commonpower.models.components import *
from commonpower.models.powerflow import *
from commonpower.utils.helpers import get_adjusted_cost

# PyTupli imports
from pytupli.storage import TupliAPIClient, FileStorage
from pytupli.schema import *
from pytupli.dataset import TupliDataset

# D3rlpy imports
import d3rlpy
from d3rlpy.algos import CQLConfig
from d3rlpy.dataset import MDPDataset

# SB3 imports
from stable_baselines3 import SAC

# Explicit module aliases, imported last so that none of the wildcard imports above can shadow them
import datetime as dt
import numpy as np


## Offline RL in CommonPower using PyTupli

This notebook introduces how to use the Python tool PyTupli to create reproducible benchmarks based on CommonPower and record offline data for these benchmarks using our built-in controllers. These datasets can then be used to train an RL agent using offline RL (e.g., using the d3rlpy library). 

### Background: PyTupli
[PyTupli](https://github.com/TUMcps/pytupli) is a tool for seamless hosting of databases for collaborative offline RL projects. It provides a Docker container for setting up your own server such that you can store serialized benchmarks and corresponding RL tuples (state, action, next state, reward), as well as related artifacts such as trained controllers, hyperparameters, or CSV files containing time series data. Furthermore, PyTupli offers advanced user management for fine-grained access control. Beyond its application in offline RL, it can also be used to simply share serialized benchmarks and the necessary time series data in a secure fashion, which is highly relevant in areas with privacy concerns.

### Part 1: Creating a Benchmark

We will first show how to create a serializable benchmark from a `System` in CommonPower that can then be uploaded to a storage instantiated with PyTupli. The main challenge here consists in serializing the time series data used in CommonPower to simulate static loads, PV production, outdoor temperature, etc. PyTupli allows uploading such artifacts separately to avoid duplicate storage if they are referenced by multiple benchmarks. 

We will first create a `TupliStorage` instance. We have two options: local file storage or storage on a hosted server. If you are using the API, follow the instructions in the [README of PyTupli](https://github.com/TUMcps/pytupli) to start the application and log in.

In [None]:
# Select the PyTupli storage backend: local file storage or a hosted server (API)
STORAGE_FLAG = 'file'  # "api" or "file"

# Map every supported flag to the storage class that implements it
_storage_backends = {'api': TupliAPIClient, 'file': FileStorage}
if STORAGE_FLAG not in _storage_backends:
    raise ValueError(f"Unknown storage flag: {STORAGE_FLAG}. Has to be 'api' or 'file'.")
storage = _storage_backends[STORAGE_FLAG]()

Our example system that we want to store as a benchmark is a single-family home with a PV array, a battery energy storage system (BESS), and, optionally, a heat pump. 

In [None]:
def create_scenario(
    forecaster: Forecaster,
    controller: BaseController,
    use_heat_pump: bool
):
    """Build the building management system and assign its first node to ``controller``.

    Args:
        forecaster: Forecaster passed on to ``BuildingManagementSystemScenario``.
        controller: Controller that receives the first node of the created system.
        use_heat_pump: Whether the heat pump is added to the household.

    Returns:
        The system created by ``BuildingManagementSystemScenario``.
    """
    # The CSV files are looked up in a "data" folder inside the current working directory
    scenario = BuildingManagementSystemScenario(
        forecaster=forecaster,
        data_path=Path().absolute() / 'data',
        date_format="%Y-%m-%d %H:%M:00",
        use_heat_pump=use_heat_pump,
    )
    system = scenario.get_system()

    # add top level node to controller
    controller.add_entity(system.nodes[0])
    return system


class BuildingManagementSystemScenario:
    """Scenario of a household bus with a static load, PV, a battery and an optional heat pump.

    The system is assembled once during construction and can be retrieved with ``get_system``.
    """

    def __init__(
        self,
        forecaster: Forecaster,
        data_path: Path,
        date_format: str,
        use_heat_pump: bool = False,
        price_buying: float = 0.37,
        price_selling: float = 0.08,  # numbers for 2024: https://www.finanztip.de/photovoltaik/einspeiseverguetung/
    ):
        """Store the configuration and build the system.

        Args:
            forecaster: Forecaster used by all data providers; its ``frequency`` is also used
                to resample the data sources.
            data_path: Folder that contains the ``data_ICLR`` and ``open-power-data`` sub-folders.
            date_format: Datetime format of the load and PV CSV files.
            use_heat_pump: Whether the heat pump is attached to the household bus.
            price_buying: Constant value of the ``psib`` (buying price) data source.
            price_selling: Constant value of the ``psis`` (selling price) data source.
        """
        # forecasting configuration
        self.forecaster = forecaster
        self.forecast_frequency = forecaster.frequency
        self.forecast_horizon = forecaster.horizon
        # data location and format
        self.data_path = data_path.resolve()
        self.date_format = date_format
        # scenario options
        self.use_heat_pump = use_heat_pump
        self.price_buying = price_buying
        self.price_selling = price_selling
        # everything is configured, so the system can be assembled now
        self.sys = self.create_system()

    def get_system(self) -> System:
        """Return the system that was assembled during construction."""
        return self.sys

    def create_system(self) -> System:
        """Public entry point for building the system; delegates to ``_create_system``."""
        return self._create_system()

    def _create_system(self):
        """Create data providers, buses and components and wire them into a ``System``."""
        self.define_data_sources()

        # One data provider per data source, all sharing the same forecaster
        self.load_p_dp = DataProvider(self.p_load_ds, self.forecaster)  # [kW]
        self.load_q_dp = DataProvider(self.q_load_ds, self.forecaster)  # [kVA]
        self.price_dp = DataProvider(self.buying_price_ds, self.forecaster)  # [€]
        self.selling_price_dp = DataProvider(self.selling_price_ds, self.forecaster)  # [€]
        self.pv_dp = DataProvider(self.pv_ds, self.forecaster)  # [kW]
        self.hp_dp = DataProvider(self.heat_pump_ds, self.forecaster)

        # Household bus (RTPricedBus) with lower and upper bounds for its variables
        household_bounds = {'p': (-50, 50), 'q': (-50, 50), 'v': (0.95, 1.05), 'd': (-15, 15)}
        household = RTPricedBus("MultiFamilyHouse", household_bounds)
        # The bus needs the buying and the selling price of electricity
        household.add_data_provider(self.price_dp)
        household.add_data_provider(self.selling_price_dp)

        # external grid
        grid = ExternalGrid("ExternalGrid")

        # photovoltaic with generation data
        pv = RenewableGen("PV1").add_data_provider(self.pv_dp)

        # static load with active and reactive power data
        load = Load("Load1").add_data_provider(self.load_p_dp).add_data_provider(self.load_q_dp)

        # battery storage
        capacity = 5  # kWh
        # Constant initializers have proven beneficial for training
        ess_initializer = ConstantInitializer(0.5 * capacity)
        hp_initializer = ConstantInitializer(21)

        ess_config = {
            'p': (-1.5, 1.5),  # active power limits
            'q': (0, 0),  # reactive power limits
            'soc': (0.1 * capacity, 0.9 * capacity),  # soc limits
            "soc_init": ess_initializer,
        }
        battery = ESSLinear("ESS1", ess_config)

        # The heat pump is always constructed but only attached when `use_heat_pump` is set
        heat_pump_config = {
            'p': [0, 5],  # kW
            'T_indoor_setpoint': 21,  # Celsius
            'T_indoor': [16, 26],  # Celsius
            'T_indoor_init': hp_initializer,  # Celsius
            'T_ret_FH': [10, 100],  # Celsius
            'T_ret_FH_init': ConstantInitializer(25.0),  # Celsius
            'H_FH': 1.1,  # kW/K
            'H_out': 0.26,  # kW/K
            'tau_building': 240,  # h
            'Cw_FH': 1.1625,  # kWh/K
            'c': 1.0,  # weighting factor (comfort factor) for cost function; multiplied with temperature deviation
        }
        heat_pump = HeatPumpWithoutStorageButCOP("HeatPump", heat_pump_config).add_data_provider(self.hp_dp)

        # add components to the household
        for component in (load, pv, battery):
            household.add_node(component)
        if self.use_heat_pump:
            household.add_node(heat_pump)

        # create the system and add top-level busses
        system = System(power_flow_model=PowerBalanceModel())
        return system.add_node(household).add_node(grid)

    def define_data_sources(self):
        """Create the data sources for load, prices, PV and heat pump and store them on ``self``."""
        frequency = self.forecast_frequency
        iclr_dir = self.data_path / 'data_ICLR'

        # Data source (ds) for active power (p) of a household
        self.p_load_ds = CSVDataSource(
            iclr_dir / 'ICLR_load.csv', datetime_format=self.date_format, resample=frequency
        )

        # We neglect reactive power (q) during this tutorial
        self.q_load_ds = ConstantDataSource(
            {"q": 0.0}, date_range=self.p_load_ds.get_date_range(), frequency=frequency
        )

        # Constant buying price, defined over the date range of the load data
        self.buying_price_ds = ConstantDataSource(
            {"psib": self.price_buying},
            date_range=self.p_load_ds.get_date_range(),
            frequency=frequency,
        )

        # Constant selling price, defined over the date range of the buying price
        self.selling_price_ds = ConstantDataSource(
            {"psis": self.price_selling},
            date_range=self.buying_price_ds.get_date_range(),
            frequency=frequency,
        )

        # Data source for PV generation; the sign of the "p" column is flipped
        # (presumably because generation is modelled as negative power - confirm)
        pv_source = CSVDataSource(
            iclr_dir / 'ICLR_pv.csv', datetime_format=self.date_format, resample=frequency
        )
        self.pv_ds = pv_source.apply_to_column("p", lambda value: -value)

        # Data sources for heat pump: outdoor temperature and coefficient of performance
        # taken from When2Heat dataset: https://data.open-power-system-data.org/when2heat/
        heat_pump_csv = (
            self.data_path / 'open-power-data' / 'DE_Temperature_and_COP2016_PV_Open_Power_Load_BDEW.csv'
        )
        self.heat_pump_ds = CSVDataSource(
            heat_pump_csv,
            datetime_format="%d.%m.%Y %H:%M",
            rename_dict={"time": "t", "outside_temp": "T_outside", "COP": "COP"},
            auto_drop=True,
            delimiter=";",
            resample=frequency,
        )

For storing the benchmark, we will attach an optimal controller to the system. 
This controller can be replaced later by any other controller type, but is required to instantiate the gym environment in CommonPower.
We will create two systems, one with and one without a heat pump.

In [None]:
# We use a persistence forecaster that provides forecasts for the next 6 hours based on historic data
forecast_length = 6
forecaster = PersistenceForecaster(
    frequency=timedelta(hours=1),
    horizon=timedelta(hours=forecast_length),
    look_back=timedelta(hours=24),
)

# Controllers of the household node; this controller type is required for compatibility with PyTupli
controller = OptimalControllerPyTupli("opt_ctrl", obs_handler=ObservationHandler(num_forecasts=6))
controller_hp = OptimalControllerPyTupli("opt_ctrl_hp", obs_handler=ObservationHandler(num_forecasts=6))

# The global controllers will balance the power flow in the respective system
global_controller = OptimalController("global_ctrl")
global_controller_hp = OptimalController("global_ctrl_hp")

# One system without and one with a heat pump; each one gets its own household controller
sys = create_scenario(forecaster=forecaster, controller=controller, use_heat_pump=False)
sys_hp = create_scenario(forecaster=forecaster, controller=controller_hp, use_heat_pump=True)

# Hand the complete systems over to the global controllers
global_controller.add_system(sys)
global_controller_hp.add_system(sys_hp)

We now have to initialize the system before we can create a gymnasium environment from it. We use a custom `Wrapper` that realizes the interface to PyTupli. Its main task consists of implementing 
the correct serialization and deserialization of time series data. 

In [None]:
# Fix the seeds of Python's and NumPy's global random number generators for reproducibility
random.seed(42)
np.random.seed(42)

# We have to fix a few parameters before we can initialize the system
horizon = timedelta(hours=forecast_length)
tau = timedelta(hours=1)  # time step size
episode_length = 3 * 24  # number of time steps per episode: three days at a step size of one hour
# NOTE: `timedelta(episode_length)` would treat the value as *days* (72 days instead of 72 hours),
# so the episode horizon is derived from the number of steps and the step size instead.
episode_horizon = episode_length * tau
solver = get_default_solver()

sys.initialize(horizon=horizon, episode_horizon=episode_horizon, tau=tau, solver=solver)
sys_hp.initialize(horizon=horizon, episode_horizon=episode_horizon, tau=tau, solver=solver)
sys.reset(at_time=sys.sample_start_date())
sys_hp.reset(at_time=sys_hp.sample_start_date())

# instantiate the environment wrappers: One for handling deployment in CommonPower and one for interfacing PyTupli
wrapper_stack = WrapperStack().add(DeploymentWrapper).add(
    CommonPowerTupliEnvWrapper, storage=storage, rl_tuple_cls=CommonPowerRLTuple
)
sys_env = sys.create_env_func(episode_length=episode_length, wrapper=wrapper_stack.get_stack())
sys_hp_env = sys_hp.create_env_func(episode_length=episode_length, wrapper=wrapper_stack.get_stack())

Now, we can upload the environments to our storage. We will start with the simpler system that only has a BESS and a PV array:

In [None]:
# Store the environment without heat pump as a benchmark in the selected storage
sys_env.store(name="building_ess_pv", description="Building with ESS and PV")

Let us have a look at the stored benchmarks and artifacts:

In [None]:
# Show the benchmarks that are currently available in the storage
storage.list_benchmarks()

In [None]:
# Show the artifacts (e.g. time series data) that are currently available in the storage
storage.list_artifacts()

The two stored artifacts are the time series data for the static load and the PV power production. Next, we will serialize the benchmark with the heatpump.

In [None]:
# Store the environment with heat pump as a second benchmark
sys_hp_env.store(name="building_ess_pv_hp", description="Building with ESS, PV and Heatpump")

Let's look at the number of stored artifacts again. We should have 3 artifacts now, as the second environment uses the same time series data for 
the static load and the PV production as the first one. The additional data is for the outdoor temperature and the coefficient of performance (COP), 
which are stored in one CSV file.

In [None]:
# List the artifacts again to inspect what was added by the heat pump benchmark
storage.list_artifacts()

### Part 2: Creating Experience Datasets and Storing them with PyTupli

#### Recording Interactions with an Optimal Controller
We will now use CommonPower to simulate the building management system with a built-in optimal controller.
All interactions should be saved as experience tuples (state, action, next state, reward) associated with the benchmark we created. This can be achieved by using our customized TupliEnvWrapper. We will pass some metadata to the episodes for later filtering. 

In [None]:
class MyCallback(EpisodeMetadataCallback):
    """Metadata callback that tracks the accumulated reward and an expert flag."""

    def __init__(self, is_expert: bool = False):
        """Initialize the callback.

        Args:
            is_expert: Value reported under the ``is_expert`` metadata key.
        """
        super().__init__()
        # Remember whether the recorded data stems from an expert controller
        self.is_expert = is_expert
        # Running sum of the rewards seen since the last reset
        self.cum_reward = 0

    def reset(self):
        """Set the accumulated reward back to zero."""
        self.cum_reward = 0

    def __call__(self, tuple):
        """Add the first reward entry of ``tuple`` to the running sum and return the metadata."""
        step_reward = tuple.reward[0]
        self.cum_reward += step_reward
        metadata = {"cum_eps_reward": self.cum_reward, "is_expert": self.is_expert}
        return metadata

We show how to load the benchmark from storage, although this would not have been necessary here. 

In [None]:
# Load the benchmark without heat pump from the storage, identified by the id of the stored environment
loaded_sys_env = CommonPowerTupliEnvWrapper.load(
    storage=storage, 
    benchmark_id=sys_env.id, 
    rl_tuple_cls=CommonPowerRLTuple, 
    )

Next, we instantiate a `DeploymentRunner` and pass our customized `TupliEnvWrapper` to record interactions. 

In [None]:
# First, we extract the raw system data from the loaded environment
loaded_sys = loaded_sys_env.get_system()
horizon_loaded = loaded_sys.horizon
# Then, we instantiate the runner and fix its start date
runner_loaded = DeploymentRunner(
    sys=loaded_sys,
    horizon=horizon_loaded,
    normalize_actions=False,  # IMPORTANT!
    wrapper=WrapperStack().add(
        CommonPowerTupliEnvWrapper, 
        storage=storage, 
        rl_tuple_cls=CommonPowerRLTuple, 
        benchmark_id=sys_env.id,
        metadata_callback = MyCallback(is_expert=True)
        ).get_stack()
) 
runner_loaded.set_start_time(datetime.datetime(2016, 1, 2, 0, 0, 0))

Running the simulation takes some time. You can adjust the number of steps to experiment with smaller datasets.

In [None]:
# 182 days with 24 steps each
runner_loaded.run(n_steps=182*24)  # run for the first half year of 2016

For later comparison, we will compute the costs obtained with the optimal controller on the second half of the year 2016.
We will not record this data with PyTupli, thus we don't have to pass the respective wrapper.

In [None]:
history_loaded = ModelHistory([loaded_sys])  # create a history with the loaded system
# No PyTupli wrapper is passed here, so this run is not recorded in the storage
runner_loaded = DeploymentRunner(
    sys=loaded_sys,
    history=history_loaded,
    horizon=horizon_loaded,
    normalize_actions=False,  # IMPORTANT
)
# Start 182 days after the recording run. The module alias `dt` is used because `datetime` is
# imported as a class at the top of the notebook, which makes `datetime.datetime(...)` unreliable.
evaluation_start = dt.datetime(2016, 1, 2, 0, 0, 0) + timedelta(days=182)
runner_loaded.set_start_time(evaluation_start)
runner_loaded.run(n_steps=182*24)  # run for the second half year of 2016

In [None]:
# Print total cost: sum of the adjusted costs stored in the history of the MPC evaluation run
total_cost_mpc = sum(get_adjusted_cost(history_loaded, loaded_sys))
print("Total cost obtained with MPC:", total_cost_mpc)

#### Recording Experience from RL Training
Another option for creating experience datasets is to record transitions during online training of an RL agent. We will show an example of this procedure using the SAC implementation of StableBaselines3.

In [None]:
# Load the benchmark once more so that the RL agent works on its own system instance
loaded_sys_env = CommonPowerTupliEnvWrapper.load(
    storage=storage,
    benchmark_id=sys_env.id,
    rl_tuple_cls=CommonPowerRLTuple,
)
loaded_sys_rl = loaded_sys_env.get_system()

# Building blocks of the RL agent: observation handler and safety layer with its penalty
agent_obs_handler = ObservationHandler(num_forecasts=6)
agent_safety_layer = ActionProjectionSafetyLayer(
    penalty=DistanceDependingPenalty(penalty_factor=0.001)
)
agent1 = RLControllerSB3(
    name='agent1',
    obs_handler=agent_obs_handler,
    safety_layer=agent_safety_layer,
)

# The agent takes the place of the optimal controller on the top-level node of the system
agent1.add_entity(loaded_sys_rl.nodes[0])

In [None]:
# Now, we configure the algorithm
sac_config = SB3MetaConfig(
    total_steps=182*24,  # same number of environment interactions as optimal controller
    seed=5,
    algorithm=SAC,
    penalty_factor=0.001,
    algorithm_config=SB3SACConfig(
        train_freq=1,
        learning_rate=0.0008,
        batch_size=12,
    )
)

# Set up logger; the directory is defined once and reused instead of repeating the literal
log_dir = './test_run/'
logger = TensorboardLogger(log_dir=log_dir)

The RL agent outputs random actions toward the beginning of the training, which is why we will set the `is_expert` parameter for the episode metadata to `False`.

In [None]:
# Specify the path where the model should be saved
model_path = "./saved_models/sac_model"

# Add SingleAgentWrapper for interfacing SB3 and CommonPowerTupliEnvWrapper for interfacing PyTupli.
# The episodes recorded during training are tagged as non-expert data.
wrapper_stack_sac = WrapperStack().add(SingleAgentWrapper).add(
    CommonPowerTupliEnvWrapper,
    storage=storage,
    rl_tuple_cls=CommonPowerRLTuple,
    benchmark_id=sys_env.id,
    metadata_callback=MyCallback(is_expert=False),
)

# Training is limited to 182 days starting on 2016-01-02. The module alias `dt` is used because
# `datetime` is imported as a class at the top of the notebook, which makes `datetime.datetime(...)`
# unreliable.
train_start = dt.datetime(2016, 1, 2, 0, 0, 0)
train_end = train_start + timedelta(days=182)

# Now, we create a single-agent runner to train the agent
runner_sac = SingleAgentTrainer(
    sys=loaded_sys_rl,
    wrapper=wrapper_stack_sac.get_stack(),
    alg_config=sac_config,
    horizon=horizon,
    episode_length=24,
    logger=logger,
    save_path=model_path,
    seed=sac_config.seed,
    limited_date_range=[train_start, train_end],
)

Finally, we can run the training.

In [None]:
# Start the SAC training with the configuration defined above
runner_sac.run()

### Part 3: Downloading Datasets for Offline RL Training
Given our benchmark id, we can now use PyTupli to download all associated experience data. Furthermore, we can add filters to curate the obtained datasets. 

In [None]:
# Select everything that belongs to our benchmark and download it
benchmark_filter = FilterEQ(key='id', value=sys_env.id)
full_dataset = TupliDataset(storage=storage).with_benchmark_filter(benchmark_filter)
full_dataset.load()

# d3rlpy expects separate arrays for observations, actions, rewards and episode ends
obs, act, rew, terminal, truncated = full_dataset.convert_to_tensors()
full_d3rlpy_dataset = MDPDataset(
    observations=obs,
    actions=act,
    rewards=rew,
    terminals=terminal,
    timeouts=truncated,
)

Let us also create one dataset that uses only the expert data from the MPC controller:

In [None]:
# Same benchmark as before, but only episodes whose metadata marks them as expert data
expert_benchmark_filter = FilterEQ(key='id', value=sys_env.id)
expert_episode_filter = FilterEQ(key='metadata.is_expert', value=True)
mpc_dataset = (
    TupliDataset(storage=storage)
    .with_benchmark_filter(expert_benchmark_filter)
    .with_episode_filter(expert_episode_filter)
)
mpc_dataset.load()

# d3rlpy expects separate arrays for observations, actions, rewards and episode ends
obs, act, rew, terminal, truncated = mpc_dataset.convert_to_tensors()
mpc_d3rlpy_dataset = MDPDataset(
    observations=obs,
    actions=act,
    rewards=rew,
    terminals=terminal,
    timeouts=truncated,
)

In [None]:
# Save the d3rlpy dataset as a pickle file for later usage
dataset_path = Path("./saved_models/full_d3rlpy_dataset.pkl")
# The target folder may not exist yet (e.g. if the SAC training cell was skipped)
dataset_path.parent.mkdir(parents=True, exist_ok=True)
with dataset_path.open("wb") as f:
    pickle.dump(full_d3rlpy_dataset, f)

### Part 4: Offline RL Training using D3rlpy
We will now use the CQL implementation by d3rlpy to train an RL agent for the given problem on the full dataset. 

In [None]:
# Offline training with the CQL implementation of d3rlpy
d3rlpy.seed(1)  # for reproducibility

# Configure the algorithm first, then instantiate it on the CPU
cql_config = CQLConfig(
    batch_size=64,
    alpha_threshold=2.0,
    conservative_weight=5.0,
    soft_q_backup=True,
)
algo = cql_config.create(device='cpu')

# Train on the full dataset
algo.fit(dataset=full_d3rlpy_dataset, n_steps=10000, n_steps_per_epoch=100)

In [None]:
# Persist the trained CQL agent; this file is loaded again for the deployment of the agent
algo.save_model("./saved_models/cql_agent.pt")

We can also train an agent on the reduced expert dataset. To save you some time, we have pre-trained this agent and will load it during deployment.

### Part 5: Comparing Controller Performance
Next, we replace the optimal controller in our environment with one of the trained agents and simulate the system for the second half of the year 2016. We can then compare the costs obtained with the offline RL agents with those of the MPC. 

In [None]:
# We first have to create a new controller that will load the trained model during deployment
# The meta configuration is passed as `alg_config` to the deployment runners of the CQL agents
config = D3RLPyMetaConfig(
    seed=42,
    algorithm=d3rlpy.algos.CQLConfig,
    algorithm_config=D3RLPyCQLConfig(),  # no arguments are passed, i.e. the defaults of D3RLPyCQLConfig apply
    penalty_factor=0.0,
    mdp_dataset_path="./saved_models/full_d3rlpy_dataset.pkl"  # pickled d3rlpy dataset saved in this notebook
)
# Controller that uses the policy stored in `cql_agent.pt`
trained_agent = RLControllerD3RL(
    name="cql_agent", 
    safety_layer=ActionProjectionSafetyLayer(penalty=DistanceDependingPenalty(penalty_factor=0.0)), 
    pretrained_policy_path="./saved_models/cql_agent.pt",
    obs_handler=ObservationHandler(num_forecasts=6),
)
trained_agent.add_entity(loaded_sys.nodes[0])  # add the top-level node of the system to the controller

In [None]:
history_cql = ModelHistory([loaded_sys])  # create a history with the loaded system
runner_cql = DeploymentRunner(
    alg_config=config,
    sys=loaded_sys,
    history=history_cql,
    horizon=horizon_loaded,
    normalize_actions=False,  # IMPORTANT
    wrapper=SingleAgentWrapper,
)
# Same evaluation period as for the MPC. The module alias `dt` is used because `datetime` is
# imported as a class at the top of the notebook, which makes `datetime.datetime(...)` unreliable.
cql_evaluation_start = dt.datetime(2016, 1, 2, 0, 0, 0) + timedelta(days=182)
runner_cql.set_start_time(cql_evaluation_start)
runner_cql.run(n_steps=182*24)  # run for the second half year of 2016

Now, we do the same with the offline RL agent trained purely on expert data:

In [None]:
# Controller that uses the policy stored in `cql_agent_expert.pt` (the agent trained on expert data only)
trained_agent_expert = RLControllerD3RL(
    name="cql_agent_expert", 
    safety_layer=ActionProjectionSafetyLayer(penalty=DistanceDependingPenalty(penalty_factor=0.0)), 
    pretrained_policy_path="./saved_models/cql_agent_expert.pt",
    obs_handler=ObservationHandler(num_forecasts=6),
)
# NOTE(review): the same node was already assigned to `trained_agent`; presumably this call
# replaces that controller - confirm.
trained_agent_expert.add_entity(loaded_sys.nodes[0])  # add the top-level node of the system to the controller

In [None]:
history_cql_expert = ModelHistory([loaded_sys])  # create a history with the loaded system
# The name `runner_cql` is reused on purpose; only the history differs from the previous run
runner_cql = DeploymentRunner(
    alg_config=config,
    sys=loaded_sys,
    history=history_cql_expert,
    horizon=horizon_loaded,
    normalize_actions=False,  # IMPORTANT
    wrapper=SingleAgentWrapper,
)
# Same evaluation period as for the MPC. The module alias `dt` is used because `datetime` is
# imported as a class at the top of the notebook, which makes `datetime.datetime(...)` unreliable.
expert_evaluation_start = dt.datetime(2016, 1, 2, 0, 0, 0) + timedelta(days=182)
runner_cql.set_start_time(expert_evaluation_start)
runner_cql.run(n_steps=182*24)  # run for the second half year of 2016

In [None]:
# Print total cost
total_cost_cql = sum(get_adjusted_cost(history_cql, loaded_sys))
total_cost_cql_expert = sum(get_adjusted_cost(history_cql_expert, loaded_sys))
print("Total cost obtained with CQL:", total_cost_cql)
print("Total cost obtained with CQL trained on expert data:", total_cost_cql_expert)
print("Total cost obtained with MPC:", total_cost_mpc)

You can see that the CQL agent trained only on expert data performs better than the one trained on the full dataset which has double the size. However, it is still significantly worse than the optimal controller. To improve performance, one could consider larger datasets, longer training, or hyperparameter tuning. 

### Further Ideas and Clean-up
Within this notebook, we only introduce the basic functionalities of PyTupli, like storing and retrieving benchmarks and datasets. We encourage you to discover its other capabilities, like advanced filtering and user management for large collaborative projects. For some inspiration, feel free to check out the [paper](https://arxiv.org/abs/2505.16754)!

As a last step, we delete the benchmark and associated episodes and artifacts from our storage. 

In [None]:
# Delete the heat pump benchmark together with its episodes and artifacts.
# NOTE(review): only `sys_hp_env` is deleted here; the "building_ess_pv" benchmark (`sys_env`) and
# the episodes recorded for it are not touched by this call - confirm whether that is intended.
sys_hp_env.delete(delete_episodes=True, delete_artifacts=True)