In [2]:
import pandas as pd # type: ignore
from datetime import datetime
from env import DiscreteEnv
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore

In [5]:
def _load_hub_prices(csv_path, hub="TH_SP15"):
    """Load one CAISO real-time price CSV and keep a single hub.

    :param csv_path: Path of the CSV file to read. The file must provide the
        ``hub`` and ``Date`` columns used below.
    :param hub: Value of the ``hub`` column to keep.
    :return: ``(prices, quarter_hour)`` where ``prices`` holds every row of the
        hub (index reset to 0..n-1, ``Date`` parsed to datetimes) and
        ``quarter_hour`` is the subset of ``prices`` whose timestamp minute is
        0, 15, 30 or 45 (index labels of ``prices`` are preserved).
    """
    prices = pd.read_csv(csv_path)
    prices = prices[prices.hub == hub].reset_index(drop=True)
    # Timestamps may be wrapped in single quotes in the file; strip them, then
    # parse the whole column at once instead of looping with strptime per row.
    prices["Date"] = pd.to_datetime(
        prices["Date"].str.strip("'"), format="%m/%d/%Y %I:%M:%S %p"
    )
    # Vectorised minute filter; a boolean mask keeps the original index labels.
    on_quarter_hour = prices["Date"].dt.minute.isin([0, 15, 30, 45])
    return prices, prices[on_quarter_hour]


df, df_quarterhr = _load_hub_prices("../data/20241121-20251121 CAISO Real-time Price.csv")  # for training
df2, df2_quarterhr = _load_hub_prices("../data/20231121-20241121 CAISO Real-time Price.csv")  # for testing

In [None]:
# Overlay the quarter-hourly prices of the training and testing files.
# Each series is plotted against its own row index (not against the date),
# so the two curves share the x-axis by position only.
fig, ax = plt.subplots()
ax.plot(df_quarterhr.price, label="training file (20241121-20251121)")
ax.plot(df2_quarterhr.price, label="testing file (20231121-20241121)")
ax.set_xlabel("row index")
ax.set_ylabel("price")
ax.legend()  # without a legend the two series are indistinguishable
plt.show()

In [None]:
# Random-action baseline: roll out one episode with uniformly sampled actions
# and track the running sum of rewards.
# NOTE(review): 4*24*50 presumably means 50 days of quarter-hour steps — confirm
# against DiscreteEnv's constructor.
env = DiscreteEnv(4*24*50, df_quarterhr)
state, info = env.reset()
done = False
cum_rew = [0]
soc = [0]  # NOTE(review): assumes state[0] starts at 0; the reset() state is not recorded

while not done:
    action = env.action_space.sample() # random baseline
    # env.step returns five values; assuming the Gymnasium order
    # (obs, reward, terminated, truncated, info), the episode is over when
    # EITHER flag is set. Ignoring the fourth value could loop forever on an
    # env that ends episodes via truncation.
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    cum_rew.append(cum_rew[-1]+reward)
    soc.append(state[0])

plt.figure()
plt.plot(range(len(cum_rew)), cum_rew)
plt.xlabel("step")
plt.ylabel("cumulative reward")
#plt.plot(range(len(soc)), soc)

In [4]:
# DQN PARAMS
# Hyper-parameter set for a DQN agent, keyed by keyword-argument name.
# The trailing "Default:" notes are carried over from the original cell.
# NOTE(review): within the cells visible here the training cell builds
# DQN("MlpPolicy", env, verbose=1) and does not read RL_PARAMS — confirm
# whether these values are meant to be applied.
RL_PARAMS = dict(
    policy="MlpPolicy",
    # Previously tried learning rate: 0.00176746728919149  (Default: 1e-4)
    learning_rate=1e-4,
    buffer_size=100_000,                  # Default: 1e6
    learning_starts=255,                  # Default: 50_000
    batch_size=256,                       # Default: 32
    tau=0.5016120493544259,               # Default: 1.0
    gamma=0.9999812912592504,             # Default: 0.99
    train_freq=84,                        # Default: 4
    gradient_steps=-1,                    # Default: 1
    target_update_interval=10_000,        # Default: 1e4
    exploration_fraction=0.5,             # Default: 0.1
    exploration_initial_eps=1.0,          # Default: 1.0
    exploration_final_eps=0.005,          # Default: 0.05
    max_grad_norm=3.266151433390378,      # Default: 10
    # Defaults below were reported for MultiInputPolicy.
    # NOTE(review): both values are string labels; presumably they are mapped
    # to concrete layer sizes / an activation class before reaching the
    # policy — confirm.
    policy_kwargs=dict(
        net_arch='extra_large',           # Default: None
        activation_fn='leaky_relu',       # Default: tanh
    ),
)

In [8]:
import os
import time
import json
from typing import Optional

from stable_baselines3 import PPO, SAC, DDPG, DQN, A2C #type: ignore
from stable_baselines3.common.callbacks import EvalCallback #type: ignore
from stable_baselines3.common.evaluation import evaluate_policy #type: ignore
from stable_baselines3.common.logger import configure #type: ignore

"""
    Trains a DQN agent on the training price data and evaluates it on the testing price data.

    NOTE: this cell is a flat script, not a function. The parameter list below was inherited
    from a function-style training routine; none of these names are defined or read in this cell.

    :param agent: A string that represents the name of the reinforcement learning agent.
    :param run: An integer that represents the run number.
    :param path: A string that represents the path to the directory where the data will be saved.
    :param exp_params: A dictionary that contains the experiment parameters.
    :param env_id: A string that represents the ID of the environment.
    :param env_kwargs: A dictionary that contains the keyword arguments for the environment.
    :param rl_params: A dictionary that contains the reinforcement learning parameters.
    :param verbose: An integer that represents the verbosity level.
    :param discrete_actions: A list of discrete actions.
    :param logger_type: A list that represents the logger type.
"""

# Train a DQN agent on the training series, evaluate it on the testing series,
# and report the mean/std episode reward plus the elapsed time.
# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()

# CREATE ENVIRONMENT
# NOTE(review): training uses 8760 as the first argument while evaluation uses
# 8760*4 ("8760 hours" of quarter-hour data) — confirm the difference is intended.
env = DiscreteEnv(8760, df_quarterhr)
eval_env = None

try:
    model = DQN("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=10000, log_interval=4)

    eval_env = DiscreteEnv(8760*4, df2_quarterhr) # corresponds to 8760 hours from the og paper
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

    print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")
finally:
    # Release both environments even if training or evaluation raises;
    # previously eval_env was never closed.
    env.close()
    if eval_env is not None:
        eval_env.close()

print()
print(f'Execution time = {time.perf_counter() - start}s')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
mean_reward: 290242.92 +/- 1223.03

Execution time = 171.53819513320923s
