In [1]:
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.env_checker import check_env
import gym
import numpy as np

In [2]:
from Hack import load

epex = load.epex().load()

In [7]:
stop_loss = 100

price_array = epex['apx_da_hourly'].values

def get_price(idx):
    return price_array[idx]
    
def get_mean_price(idx):
    window_size = 1000
    if idx == 0:
        return price_array[idx]
    elif idx < window_size:
        return np.mean(price_array[:idx])
    else:
        return np.mean(price_array[idx-window_size:idx])

class energy_price_env(gym.Env):
    
    # ### Action Space
    # The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction of the fixed force the cart is pushed with.
    # | Num | Action                 |
    # |-----|------------------------|
    # | 0   | Push cart to the left  |
    # | 1   | Push cart to the right |
    
    def __init__(self, start_energy=1, start_time=0, max_time = 7*24*2):
        self.action_space = gym.spaces.Discrete(3)
        # current_price, mean_price, current_energy
        self.observation_space = gym.spaces.Box(low=np.array([-np.inf, -np.inf, 0]),
                                                high=np.array([np.inf, np.inf, 1]),
                                                dtype=np.float32)
        # our state is the charge
        self.start_energy = start_energy
        self.state = np.array([get_price(start_time), get_mean_price(start_time), start_energy])
        self.time = start_time
        self.earnings = 0
        self.power = 1 #MW
        self.capacity = 1 #MWh
        self.efficiency = 0.85 
        self.max_time = max_time
        
    def get_state(self):
        return self.state
    
    def step(self, action):
        current_price, mean_price, current_energy = self.state
        mapped_action = env2human(action)
        if mapped_action == -1:
            # discharge === selling for 30 mins
            new_energy = current_energy - (self.power * 0.5)
            
        elif mapped_action == 0:
            # hold === do nothing
            new_energy = current_energy
        elif mapped_action == 1:
            # charge === buy energy for 30 mins
            new_energy = current_energy + (self.power * 0.5 * self.efficiency)
        
        
        # make sure energy cannot be greater than capacity
        new_energy = max(0, new_energy)
        new_energy = min(self.capacity, new_energy)
        # now work out the delta energy
        delta_energy = (new_energy - current_energy)
        
        
        revenue = - delta_energy * current_price
        self.earnings += revenue
        expected_profit = - delta_energy * mean_price
        
        opportunity_cost = revenue - expected_profit
        
        reward = opportunity_cost # profit * multiplier * price_diff_from_expected

        # print("Delta energy: ", delta_energy)
        # print("Price diff from expected: ", price_diff_from_expected)
        # print("Revenue: ", revenue)
        # print("Expected Profit: ", expected_profit)
        # print("Reward ", reward)
        
        # increase the time
        self.time += 1
        
        self.state = (get_price(self.time), get_mean_price(self.time), new_energy)
        
        info = {}
        if self.time >= self.max_time:
            done = True
        else:
            done = False
        
        return np.array(self.state), reward, done, info
        
    def reset(self):
        # this resets the environment so it can try again
        # print('Environment reset')
        self.state = np.array([get_price(0), get_mean_price(0), self.start_energy])
        self.time = 0
        self.earnings = 0
        return self.state

def humans2env(action):
    return int(action+1)

def env2human(action):
    return int(action-1)

check_env(energy_price_env(1), warn=True)



In [8]:
from stable_baselines3.common.vec_env import DummyVecEnv

# env = DummyVecEnv([lambda: energy_price_env(1)])
env = energy_price_env()
model = PPO(MlpPolicy, env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [10]:
# Random Agent, before training
import matplotlib.pyplot as plt

def evaluate(model, start_time=0, max_time=None, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to evaluate it
    :return: (float) Mean reward for the last num_episodes
    """
    # This function will only work for a single Environment
    env = model.get_env()
    env.reset(start_time = start_time)
    if max_time is not None:
        env.max_time = max_time
    all_episode_rewards = []
    
    for i in range(num_episodes):
        
        episode_rewards = []
        if i == 0:
            current_prices = []
            mean_prices = []
            current_energies = []
            all_earnings = [0]
        
        done = False
        obs = env.reset(start_time = start_time)
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step(action)
            current_price, mean_price, current_energy = obs[0,0], obs[0,1], obs[0,2]
            episode_rewards.append(reward)
            
            if i == 0:
                if len(current_energies)>0:
                    all_earnings.append(-current_price*(current_energy-current_energies[-1]))
                
                current_prices.append(current_price)
                mean_prices.append(mean_price)
                current_energies.append(current_energy)

        all_episode_rewards.append(sum(episode_rewards))

    fig, axs = plt.subplots(4,1,sharex = True)
    index = np.arange(0, len(current_energies))
    cum_rewards = np.cumsum(episode_rewards)
    bank_total = np.cumsum(all_earnings)
    axs[0].plot(index, cum_rewards, color = 'red', label='Cumalative rewards')
    axs[0].plot(index, bank_total, color = 'blue', label='Bank total')
    axs[0].legend()
    axs[1].plot(index, current_prices, color = 'blue', label='Current prices')
    axs[1].plot(index, mean_prices, color = 'red', label='Mean prices')
    axs[1].legend()
    
    axs[2].plot(index, episode_rewards, color = 'black', label = 'Reward')
    axs[2].legend()
    
    axs[3].plot(index, current_energies, color = 'blue', label='Current energies')

    
    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

mean_reward_before_train = evaluate(model, num_episodes=1)

TypeError: reset() got an unexpected keyword argument 'start_time'

In [None]:
# Train the agent for 10000 steps
model.learn(total_timesteps=1000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 336      |
|    ep_rew_mean     | 40.5     |
| time/              |          |
|    fps             | 1541     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x21cbdbbb370>

In [None]:
# Trained Agent, after training
mean_reward_after_train = evaluate(model, num_episodes=1)

Mean reward: 57.600433 Num episodes: 1


In [None]:
periods_in_week = 7*24*2
# set the start of the test data
obs = env.set_time(periods_in_week)
# how far into the future
for i in range(periods_in_week):
    action, _states = model.predict(obs)
    env.step(action)

AttributeError: 'energy_price_env' object has no attribute 'set_time'