# 0. Install Dependencies

In [5]:
# Uncomment the lines below to install the dependencies.
# NOTE(review): the code in this notebook imports `gymnasium` and `stable_baselines3`;
# confirm whether tensorflow, gym, keras and keras-rl2 are still required.
# !pip install tensorflow
# !pip install gym
# !pip install keras
# !pip install keras-rl2
# !pip install stable-baselines3

# 1. Create an environment that follows the Gym interface supported by SB3

* https://www.youtube.com/watch?v=cO5g5qLrLSo&ab_channel=NicholasRenotte
* https://www.youtube.com/watch?v=bD6V3rcr_54&list=PLgNJO2hghbmjlE6cuKMws2ejC54BTAaWV&index=3

In [6]:
import random
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class TaxiChargingEnv(gym.Env):
    """Gymnasium environment for charging a taxi battery over a fixed time window.

    Each episode lasts ``EPISODE_STEPS`` steps of 15 minutes. At every step the
    agent picks one of four charging powers (0, 7, 14 or 22 kW). Every step
    incurs a cost of ``alpha * exp(power / 10)``, which is returned as a
    negative reward. On the final step an energy demand is drawn from a normal
    distribution with mean ``mu`` and standard deviation ``sigma``; if the
    battery level is below that demand, ``UNMET_DEMAND_PENALTY`` is subtracted
    from the reward as well.

    The observation is a single float32 value: the current battery level (kWh).

    All randomness (initial battery level and final demand) is drawn from
    ``self.np_random`` so that ``reset(seed=...)`` makes episodes reproducible.
    """

    # Charging power (kW) selected by each discrete action index.
    CHARGING_RATES_KW = (0, 7, 14, 22)
    # Number of 15-minute decision steps per episode (2 hours).
    EPISODE_STEPS = 8
    # Inclusive bounds (kWh) of the random initial battery level.
    INITIAL_LEVEL_RANGE = (20, 30)
    # Amount subtracted from the final reward when the demand is not met.
    UNMET_DEMAND_PENALTY = 1000

    def __init__(self):
        super().__init__()
        # Actions we take: 0 kW, 7 kW, 14 kW, 22 kW
        self.action_space = spaces.Discrete(len(self.CHARGING_RATES_KW))
        # State represents battery charge level (kWh)
        self.observation_space = spaces.Box(low=np.array([0]), high=np.array([100]), dtype=np.float32)
        # Battery capacity
        self.battery_capacity = 100.0  # kWh
        # Energy demand parameters
        self.mu = 30.0  # kWh
        self.sigma = 5.0  # kWh
        # Coefficient of the cost function
        self.alpha = 1
        # Initial battery level and remaining steps of the charging period
        self.battery_level = self._sample_initial_battery_level()
        self.steps_left = self.EPISODE_STEPS
        # Keep the stored state in the same array form that step()/reset() return
        self.state = np.array([self.battery_level], dtype=np.float32)

    def _sample_initial_battery_level(self):
        """Draw the initial battery level (kWh) as a float from the env's seeded RNG."""
        low, high = self.INITIAL_LEVEL_RANGE
        return float(self.np_random.integers(low, high, endpoint=True))

    def step(self, action):
        """Apply one 15-minute charging action.

        Args:
            action: Index into ``CHARGING_RATES_KW`` (0..3).

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)`` where
            ``observation`` is a float32 array holding the battery level,
            ``reward`` is the negative charging cost (minus the penalty if the
            final demand is not met), ``truncated`` is always ``False`` and
            ``info`` contains ``"battery_level"`` and ``"cost"``.
        """
        power = self.CHARGING_RATES_KW[action]

        # Charging cost: alpha * e^(power / 10); cast to a plain Python float
        cost = float(self.alpha * np.exp(power / 10))

        # Each step is 15 minutes, so the energy added is power * 1 hour / 4,
        # capped at the battery capacity.
        self.battery_level = min(self.battery_level + power / 4, self.battery_capacity)

        # Reduce the steps left by 1
        self.steps_left -= 1

        # `<=` keeps the episode terminated if step() is called again without reset()
        terminated = self.steps_left <= 0

        # Every step incurs the charging cost
        reward = -cost
        if terminated:
            # At the end of the charging period, check the battery against the demand
            demand = self.np_random.normal(self.mu, self.sigma)
            if self.battery_level < demand:
                # High penalty for not meeting demand
                reward -= self.UNMET_DEMAND_PENALTY

        # Always set truncated to False in this environment
        truncated = False

        # State is the current battery level
        self.state = np.array([self.battery_level], dtype=np.float32)

        # Info dictionary
        info = {
            "battery_level": self.battery_level,
            "cost": cost
        }

        return self.state, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: Accepted for interface compatibility; not used.

        Returns:
            Tuple ``(observation, info)`` with the initial battery level as a
            float32 array and an empty info dict.
        """
        super().reset(seed=seed)
        # Reset battery level (drawn after seeding so the seed takes effect)
        self.battery_level = self._sample_initial_battery_level()
        # Reset steps left
        self.steps_left = self.EPISODE_STEPS
        # Reset state
        self.state = np.array([self.battery_level], dtype=np.float32)
        return self.state, {}

    def render(self, mode='human', close=False):
        """Placeholder for visualization; currently does nothing."""
        pass

In [7]:
# Create the environment instance used by the checks and tests in the following cells
env = TaxiChargingEnv()

# 2. Validate and test the environment

In [8]:
# Check that the environment follows the Gym interface that SB3 supports
from stable_baselines3.common.env_checker import check_env
check_env(env)

In [9]:
# Start a fresh episode; the info dict returned by reset() is not needed here.
obs, _ = env.reset()
# render() is a no-op in TaxiChargingEnv (its body is just `pass`).
env.render()

# Show the observation space, the action space, and one sampled action.
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# Random agent: take random actions until the episode ends.
# n_steps matches the 8 steps per episode configured in TaxiChargingEnv.
n_steps = 8
for step in range(n_steps):
    action = env.action_space.sample()  # Random action
    print(f"Step {step + 1}")
    obs, reward, terminated, truncated, info = env.step(action)
    # The episode is over when it either terminates or is truncated.
    done = terminated or truncated
    print("obs=", obs, "reward=", reward, "done=", done, "info=", info)
    env.render()
    if done:
        # NOTE(review): this message is printed whenever the episode ends,
        # whether or not the energy demand was met.
        print("Goal reached!", "reward=", reward)
        break

Box(0.0, 100.0, (1,), float32)
Discrete(4)
3
Step 1
obs= [21.75] reward= -2.0137527074704766 done= False info= {'battery_level': 21.75, 'cost': 2.0137527074704766}
Step 2
obs= [23.5] reward= -2.0137527074704766 done= False info= {'battery_level': 23.5, 'cost': 2.0137527074704766}
Step 3
obs= [29.] reward= -9.025013499434122 done= False info= {'battery_level': 29.0, 'cost': 9.025013499434122}
Step 4
obs= [34.5] reward= -9.025013499434122 done= False info= {'battery_level': 34.5, 'cost': 9.025013499434122}
Step 5
obs= [34.5] reward= -1.0 done= False info= {'battery_level': 34.5, 'cost': 1.0}
Step 6
obs= [34.5] reward= -1.0 done= False info= {'battery_level': 34.5, 'cost': 1.0}
Step 7
obs= [34.5] reward= -1.0 done= False info= {'battery_level': 34.5, 'cost': 1.0}
Step 8
obs= [34.5] reward= -1.0 done= True info= {'battery_level': 34.5, 'cost': 1.0}
Goal reached! reward= -1.0


# 3. Stable-Baselines

In [10]:
from stable_baselines3.common.env_util import make_vec_env

# Pass the environment class (not a lambda returning the existing `env` object)
# so the vectorized wrapper builds its own fresh instance. Training then does
# not share mutable state with the `env` used for the manual tests, and raising
# n_envs would create independent environments instead of aliasing one object.
vec_env = make_vec_env(TaxiChargingEnv, n_envs=1)

In [11]:
from stable_baselines3 import PPO, A2C, DQN

# Train the agent using PPO.
# A fixed seed is passed so that a Restart & Run All reproduces the same
# training run; change the value to explore run-to-run variance.
model = PPO("MlpPolicy", vec_env, verbose=1, seed=42)
model.learn(total_timesteps=100000)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 8        |
|    ep_rew_mean     | -81.3    |
| time/              |          |
|    fps             | 6552     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8           |
|    ep_rew_mean          | -48.9       |
| time/                   |             |
|    fps                  | 4450        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.017819522 |
|    clip_fraction        | 0.288       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -6.65e-05   |
|    learning

<stable_baselines3.ppo.ppo.PPO at 0x3136edd30>

In [12]:
# After training, save the model under the path "ppo_taxi_charging"
model.save("ppo_taxi_charging")

In [13]:
# Reload the model saved in the previous cell, replacing the in-memory `model`
model = PPO.load("ppo_taxi_charging")

In [14]:
# Run the trained model for one episode on the vectorized environment
state = vec_env.reset()
done = False
total_reward = 0

# NOTE(review): vec_env.step() returns per-environment arrays, so `done` and
# `reward` are arrays here; using `done` directly as the loop condition relies
# on there being exactly one environment (n_envs=1).
while not done:
    # NOTE(review): predict() is called without `deterministic=True`; presumably
    # actions are then sampled from the policy — confirm this is intended for evaluation.
    action, _states = model.predict(state)
    state, reward, done, info = vec_env.step(action)
    total_reward += reward

# total_reward ends up as an array, which is why the output is printed in brackets.
print(f"Total Reward: {total_reward}")

Total Reward: [-18.151468]


# Compare with random action

1.  The code below shows the total reward of each episode when actions are chosen at random for 50 episodes, as well as the average over those 50 episodes.
2.  If a test episode includes a case where the energy demand is higher than the energy available in the battery, the reward for that episode will be extremely low.

In [26]:
def test_environment(env, num_episodes=50):
    rewards = []
    
    for episode in range(num_episodes):
        state, _ = env.reset()  # Reset the environment to start a new episode
        done = False
        total_reward = 0
        
        while not done:
            action = env.action_space.sample()  # Take a random action
            next_state, reward, done, truncated, info = env.step(action)
            total_reward += reward  # Accumulate the reward
            state = next_state  # Move to the next state
        
        rewards.append(total_reward)  # Store the total reward for this episode
        print(f"Episode {episode + 1} - Total Reward: {total_reward}")

    average_reward = np.mean(rewards)  # Calculate the average reward
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")

# Test the environment with random actions for 50 episodes
test_environment(env, num_episodes=50)

Episode 1 - Total Reward: -25.190613399968147
Episode 2 - Total Reward: -39.199193139462196
Episode 3 - Total Reward: -30.14648508812435
Episode 4 - Total Reward: -31.160237795594824
Episode 5 - Total Reward: -24.162918848064425
Episode 6 - Total Reward: -32.187932347498545
Episode 7 - Total Reward: -40.21294584693267
Episode 8 - Total Reward: -28.231871522379578
Episode 9 - Total Reward: -27.344952508538526
Episode 10 - Total Reward: -21.107718881219753
Episode 11 - Total Reward: -41.22669855440314
Episode 12 - Total Reward: -42.25439310630687
Episode 13 - Total Reward: -38.171498587558474
Episode 14 - Total Reward: -22.248305282319652
Episode 15 - Total Reward: -34.22937960687274
Episode 16 - Total Reward: -27.2181188149091
Episode 17 - Total Reward: -25.176671555534902
Episode 18 - Total Reward: -20.08002432931603
Episode 19 - Total Reward: -21.10771888121975
Episode 20 - Total Reward: -30.273318781753773
Episode 21 - Total Reward: -1018.1793526080046
Episode 22 - Total Reward: -24.