# 0. Install Dependencies

In [39]:
# !pip install tensorflow
# !pip install gym
# !pip install keras
# !pip install keras-rl2
# !pip install stable-baselines3

# 1. Create environment follows Gym that SB3 supports

* https://www.youtube.com/watch?v=cO5g5qLrLSo&ab_channel=NicholasRenotte
* https://www.youtube.com/watch?v=bD6V3rcr_54&list=PLgNJO2hghbmjlE6cuKMws2ejC54BTAaWV&index=3

In [27]:
import random
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class TaxiChargingEnv(gym.Env):
    def __init__(self):
        super(TaxiChargingEnv, self).__init__()
        # Actions we take: 0 kW, 7 kW, 14 kW, 22 kW
        self.action_space = spaces.Discrete(4)
        # State represents battery charge level (kWh)
        self.observation_space = spaces.Box(low=np.array([0]), high=np.array([100]), dtype=np.float32)
        # Initialize state
        self.state = 0.0
        # Battery capacity
        self.battery_capacity = 100.0  # kWh
        # Initial battery level
        self.battery_level = float(random.randint(20, 30))
        # Charging period (2 hours, every 15 minutes = 8 steps)
        self.steps_left = 8
        # Energy demand parameters
        self.mu = 30.0  # kWh
        self.sigma = 5.0  # kWh
        # Time coefficient for cost function
        self.alpha = 1

    def step(self, action):
        charging_rates = [0, 7, 14, 22]  # kW for each action
        power = charging_rates[action]

        # Calculate cost: charging cost (t,p) = 𝛼𝑡 * e^𝑝
        cost = self.alpha * np.exp(power/10)

        # Update battery level
        self.battery_level = min(self.battery_level + power / 4, self.battery_capacity) # each time 15 min, power * 15min = power * 1hour / 4

        # Reduce the steps left by 1
        self.steps_left -= 1

        # If steps are finished, calculate if the battery meets the demand
        if self.steps_left == 0:
            demand = np.random.normal(self.mu, self.sigma)
            if self.battery_level >= demand:
                reward = -cost  # Minimize cost, no penalty for meeting demand
            else:
                reward = -cost - 1000  # High penalty for not meeting demand
            terminated = True
        else:
            reward = -cost  # Incur cost for charging, continue episode
            terminated = False

        # Always set truncated to False in this environment
        truncated = False

        # State is the current battery level
        self.state = np.array([self.battery_level], dtype=np.float32)

        # Info dictionary
        info = {
            "battery_level": self.battery_level,
            "cost": cost
        }

        return self.state, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        # Reset battery level
        self.battery_level = float(random.randint(20, 30))
        # Reset steps left
        self.steps_left = 8
        # Reset state
        self.state = np.array([self.battery_level], dtype=np.float32)
        return self.state, {}

    def render(self, mode='human', close=False):
        # For visualization, if needed
        pass

In [28]:
# Create the environment
env = TaxiChargingEnv()

# 2. Validate and test the environment

In [29]:
#Check if the environment follow Gym interface that SB3 support
from stable_baselines3.common.env_checker import check_env
check_env(env)

In [30]:
obs, _ = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# Random agent
n_steps = 8
for step in range(n_steps):
    action = env.action_space.sample()  # Random action
    print(f"Step {step + 1}")
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print("obs=", obs, "reward=", reward, "done=", done, "info=", info)
    env.render()
    if done:
        print("Goal reached!", "reward=", reward)
        break

Box(0.0, 100.0, (1,), float32)
Discrete(4)
3
Step 1
obs= [31.5] reward= -4.0551999668446745 done= False info= {'battery_level': 31.5, 'cost': 4.0551999668446745}
Step 2
obs= [35.] reward= -4.0551999668446745 done= False info= {'battery_level': 35.0, 'cost': 4.0551999668446745}
Step 3
obs= [38.5] reward= -4.0551999668446745 done= False info= {'battery_level': 38.5, 'cost': 4.0551999668446745}
Step 4
obs= [44.] reward= -9.025013499434122 done= False info= {'battery_level': 44.0, 'cost': 9.025013499434122}
Step 5
obs= [49.5] reward= -9.025013499434122 done= False info= {'battery_level': 49.5, 'cost': 9.025013499434122}
Step 6
obs= [49.5] reward= -1.0 done= False info= {'battery_level': 49.5, 'cost': 1.0}
Step 7
obs= [51.25] reward= -2.0137527074704766 done= False info= {'battery_level': 51.25, 'cost': 2.0137527074704766}
Step 8
obs= [56.75] reward= -9.025013499434122 done= True info= {'battery_level': 56.75, 'cost': 9.025013499434122}
Goal reached! reward= -9.025013499434122


# 3. Stable-Baselines

In [31]:
from stable_baselines3.common.env_util import make_vec_env
vec_env = make_vec_env(lambda: env, n_envs=1)

In [32]:
from stable_baselines3 import PPO, A2C, DQN

# Train the agent using PPO
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=100000)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 8        |
|    ep_rew_mean     | -53.8    |
| time/              |          |
|    fps             | 6302     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8           |
|    ep_rew_mean          | -68.8       |
| time/                   |             |
|    fps                  | 4138        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.018566348 |
|    clip_fraction        | 0.282       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | 9.25e-05    |
|    learning

<stable_baselines3.ppo.ppo.PPO at 0x17e1ee4c0>

In [53]:
# After training, save the model
model.save("ppo_taxi_charging")

# To load and use the model later
model = PPO.load("ppo_taxi_charging")

In [64]:
# Run the model
state = vec_env.reset()
done = False
total_reward = 0

while not done:
    action, _states = model.predict(state)
    state, reward, done, info = vec_env.step(action)
    total_reward += reward

print(f"Total Reward: {total_reward}")

Total Reward: [-18.151468]


# code below show choosing random action for 200 episodes, the average reward is  -81.30168659640917

In [52]:
# def test_environment(env, num_episodes=20):
#     rewards = []
    
#     for episode in range(num_episodes):
#         state, _ = env.reset()  # Reset the environment to start a new episode
#         done = False
#         total_reward = 0
        
#         while not done:
#             action = env.action_space.sample()  # Take a random action
#             next_state, reward, done, truncated, info = env.step(action)
#             total_reward += reward  # Accumulate the reward
#             state = next_state  # Move to the next state
        
#         rewards.append(total_reward)  # Store the total reward for this episode
#         print(f"Episode {episode + 1} - Total Reward: {total_reward}")

#     average_reward = np.mean(rewards)  # Calculate the average reward
#     print(f"Average Reward over {num_episodes} episodes: {average_reward}")

# # Test the environment with random actions for 20 episodes
# test_environment(env, num_episodes=20)

Episode 1 - Total Reward: -25.176671555534902
Episode 2 - Total Reward: -27.2181188149091
Episode 3 - Total Reward: -37.28457957371742
Episode 4 - Total Reward: -13.068763537352384
Episode 5 - Total Reward: -37.157745880087994
Episode 6 - Total Reward: -37.157745880087994
Episode 7 - Total Reward: -25.176671555534902
Episode 8 - Total Reward: -36.27082686624694
Episode 9 - Total Reward: -32.187932347498545
Episode 10 - Total Reward: -60.21903367091989
Episode 11 - Total Reward: -43.268145813777345
Episode 12 - Total Reward: -25.162729711101655
Episode 13 - Total Reward: -37.157745880087994
Episode 14 - Total Reward: -19.193105315474977
Episode 15 - Total Reward: -36.2568850218137
Episode 16 - Total Reward: -19.193105315474977
Episode 17 - Total Reward: -31.174179640028072
Episode 18 - Total Reward: -39.32602683309162
Episode 19 - Total Reward: -21.234552574849175
Episode 20 - Total Reward: -1023.1491661405939
Average Reward over 20 episodes: -81.30168659640917
