In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import math
from time import time

import numpy as np
import torch
from IPython.display import clear_output
from matplotlib import pyplot as plt
from sb3_contrib.tqc.policies import TQCPolicy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.sac.policies import SACPolicy
from stable_baselines3.td3.policies import TD3Policy
from ev2gym.models.ev2gym_env import EV2Gym
from ev2gym.rl_agent.state import V2G_profit_max_loads

from logger.evaluation_log import save_eval_log
from logger.test_log import save_test_log

# Settings

In [3]:
# Base random seed; passed as `seed=` to the environments and agents created below.
SEED = 42

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Using device:", device)

In [5]:
# Training budget handed to every agent's learn() call.
Total_timesteps = int(1 * 1e6)
# Period (in timesteps) used by the training callbacks to open a new reward bucket and redraw.
EVAL_FREQ = 2048
# Number of episodes test() runs on one environment.
TEST_EPISODES = 100
# Number of environments test_envs() builds and evaluates on.
TEST_ENV_COUNT = 5
# When True, agents are created with the custom `init_weights` initialisation.
INIT_WEIGHTS = False

# EV2Gym Settings

In [6]:
# Scenario configuration file passed to every EV2Gym(...) constructor in this notebook.
config_file = "./config_files/V2GProfitPlusLoads20.yaml"

In [7]:
def reward_function(env, total_costs, user_satisfaction_list, *args):
    """Compute the reward from costs, transformer overload and user satisfaction.

    The reward starts at ``total_costs`` and is reduced by two penalties:
    ``100 * tr.get_how_overloaded()`` for every transformer in
    ``env.transformers`` and ``100 * exp(-10 * score)`` for every score in
    ``user_satisfaction_list``.

    Args:
        env: Object exposing a ``transformers`` iterable whose items provide
            ``get_how_overloaded()``.
        total_costs: Starting value of the reward.
        user_satisfaction_list: Iterable of numeric satisfaction scores.
        *args: Accepted for signature compatibility and ignored.

    Returns:
        The penalised reward.
    """
    penalised = total_costs

    # Overload penalty, subtracted transformer by transformer.
    for transformer in env.transformers:
        overload_penalty = 100 * transformer.get_how_overloaded()
        penalised -= overload_penalty

    # Satisfaction penalty: shrinks exponentially as the score grows.
    for satisfaction in user_satisfaction_list:
        satisfaction_penalty = 100 * math.exp(-10 * satisfaction)
        penalised -= satisfaction_penalty

    return penalised

In [8]:
# Training environment shared by every agent below; also the default
# environment of test(). Rendering, plot saving and replay saving are disabled.
env = EV2Gym(config_file, 
             render_mode=False,
             seed=SEED,
             save_plots=False,
             state_function=V2G_profit_max_loads,
             reward_function=reward_function,
             save_replay=False)

# Value of the environment's `cs` attribute (presumably the number of charging
# stations — TODO confirm); used as the prefix of the log names passed to
# save_eval_log / save_test_log.
N_CS = env.cs

In [None]:
# Same configuration as `env` but with render_mode=True.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
env_only_render = EV2Gym(config_file, 
                        render_mode=True,
                        seed=SEED,
                        save_plots=False,
                        state_function=V2G_profit_max_loads,
                        reward_function=reward_function,
                        save_replay=False)

# PreLoad

### Policy

In [10]:
def init_weights(m):
    """Re-initialise a ``torch.nn.Linear`` layer in place.

    Weights get Xavier-uniform values and the bias (when the layer has one)
    is filled with the constant ``-1``. Any other module type is left
    untouched, which makes the function suitable for ``Module.apply``.

    Args:
        m: Module visited by ``apply``.
    """
    if not isinstance(m, torch.nn.Linear):
        return

    torch.nn.init.xavier_uniform_(m.weight)

    bias = m.bias
    if bias is None:
        return
    torch.nn.init.constant_(bias, -1)

class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy whose networks are re-initialised with ``init_weights``."""

    def _build(self, lr_schedule):
        """Build the policy through the parent class, then overwrite the initial weights.

        Args:
            lr_schedule: Forwarded unchanged to ``ActorCriticPolicy._build``.
        """
        super()._build(lr_schedule)

        # Apply the custom initialisation to the shared MLP extractor and both heads.
        for network in (self.mlp_extractor, self.action_net, self.value_net):
            network.apply(init_weights)

class CustomSACPolicy(SACPolicy):
    """SAC policy whose actor and critic are re-initialised with ``init_weights``."""

    def __init__(self, *args, **kwargs):
        """Build the regular SAC policy, then overwrite the initial weights.

        All arguments are forwarded unchanged to ``SACPolicy.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Apply the custom initialisation.
        self.actor.latent_pi.apply(init_weights)
        self.critic.apply(init_weights)
        self.actor.mu.apply(init_weights)

        # The parent constructor already copied the *original* critic weights
        # into the target critic. Re-sync it so training starts with a target
        # that matches the freshly initialised critic.
        self.critic_target.load_state_dict(self.critic.state_dict())

class CustomTD3Policy(TD3Policy):
    """TD3 policy whose actor and critic are re-initialised with ``init_weights``."""

    def __init__(self, *args, **kwargs):
        """Build the regular TD3 policy, then overwrite the initial weights.

        All arguments are forwarded unchanged to ``TD3Policy.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Apply the custom initialisation.
        self.actor.apply(init_weights)
        self.critic.apply(init_weights)

        # The parent constructor already copied the *original* weights into the
        # target networks. Re-sync both so training starts with targets that
        # match the freshly initialised actor and critic.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

class CustomTQCPolicy(TQCPolicy):
    """TQC policy whose actor and critic are re-initialised with ``init_weights``."""

    def __init__(self, *args, **kwargs):
        """Build the regular TQC policy, then overwrite the initial weights.

        All arguments are forwarded unchanged to ``TQCPolicy.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Apply the custom initialisation.
        self.actor.latent_pi.apply(init_weights)
        self.critic.apply(init_weights)
        self.actor.mu.apply(init_weights)

        # The parent constructor already copied the *original* critic weights
        # into the target critic. Re-sync it so training starts with a target
        # that matches the freshly initialised critic.
        self.critic_target.load_state_dict(self.critic.state_dict())

### Plot

In [11]:
def display_plot(rewards1=(), rewards2=(), title="Episode", label1="Train", label2="Test", erase = True):
    """Print the latest reward of up to two series and plot them.

    Args:
        rewards1: First reward series; skipped when empty.
        rewards2: Second reward series; skipped when empty.
        title: Used as the x-axis label and as prefix of the y-axis label.
        label1: Legend / printout label of ``rewards1``.
        label2: Legend / printout label of ``rewards2``.
        erase: When True, clear the cell output before drawing.
    """
    if erase:
        clear_output(wait=True)

    plt.ylabel(f'{title} reward')
    plt.xlabel(title)

    has_curve = False
    for rewards, label in ((rewards1, label1), (rewards2, label2)):
        if len(rewards) > 0:
            print(f"Last {label} reward: {rewards[-1]:.2e}", end=" ")
            plt.plot(rewards, label=label)
            has_curve = True

    # Terminate the "Last ... reward" line (or emit an empty one).
    print()

    # Calling legend() without any labelled curve only produces a warning,
    # which happens on the first callback of every training run.
    if has_curve:
        plt.legend()
    plt.show()

### Callback

In [12]:
# NOTE(review): not referenced by any code visible in this notebook.
EVAL_DETERMINISTIC = False

class SBCallback(BaseCallback):
    """Stable-Baselines callback that buckets training rewards and plots them.

    Rewards are accumulated into the last element of ``eval_rewards``. Every
    time ``num_timesteps % eval_freq == 1`` a new bucket is opened and the
    completed buckets are redrawn with ``display_plot``.

    Args:
        total_timesteps: Total training budget; only used for the progress line.
        eval_freq: Bucket length in timesteps.
        verbose: Forwarded to ``BaseCallback``.
    """

    def __init__(self,total_timesteps, eval_freq, verbose=0):
        super().__init__(verbose)
        self.eval_freq = eval_freq
        self.eval_rewards = [0]
        self.total_timesteps = total_timesteps

    def _on_step(self):
        """Accumulate the step reward, redraw on bucket boundaries, print progress.

        Returns:
            Always True, so training is never interrupted by this callback.
        """
        # `rewards` is an array (one entry per environment in SB3 vectorised
        # envs). float() on a non-0-d array is deprecated since NumPy 1.25 and
        # fails for more than one element, so reduce it to a scalar first.
        self.eval_rewards[-1] += float(np.sum(self.locals["rewards"]))

        if self.num_timesteps % self.eval_freq == 1:
            self.eval_rewards.append(0)
            display_plot(self.get_data(), title="Evaluation")

        print(f"Step: {self.num_timesteps}/{self.total_timesteps} {int(self.num_timesteps * 100 / self.total_timesteps)}%", end="\r")
        return True
    
    def get_data(self):
        """Return the reward buckets without the first and the (still open) last one."""
        return self.eval_rewards[1:-1]

In [13]:
class PlotCallback:
    """Callable that buckets training rewards and redraws the reward curve.

    Rewards are added to the last element of ``eval_rewards``; whenever
    ``num_timesteps % eval_freq == 1`` a new bucket is opened and the
    completed buckets are plotted with ``display_plot``.
    """

    def __init__(self, eval_freq):
        """Store the bucket length and start with a single empty bucket."""
        self.eval_freq = eval_freq
        self.eval_rewards = [0]

    def __call__(self, reward, num_timesteps):
        """Record ``reward`` and redraw when a new bucket starts.

        Args:
            reward: Reward to add to the current bucket.
            num_timesteps: Current timestep counter of the training loop.
        """
        self.eval_rewards[-1] += reward

        starts_new_bucket = (num_timesteps % self.eval_freq) == 1
        if not starts_new_bucket:
            return

        self.eval_rewards.append(0)
        display_plot(self.get_data(), title="Evaluation")

    def get_data(self):
        """Return the reward buckets without the first and the last element."""
        return self.eval_rewards[1:-1]

### Test

In [14]:
def test(agent, test_env = env, name=""):
    """Run ``TEST_EPISODES`` deterministic episodes and aggregate their final stats.

    Episode ``i`` is started with ``test_env.reset(seed=SEED+1+i)``. After the
    last step of each episode the ``info`` dict returned by ``step`` is kept
    as that episode's statistics. The per-episode rewards are plotted with
    ``display_plot``.

    Args:
        agent: Object exposing ``predict(state, deterministic=True)`` returning
            ``(action, _)``.
        test_env: Environment to evaluate on (defaults to the notebook's ``env``).
        name: Label stored in the result.

    Returns:
        dict with the label, the number of episodes, per-episode means of
        profits, user satisfaction (multiplied by 100), charged / discharged
        energy, transformer overload, execution time and reward (divided by
        1000), the sums of the three battery-degradation stats, and the list
        of per-episode rewards.
    """
    test_episode_rewards = []
    stats = []
    execute_times = []
    for i in range(TEST_EPISODES):
        state, _ = test_env.reset(seed=SEED+1+i)
        episode_reward = 0
        done = False
        start_time = time()
        while not done:
            action, _ = agent.predict(state, deterministic=True)
            state, reward, terminated, truncated, info = test_env.step(action)
            # An episode ends on termination OR truncation; looking only at the
            # first flag would loop forever on a truncated episode.
            done = terminated or truncated
            episode_reward += reward
        execute_times.append(time()-start_time)
        stats.append(info)
        test_episode_rewards.append(episode_reward)
    display_plot([], test_episode_rewards, title="Episode", erase=False)

    def _total(key):
        # Sum of one statistic over all episodes.
        return sum([stat[key] for stat in stats])

    def _mean(key):
        # Per-episode mean of one statistic.
        return _total(key)/len(stats)

    result = {
        "name" : name,
        "num_episodes" : len(stats),
        "profits" : _mean('total_profits'),
        "user_satisfaction" : _mean('average_user_satisfaction') * 100,
        "energy_charged" : _mean('total_energy_charged'),
        "energy_discharged" : _mean('total_energy_discharged'),
        "transformer_overload" : _mean('total_transformer_overload'),
        # The degradation figures are totals over all episodes, not means.
        "battery_degradation" : _total('battery_degradation'),
        "battery_degradation_calendar" : _total('battery_degradation_calendar'),
        "battery_degradation_cycling" : _total('battery_degradation_cycling'),
        "execution_time" : sum(execute_times)/len(execute_times),
        "reward" : _mean('total_reward')/1000,
        "test_episode_rewards" : test_episode_rewards
    }

    return result

def print_test_result(result):
    """Print a human-readable summary of one ``test()`` result.

    Args:
        result: Mapping with the keys ``name``, ``num_episodes``, ``profits``,
            ``user_satisfaction``, ``energy_charged``, ``energy_discharged``,
            ``transformer_overload``, ``battery_degradation``,
            ``battery_degradation_calendar``, ``battery_degradation_cycling``,
            ``execution_time`` and ``reward``.
    """
    def unchanged(value):
        # Values formatted with a float specifier are printed as stored.
        return value

    # (format string, result key, conversion applied before formatting);
    # integer-formatted entries are truncated with int() first.
    report_rows = (
        ("Profits/ Costs(€): %.1f", "profits", unchanged),
        ("User satisfaction(%%): %d", "user_satisfaction", int),
        ("Energy Charged(kWh): %d", "energy_charged", int),
        ("Energy Discharged(kWh): %d", "energy_discharged", int),
        ("Transformer Overload(kWh): %d", "transformer_overload", int),
        ("Total Battery Capacity Loss(e-3): %.2f", "battery_degradation", unchanged),
        ("Total Battery Degradation Calender(e-3): %.2f", "battery_degradation_calendar", unchanged),
        ("Total Battery Degradation Cycle(e-3): %.2f", "battery_degradation_cycling", unchanged),
        ("Execution Time of episodes(s): %.2f", "execution_time", unchanged),
        ("Reward(e3): %.2f", "reward", unchanged),
    )

    print("=====================================================")
    print(f' Average stats for {result["name"]} algorithm, {result["num_episodes"]} episodes')

    for template, key, convert in report_rows:
        print(template % convert(result[key]))

In [15]:
def test_envs(agent, name=""):
    """Evaluate ``agent`` on ``TEST_ENV_COUNT`` newly built environments.

    Environment number ``k`` (1-based) is created with seed ``SEED + k - 1``
    and evaluated with ``test``; each result is printed with
    ``print_test_result``.

    Args:
        agent: Agent forwarded to ``test``.
        name: Base label; the environment number is appended to it.

    Returns:
        List with one ``test`` result per environment, in creation order.
    """
    collected = []
    for env_index in range(TEST_ENV_COUNT):
        env_number = env_index + 1
        fresh_env = EV2Gym(
            config_file,
            render_mode=False,
            seed=SEED + env_index,
            save_plots=False,
            state_function=V2G_profit_max_loads,
            reward_function=reward_function,
            save_replay=False,
        )
        print(f"Testing environment {env_number}")
        outcome = test(agent, fresh_env, f"{name} - {env_number}")
        collected.append(outcome)
        print_test_result(outcome)
    return collected

In [16]:
# Collects one (reward curve, label) tuple per trained algorithm; plotted by draw_rewards_graphs.
algorithm_results = []

# Select Algorithms

In [17]:
# Names of the algorithms to train and test in this run. Each algorithm section
# below checks membership in this list to set its `run_*` flag; commented-out
# entries are skipped.
algorithms = [
    #  "PPO",
    #  "PPO_CAPG",
    #  "PPO_MAXA",
    #  "PPO_TANH",
    "TRPO",
    #  "SB3_TRPO",
    #  "SB3_SAC",
    # "SB3_TQC"
]

# Algorithms

##### PPO

In [18]:
from algorithms.Origin_PPO import Origin_PPO as PPO

# The agent is constructed unconditionally; it is only trained/tested when
# "PPO" is selected in `algorithms`.
ppo = PPO(
    env=env,
    actor_lr=0.00003,
    critic_lr=0.00003,
    gamma=0.99,
    epochs=10,
    eps_clip=0.2,
    rollout_length=2048,
    seed=SEED,
    device=device,
    init_weight=init_weights if INIT_WEIGHTS else None
)

# Gates the PPO training and test cells.
run_ppo = "PPO" in algorithms

In [19]:
if run_ppo:
    # Buckets the training rewards per EVAL_FREQ window and redraws the curve.
    train_ppo_eval_callback = PlotCallback(EVAL_FREQ)

    train_ppo = ppo.learn(
        total_timestamp=Total_timesteps, 
        eval_callback=train_ppo_eval_callback)
    
    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_ppo_result = (train_ppo_eval_callback.get_data(), "PPO")
    
    algorithm_results.append(train_ppo_result)
    save_eval_log(train_ppo_result, f"{N_CS}/PPO")
    

In [20]:
if run_ppo: 
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_ppo_result = test_envs(ppo, "PPO")
    save_test_log(test_ppo_result, f"{N_CS}/PPO")

##### PPO_CAPG

In [21]:
from algorithms.Origin_PPO_CAPG import Origin_PPO_CAPG as PPO_CAPG

# The agent is constructed unconditionally; it is only trained/tested when
# "PPO_CAPG" is selected in `algorithms`.
ppo_capg = PPO_CAPG(
    env=env,
    actor_lr=0.00003,
    critic_lr=0.00003,
    gamma=0.99,
    epochs=10,
    eps_clip=0.2,
    rollout_length=2048,
    seed=SEED,
    device=device,
    init_weight=init_weights if INIT_WEIGHTS else None
)

# Gates the PPO_CAPG training and test cells.
run_ppo_capg = "PPO_CAPG" in algorithms

In [22]:
if run_ppo_capg:

    # Buckets the training rewards per EVAL_FREQ window and redraws the curve.
    train_ppo_capg_eval_callback = PlotCallback(EVAL_FREQ)

    train_ppo_capg = ppo_capg.learn(
        total_timestamp=Total_timesteps, 
        eval_callback=train_ppo_capg_eval_callback
    )

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_ppo_capg_result = (train_ppo_capg_eval_callback.get_data(), "PPO_CAPG")

    algorithm_results.append(train_ppo_capg_result)
    save_eval_log(train_ppo_capg_result, f"{N_CS}/PPO_CAPG")

In [23]:
if run_ppo_capg: 
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_ppo_capg_result = test_envs(ppo_capg, "PPO_CAPG")
    save_test_log(test_ppo_capg_result, f"{N_CS}/PPO_CAPG")

##### PPO_MAXA

In [24]:
from algorithms.Origin_PPO_MAXA import Origin_PPO_MAXA as PPO_MAXA

# The agent is constructed unconditionally; it is only trained/tested when
# "PPO_MAXA" is selected in `algorithms`.
ppo_maxa = PPO_MAXA(
    env=env,
    actor_lr=0.00003,
    critic_lr=0.00003,
    gamma=0.99,
    epochs=10,
    eps_clip=0.2,
    rollout_length=2048,
    seed=SEED,
    device=device,
    init_weight=init_weights if INIT_WEIGHTS else None
)

# Gates the PPO_MAXA training and test cells.
run_ppo_maxa = "PPO_MAXA" in algorithms

In [25]:
if run_ppo_maxa:

    # Buckets the training rewards per EVAL_FREQ window and redraws the curve.
    train_ppo_maxa_eval_callback = PlotCallback(EVAL_FREQ)

    train_ppo_maxa = ppo_maxa.learn(
        total_timestamp=Total_timesteps, 
        eval_callback=train_ppo_maxa_eval_callback
    )

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_ppo_maxa_result = (train_ppo_maxa_eval_callback.get_data(), "PPO_MAXA")

    algorithm_results.append(train_ppo_maxa_result)
    save_eval_log(train_ppo_maxa_result, f"{N_CS}/PPO_MAXA")

In [26]:
if run_ppo_maxa: 
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_ppo_maxa_result = test_envs(ppo_maxa, "PPO_MAXA")
    save_test_log(test_ppo_maxa_result, f"{N_CS}/PPO_MAXA")

##### PPO_TANH

In [27]:
from algorithms.Origin_PPO_TANH import Origin_PPO_TANH as PPO_TANH

# The agent is constructed unconditionally; it is only trained/tested when
# "PPO_TANH" is selected in `algorithms`.
ppo_tanh = PPO_TANH(
    env=env,
    actor_lr=0.00003,
    critic_lr=0.00003,
    gamma=0.99,
    epochs=10,
    eps_clip=0.2,
    rollout_length=2048,
    seed=SEED,
    device=device,
    init_weight=init_weights if INIT_WEIGHTS else None
)

# Gates the PPO_TANH training and test cells.
run_ppo_tanh = "PPO_TANH" in algorithms

In [28]:
if run_ppo_tanh:

    # Buckets the training rewards per EVAL_FREQ window and redraws the curve.
    train_ppo_tanh_eval_callback = PlotCallback(EVAL_FREQ)

    train_ppo_tanh = ppo_tanh.learn(
        total_timestamp=Total_timesteps, 
        eval_callback=train_ppo_tanh_eval_callback
    )

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_ppo_tanh_result = (train_ppo_tanh_eval_callback.get_data(), "PPO_TANH")

    algorithm_results.append(train_ppo_tanh_result)
    save_eval_log(train_ppo_tanh_result, f"{N_CS}/PPO_TANH")

In [29]:
if run_ppo_tanh:
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_ppo_tanh_result = test_envs(ppo_tanh, "PPO_TANH")
    save_test_log(test_ppo_tanh_result, f"{N_CS}/PPO_TANH")

##### TRPO

In [30]:
from algorithms.Origin_TRPO import Origin_TRPO as TRPO

# The agent is constructed unconditionally; it is only trained/tested when
# "TRPO" is selected in `algorithms`. Unlike the PPO variants above, no
# actor_lr / epochs / eps_clip arguments are passed here.
trpo = TRPO(
    env=env,
    critic_lr=0.00003,
    gamma=0.99,
    rollout_length=2048,
    seed=SEED,
    device=device,
    init_weight=init_weights if INIT_WEIGHTS else None
)

# Gates the TRPO training and test cells.
run_trpo = "TRPO" in algorithms

In [None]:
if run_trpo:

    # Buckets the training rewards per EVAL_FREQ window and redraws the curve.
    train_trpo_eval_callback = PlotCallback(EVAL_FREQ)

    train_trpo = trpo.learn(
        total_timestamp=Total_timesteps, 
        eval_callback=train_trpo_eval_callback
    )

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_trpo_result = (train_trpo_eval_callback.get_data(), "TRPO")

    algorithm_results.append(train_trpo_result)
    save_eval_log(train_trpo_result, f"{N_CS}/TRPO")

In [None]:
if run_trpo:
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_trpo_result = test_envs(trpo, "TRPO")
    save_test_log(test_trpo_result, f"{N_CS}/TRPO")

##### SB3 TRPO

In [54]:
from sb3_contrib import TRPO as SB3TRPO

# The agent is constructed unconditionally; it is only trained/tested when
# "SB3_TRPO" is selected. The custom policy class is used only when
# INIT_WEIGHTS is set, otherwise the stock "MlpPolicy".
sb3trpo = SB3TRPO(
    CustomActorCriticPolicy if INIT_WEIGHTS else "MlpPolicy", 
    env, 
    verbose=0, 
    device=device, 
    seed=SEED)
# Gates the SB3_TRPO training and test cells.
run_sb3trpo = "SB3_TRPO" in algorithms

In [None]:
if run_sb3trpo:
    # Buckets the training rewards per EVAL_FREQ window, redraws the curve and prints progress.
    train_sb3trpo_callback = SBCallback(
        total_timesteps=Total_timesteps,
        eval_freq=EVAL_FREQ,
    )

    sb3trpo.learn(total_timesteps=Total_timesteps, callback=train_sb3trpo_callback)

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_sb3trpo_result = (train_sb3trpo_callback.get_data(), "SB3_TRPO")

    algorithm_results.append(train_sb3trpo_result)
    save_eval_log(train_sb3trpo_result, f"{N_CS}/SB3_TRPO")

In [None]:
if run_sb3trpo:
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_sb3trpo_result = test_envs(sb3trpo, "SB3_TRPO")
    save_test_log(test_sb3trpo_result, f"{N_CS}/SB3_TRPO")

##### SB3 SAC

In [57]:
from stable_baselines3 import SAC as SB3SAC

# The agent is constructed unconditionally; it is only trained/tested when
# "SB3_SAC" is selected. The custom policy class is used only when
# INIT_WEIGHTS is set, otherwise the stock "MlpPolicy".
# train_freq / gradient_steps / target_update_interval are SB3 SAC
# hyperparameters — see the SB3 SAC documentation for their exact meaning.
sb3sac = SB3SAC(
                CustomSACPolicy if INIT_WEIGHTS else "MlpPolicy",
                env, 
                verbose=0, 
                device=device, 
                seed=SEED, 
                train_freq=(2048, "step"),
                gradient_steps=100,
                target_update_interval=100,
                )

# Gates the SB3_SAC training and test cells.
run_sb3sac = "SB3_SAC" in algorithms

In [58]:
if run_sb3sac:
    # Buckets the training rewards per EVAL_FREQ window, redraws the curve and prints progress.
    train_sb3sac_callback = SBCallback(
        total_timesteps=Total_timesteps,
        eval_freq=EVAL_FREQ,
    )

    sb3sac.learn(total_timesteps=Total_timesteps, callback=train_sb3sac_callback)

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_sb3sac_result = (train_sb3sac_callback.get_data(), "SB3_SAC")

    algorithm_results.append(train_sb3sac_result)
    save_eval_log(train_sb3sac_result, f"{N_CS}/SB3_SAC")

In [59]:
if run_sb3sac:
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_sb3sac_result = test_envs(sb3sac, "SB3_SAC")
    save_test_log(test_sb3sac_result, f"{N_CS}/SB3_SAC")

##### SB3 TQC

In [60]:
from sb3_contrib import TQC as SB3TQC

# The agent is constructed unconditionally; it is only trained/tested when
# "SB3_TQC" is selected. The custom policy class is used only when
# INIT_WEIGHTS is set, otherwise the stock "MlpPolicy".
# train_freq / gradient_steps / target_update_interval / stats_window_size are
# SB3 hyperparameters — see the sb3-contrib TQC documentation for their meaning.
sb3tqc = SB3TQC(
                CustomTQCPolicy if INIT_WEIGHTS else "MlpPolicy",
                env, 
                verbose=0, 
                device=device, 
                seed=SEED,
                train_freq=(2048, "step"),
                gradient_steps=25,
                target_update_interval=100,
                stats_window_size=1000
                )

# Gates the SB3_TQC training and test cells.
run_sb3tqc = "SB3_TQC" in algorithms

In [61]:
if run_sb3tqc:
    # Buckets the training rewards per EVAL_FREQ window, redraws the curve and prints progress.
    train_sb3tqc_callback = SBCallback(
        total_timesteps=Total_timesteps,
        eval_freq=EVAL_FREQ,
    )

    sb3tqc.learn(total_timesteps=Total_timesteps, callback=train_sb3tqc_callback)

    # (reward curve, label) pair — the shape draw_rewards_graphs expects.
    train_sb3tqc_result = (train_sb3tqc_callback.get_data(), "SB3_TQC")

    algorithm_results.append(train_sb3tqc_result)
    save_eval_log(train_sb3tqc_result, f"{N_CS}/SB3_TQC")

In [62]:
if run_sb3tqc:
    # Evaluate on TEST_ENV_COUNT newly built environments and hand the results to save_test_log.
    test_sb3tqc_result = test_envs(sb3tqc, "SB3_TQC")
    save_test_log(test_sb3tqc_result, f"{N_CS}/SB3_TQC")

# Draw All

In [63]:
# Labels of the curves to leave out of the combined plot.
ignore_list = []
# Number of consecutive points averaged into a single plotted point.
group_size = 10
# Number of leading points dropped from every curve before averaging.
skip = 0

def draw_rewards_graphs(rewards_list, title="Title"):
    """Plot the group-averaged reward curve of every algorithm on one figure.

    Each curve is trimmed by ``skip`` leading points and then averaged over
    consecutive windows of ``group_size`` points; a trailing incomplete
    window is not plotted. Curves whose label is in ``ignore_list`` are
    skipped.

    Args:
        rewards_list: Iterable of ``(values, label)`` pairs.
        title: Label of the x-axis.
    """
    plt.ylabel('reward')
    plt.xlabel(title)

    for entry in rewards_list:
        label = entry[1]
        if label in ignore_list:
            continue

        values = entry[0][skip:]
        full_groups = len(values) // group_size

        averaged = []
        for group_index in range(full_groups):
            window = values[group_index * group_size:(group_index + 1) * group_size]
            averaged.append(sum(window) / group_size)

        plt.plot(averaged, label=label)

    plt.legend()
    plt.show()

In [None]:
# Plot the averaged training curves collected in `algorithm_results` by the cells above.
draw_rewards_graphs(algorithm_results, title="Train")