In [1]:
import os, sys

base_path = os.path.join(os.getcwd(), "..")
print(f"Base Path: {base_path}")
sys.path.append(base_path)

import stable_baselines3 as sb3
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.logger import configure
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from stable_baselines3 import PPO, A2C, SAC, TD3, DQN, HER, DDPG

from datetime import datetime

import gym
from gym import spaces

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

from kube_sim_gym.envs import *

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [2]:
class Pr_Dataset(Dataset):
    def __init__(self, csv_path, portion=0.5 ,train=True):
        self.data = pd.read_csv(csv_path)
        # Resize the data to the portion
        self.data = self.data.iloc[:int(len(self.data) * portion), :]

        # Drop the row which has 0 for the last -2, -3 columns
        # self.data = self.data.drop(self.data[(self.data.iloc[:, -2] == 0) & (self.data.iloc[:, -3] == 0)].index)

        if train:
            self.data = self.data.sample(frac=0.8, random_state=42)
        else:
            self.data = self.data.drop(self.data.sample(frac=0.8, random_state=42).index)

        self.data = self.transform(self.data)
        self.input = self.data[:, :12]
        self.label = self.data[:, 12:13].long()

    def transform(self, data):
        return torch.tensor(data.values, dtype=torch.float32)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.input[idx], self.label[idx]

In [3]:
import os

data_path = os.path.join(base_path, "dataset", "data_expert.csv")
train_static_dataset = Pr_Dataset(data_path, 0.5, train=True)
test_static_dataset = Pr_Dataset(data_path, 0.5, train=False)
train_static_dataloader = DataLoader(train_static_dataset, batch_size=64, shuffle=False)
test_static_dataloader = DataLoader(test_static_dataset, batch_size=64, shuffle=False)

In [4]:
for batch in train_static_dataloader:
    input, labels = batch
    print(input.shape, labels.shape)
    print(f"input1: {input[:5]}\nlabels: {labels[:5]}")
    break

torch.Size([64, 12]) torch.Size([64, 1])
input1: tensor([[0.0800, 0.0000, 0.0900, 0.0600, 0.2000, 0.1400, 0.0300, 0.0000, 0.2300,
         0.0300, 0.0400, 0.0500],
        [0.0200, 0.0500, 0.1900, 0.0300, 0.2100, 0.0300, 0.2300, 0.0300, 0.1500,
         0.0400, 0.0800, 0.1300],
        [0.0000, 0.0400, 0.0200, 0.0000, 0.0200, 0.0500, 0.1000, 0.4500, 0.0300,
         0.1600, 0.1500, 0.0800],
        [0.0800, 0.1000, 0.0900, 0.1800, 0.2800, 0.1600, 0.1200, 0.1100, 0.1400,
         0.0800, 0.0300, 0.0500],
        [0.0600, 0.0000, 0.0400, 0.2300, 0.0300, 0.1100, 0.0700, 0.1100, 0.1100,
         0.1000, 0.1100, 0.1200]])
labels: tensor([[3],
        [0],
        [0],
        [3],
        [0]])


In [5]:
env = gym.make('SimKubeEnv-v0', reward_file='train_step_3.py', scenario_file='random')

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [6]:
model_untrained = sb3.DQN('MlpPolicy', env, verbose=1)
model_pretrained = sb3.DQN('MlpPolicy', env, verbose=1, exploration_initial_eps=0.05, exploration_final_eps=0.03)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [7]:
def init_eval_env():
    # Prepare Eval ENV & Callback
    eval_env0 = gym.make("SimKubeEnv-v0", reward_file='train_step_3.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_env1 = gym.make("SimKubeEnv-v0", reward_file='eval_rur.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_env2 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd1.py', scenario_file='scenario-5l-5m-10000p-10m_unbalanced.csv')
    eval_env3 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd2.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_env4 = gym.make("SimKubeEnv-v0", reward_file='eval_ct.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')

    return [eval_env0, eval_env1, eval_env2, eval_env3, eval_env4]

def eval_model(model, eval_envs, verbose=False):
    ret = []
    print('Evaluation : train_step_3') if verbose else None
    mean_reward, std_reward = evaluate_policy(model, eval_envs[0], n_eval_episodes=1, deterministic=True)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") if verbose else None
    ret += [mean_reward, std_reward]

    print('Evaluation : eval_rur')  if verbose else None
    mean_reward, std_reward = evaluate_policy(model, eval_envs[1], n_eval_episodes=1, deterministic=True)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") if verbose else None
    ret += [mean_reward, std_reward]

    print('Evaluation : eval_rbd1') if verbose else None
    mean_reward, std_reward = evaluate_policy(model, eval_envs[2], n_eval_episodes=1, deterministic=True)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") if verbose else None
    ret += [mean_reward, std_reward]

    print('Evaluation : eval_rbd2') if verbose else None
    mean_reward, std_reward = evaluate_policy(model, eval_envs[3], n_eval_episodes=1, deterministic=True)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") if verbose else None
    ret += [mean_reward, std_reward]

    print('Episode length :') if verbose else None
    mean_reward, std_reward = evaluate_policy(model, eval_envs[4], n_eval_episodes=1, deterministic=True)
    print(f"Episode length:{mean_reward:.2f} +/- {std_reward:.2f}") if verbose else None
    ret += [mean_reward, std_reward]

    return ret

In [8]:
def pretrain_agent(
    student,
    batch_size=64,
    epochs=1000,
    scheduler_gamma=0.7,
    learning_rate=1.0,
    log_interval=1000,
    no_cuda=True,
    seed=1,
    test_batch_size=64,
):
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()
    # criterion = nn.MSELoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            target = target.squeeze(-1)

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                action_prediction = model.q_net(data)
                # target = model.q_net_target(data)
                target = target.long()
                # if isinstance(student, (DQN)):
                #     action_prediction = model.q_net(data)
                #     # target = model.q_net_target(data)
                #     target = target.long()
                # else:
                #   # Retrieve the logits for A2C/PPO when using discrete actions
                #   dist = model.get_distribution(data)
                #   action_prediction = dist.distribution.logits
                #   target = target.long()

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    if isinstance(student, (DQN)):
                        action_prediction = model.q_net(data)
                        target = model.q_net_target(data)
                    else:
                        # Retrieve the logits for A2C/PPO when using discrete actions
                        dist = model.get_distribution(data)
                        action_prediction = dist.distribution.logits
                        target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # # and testing
    # train_loader = th.utils.data.DataLoader(
    #     dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    # )
    # test_loader = th.utils.data.DataLoader(
    #     dataset=test_expert_dataset,
    #     batch_size=test_batch_size,
    #     shuffle=True,
    #     **kwargs,
    # )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_static_dataloader, optimizer)
        test(model, device, test_static_dataloader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    model_pretrained.policy = model

In [9]:
eval_envs = init_eval_env()

In [10]:
untrained_result = eval_model(model_untrained, eval_envs)
pretrained_result = eval_model(model_pretrained, eval_envs)

print('Untrained result:', untrained_result)
print('Pretrained result:', pretrained_result)



Untrained result: [-2820.3200046904385, 0.0, 733.8700128365308, 0.0, 6226.279976069927, 0.0, 3360.8299952447414, 0.0, 4002.0, 0.0]
Pretrained result: [-2811.660005379468, 0.0, 719.4800149668008, 0.0, 6240.909976303577, 0.0, 3373.640003889799, 0.0, 4002.0, 0.0]


In [15]:
eval_env0 = gym.make("SimKubeEnv-v0", reward_file='train_step_3.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
eval_env1 = gym.make("SimKubeEnv-v0", reward_file='eval_rur.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
eval_env2 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd1.py', scenario_file='scenario-5l-5m-10000p-10m_unbalanced.csv')
eval_env3 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd2.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
eval_env4 = gym.make("SimKubeEnv-v0", reward_file='eval_ct.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')

In [19]:
eval_callback = EvalCallback(eval_env2, eval_freq=50000, deterministic=True, render=False, verbose=1) # Change eval_env!

model_test = sb3.DQN("MlpPolicy", env, verbose=1) 

pretrain_agent(
    model_test,
    epochs=5,
    scheduler_gamma=0.7,
    learning_rate=1.0,
    log_interval=10000,
    no_cuda=True,
    seed=1,
    batch_size=64,
    test_batch_size=1000,
)
# model_pretrained.save("a2c_student")

model_test.learn(total_timesteps=1000000, callback=eval_callback)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Test set: Average loss: 0.0000
Test set: Average loss: 0.0000
Test set: Average loss: 0.0000
Test set: Average loss: 0.0000
Test set: Average loss: 0.0000
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.13e+03 |
|    exploration_rate | 0.848     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 3366      |
|    time_elapsed     | 4         |
|    total_timesteps  | 16008     |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.12e+03 |
|    exploration_rate | 0.696     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 3391      |
|    time_elapsed     | 9         |
|    total_timesteps  | 32016  



Eval num_timesteps=50000, episode_reward=11771.09 +/- 0.00
Episode length: 13002.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.3e+04  |
|    mean_reward      | 1.18e+04 |
| rollout/            |          |
|    exploration_rate | 0.525    |
| time/               |          |
|    total_timesteps  | 50000    |
----------------------------------
New best mean reward!
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.06e+03 |
|    exploration_rate | 0.392     |
| time/               |           |
|    episodes         | 16        |
|    fps              | 323       |
|    time_elapsed     | 197       |
|    total_timesteps  | 64032     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 6.19      |
|    n_updates        | 3507      |
-----------------------------------
------------------------------

<stable_baselines3.dqn.dqn.DQN at 0x7fcf652f45b0>

In [12]:
pretrained_result = eval_model(model_pretrained, eval_envs, True)

print('Pretrained result:', pretrained_result)

Evaluation : train_step_3
mean_reward:-188.44 +/- 0.00
Evaluation : eval_rur
mean_reward:1425.31 +/- 0.00
Evaluation : eval_rbd1
mean_reward:11845.16 +/- 0.00
Evaluation : eval_rbd2
mean_reward:1509.48 +/- 0.00
Episode length :
Episode length:1606.00 +/- 0.00
Pretrained result: [-188.4399991016835, 0.0, 1425.3099972177297, 0.0, 11845.16000956297, 0.0, 1509.4800036549568, 0.0, 1606.0, 0.0]


In [13]:
model_pretrained.exploration_initial_eps = 0.05
model_pretrained.exploration_final_eps = 0.02

model_pretrained.learn(500000)

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.11e+03 |
|    exploration_rate | 0.0436    |
| time/               |           |
|    episodes         | 4         |
|    fps              | 2254      |
|    time_elapsed     | 7         |
|    total_timesteps  | 16008     |
-----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4e+03    |
|    ep_rew_mean      | -2.1e+03 |
|    exploration_rate | 0.0372   |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2233     |
|    time_elapsed     | 14       |
|    total_timesteps  | 32016    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4e+03    |
|    ep_rew_mean      | -2.1e+03 |
|    exploration_rate | 0.0308   |
| time/               |          |
|    epis

KeyboardInterrupt: 