In [1]:
import os, sys

# Put the parent of the current working directory on sys.path before the
# remaining imports run.
# NOTE(review): presumably this is what makes `kube_sim_gym` (imported further
# down in this cell) importable — confirm. The path depends on the directory
# the kernel was started from.
base_path = os.path.join(os.getcwd(), "..")
print(f"Base Path: {base_path}")
sys.path.append(base_path)

import stable_baselines3 as sb3
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.logger import configure
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

from datetime import datetime

import gym
from gym import spaces

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

from kube_sim_gym.envs import *

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [2]:
class Pr_Dataset(Dataset):
    """Tabular dataset loaded from a CSV whose trailing columns are the labels.

    The CSV is read with pandas, split deterministically into a train part and
    a held-out part, and converted to a single float32 tensor. Each item is an
    ``(input, label)`` pair of row slices from that tensor.

    Args:
        csv_path: Path of the CSV file to read.
        train: If True, keep the sampled ``train_frac`` share of the rows;
            otherwise keep the remaining rows.
        num_labels: Number of trailing columns used as the label vector.
            Defaults to 6, the width that was previously hard-coded.
        train_frac: Fraction of rows assigned to the train split (default 0.8).
        seed: ``random_state`` passed to ``DataFrame.sample`` so that the train
            and held-out instances agree on the split (default 42).

    Raises:
        ValueError: If ``num_labels`` is smaller than 1 or ``train_frac`` is
            outside ``[0, 1]``.
    """

    def __init__(self, csv_path, train=True, num_labels=6, train_frac=0.8, seed=42):
        # Validate before touching the file: a width of 0 would make
        # `[:, :-0]` empty and `[:, -0:]` the full row, silently swapping roles.
        if num_labels < 1:
            raise ValueError(f"num_labels must be >= 1, got {num_labels}")
        if not 0.0 <= train_frac <= 1.0:
            raise ValueError(f"train_frac must be within [0, 1], got {train_frac}")

        frame = pd.read_csv(csv_path)
        # Drop the row which has 0 for the last -2, -3 columns
        # frame = frame.drop(frame[(frame.iloc[:, -2] == 0) & (frame.iloc[:, -3] == 0)].index)

        # Sample once; the held-out split is whatever the sample did not pick.
        train_rows = frame.sample(frac=train_frac, random_state=seed)
        split = train_rows if train else frame.drop(train_rows.index)

        self.data = self.transform(split)
        self.input = self.data[:, :-num_labels]
        self.label = self.data[:, -num_labels:]

    def transform(self, data):
        """Convert a DataFrame to a float32 tensor of its values."""
        return torch.tensor(data.values, dtype=torch.float32)

    def __len__(self):
        """Number of rows in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at ``idx``."""
        return self.input[idx], self.label[idx]

In [3]:
import os

# Both splits are cut from the same CSV; the `train` flag selects which side
# of Pr_Dataset's deterministic split is kept.
data_path = os.path.join(base_path, "dataset", "data_step3.csv")
train_static_dataset, test_static_dataset = (
    Pr_Dataset(data_path, train=is_train) for is_train in (True, False)
)

# NOTE(review): shuffle=False means the training batches are visited in the
# same order every epoch — confirm this is intended.
train_static_dataloader, test_static_dataloader = (
    DataLoader(split, batch_size=64, shuffle=False)
    for split in (train_static_dataset, test_static_dataset)
)

In [4]:
# Peek at the first training batch only: print the tensor shapes and the first
# five rows. The inputs are bound to `inputs` rather than `input` so the
# builtin `input()` is not shadowed for the rest of the notebook.
for batch in train_static_dataloader:
    inputs, labels = batch
    print(inputs.shape, labels.shape)
    print(f"input1: {inputs[:5]}\nlabels: {labels[:5]}")
    break

torch.Size([64, 12]) torch.Size([64, 6])
input1: tensor([[0.8400, 0.9500, 0.7600, 1.0000, 0.9500, 0.8600, 0.9200, 0.7200, 0.7100,
         0.7000, 0.1100, 0.1400],
        [0.6200, 0.6500, 0.9200, 0.1200, 0.4500, 0.1900, 0.7600, 0.5400, 0.7400,
         0.8400, 0.2500, 0.2400],
        [0.0900, 0.1100, 0.3800, 0.2100, 0.5300, 0.8700, 0.4300, 0.1100, 0.5600,
         0.4900, 0.1100, 0.0000],
        [0.9600, 0.7100, 0.9500, 0.8200, 0.8000, 1.0000, 0.8800, 0.8600, 0.8600,
         0.9700, 0.1200, 0.2400],
        [0.6800, 0.6400, 0.3700, 0.3900, 0.5700, 0.9800, 0.3400, 0.0400, 0.8700,
         0.7200, 0.0100, 0.0400]])
labels: tensor([[-0.0800, -0.5200, -0.6500, -0.5000, -0.6100, -0.0800],
        [-0.1600, -0.2000, -1.2900, -0.4000, -0.7100, -0.5900],
        [-0.1700, -0.2500, -0.4500, -0.4000, -0.6000, -0.3600],
        [-0.0600, -0.6400, -0.5200, -0.5900, -0.4100, -0.5000],
        [-0.1900, -0.2000, -0.2400, -0.9300, -0.4600, -0.3100]])


In [5]:
# Training environment, created with the 'train_step_3.py' reward file and the
# 'random' scenario.
# NOTE(review): 'SimKubeEnv-v0' is presumably registered by the
# `kube_sim_gym.envs` star-import in the first cell — confirm.
env = gym.make('SimKubeEnv-v0', reward_file='train_step_3.py', scenario_file='random')

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [6]:
# Build a DQN with default policy settings and display its policy so the
# network layout created for this env can be inspected.
model = sb3.DQN('MlpPolicy', env)
model.policy

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=12, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=6, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=12, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=6, bias=True)
    )
  )
)

In [7]:
class FlattenExtractor(BaseFeaturesExtractor):
    """Features extractor whose only operation is flattening the observation.

    No learned layer is applied, so the output width is whatever the flattened
    observation has.
    NOTE(review): `features_dim` is forwarded to the base class unchanged and
    is presumably expected to equal that flattened width — confirm.

    Args:
        observation_space: Observation space handed to the base class.
        features_dim: Feature width reported to the base class.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int):
        super().__init__(observation_space, features_dim)
        # Pure reshape; a Linear projection was tried here and left disabled.
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Return `x` flattened by ``nn.Flatten``."""
        return self.flatten(x)

In [8]:
# Policy configuration shared by both DQN instances below: the custom
# flatten-only extractor (reporting 12 features) followed by two hidden layers
# of 64 units.
policy_kwargs = {
    "features_extractor_class": FlattenExtractor,
    "features_extractor_kwargs": {"features_dim": 12},
    "net_arch": [64, 64],
}

# Two separately constructed models with the same configuration. The names
# suggest one is kept as an untrained baseline; later cells decide how each
# is used.
rl_model_untrained = sb3.DQN('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)
rl_model = sb3.DQN('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)

rl_model.policy

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=12, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=6, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=12, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=6, bias=True)
    )
  )
)

In [9]:
class DQN_net(nn.Module):
    """Thin ``nn.Module`` wrapper around an existing model's Q-network.

    The wrapper keeps a reference to ``original_model.policy.q_net`` (no copy
    is made), so optimising this module's parameters updates that network in
    place.

    Args:
        original_model: Object exposing ``policy.q_net``.
    """

    def __init__(self, original_model):
        super().__init__()
        self.q_net = original_model.policy.q_net

    def forward(self, x):
        """Return the wrapped Q-network's output for `x`."""
        return self.q_net(x)

In [10]:
# Wrap rl_model's Q-network for supervised training. DQN_net stores
# `rl_model.policy.q_net` by reference, so training `dqn_net` modifies
# rl_model's network directly.
dqn_net = DQN_net(rl_model)

In [11]:
# Supervised training setup: mean-squared-error loss between the network output
# and the dataset labels, optimised with Adam over the wrapped Q-network's
# parameters.
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(dqn_net.parameters(), lr=0.001)

In [12]:
def train(model, train_loader, criterion, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called on each batch of states.
        train_loader: Iterable of ``(state, target)`` batches; must expose
            ``.dataset`` so the totals can be normalised by its length.
        criterion: Loss callable taking ``(output, target)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(loss, accuracy)``: the batch losses weighted by batch size and
        divided by the dataset length, and the percentage of samples whose
        output argmax matches the target argmax.
    """
    model.train()
    loss_sum = 0
    hits = 0
    for state, target in train_loader:
        optimizer.zero_grad()
        output = model(state)
        batch_loss = criterion(output, target)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so a short final batch is not
        # over-represented in the average.
        loss_sum += batch_loss.item() * state.size(0)
        same_argmax = output.argmax(dim=1) == target.argmax(dim=1)
        hits += same_argmax.sum().item()
    n_samples = len(train_loader.dataset)
    return loss_sum / n_samples, 100. * hits / n_samples

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for state, target in test_loader:
            output = model(state)
            test_loss += criterion(output, target).item() * state.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

In [13]:
# Supervised pre-training of the wrapped Q-network on the static dataset:
# at most `epochs` passes, each followed by an evaluation on the held-out
# split, stopping early once test accuracy exceeds 98%.
epochs = 30
test_acc = 0
for epoch in range(1, epochs+1):
    train_loss, train_acc = train(dqn_net, train_static_dataloader, criterion, optimizer)
    test_loss, test_acc = test(dqn_net, test_static_dataloader, criterion)
    print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
    if test_acc > 98:
        break

Epoch 1: Train Loss: 0.0121, Train Acc: 93.49%, Test Loss: 0.0072, Test Acc: 95.97%
Epoch 2: Train Loss: 0.0064, Train Acc: 94.66%, Test Loss: 0.0060, Test Acc: 95.98%
Epoch 3: Train Loss: 0.0056, Train Acc: 94.38%, Test Loss: 0.0054, Test Acc: 95.84%
Epoch 4: Train Loss: 0.0051, Train Acc: 94.09%, Test Loss: 0.0049, Test Acc: 95.87%
Epoch 5: Train Loss: 0.0044, Train Acc: 93.91%, Test Loss: 0.0040, Test Acc: 95.98%
Epoch 6: Train Loss: 0.0036, Train Acc: 94.07%, Test Loss: 0.0035, Test Acc: 95.98%
Epoch 7: Train Loss: 0.0031, Train Acc: 94.28%, Test Loss: 0.0030, Test Acc: 95.24%
Epoch 8: Train Loss: 0.0026, Train Acc: 94.37%, Test Loss: 0.0025, Test Acc: 95.78%
Epoch 9: Train Loss: 0.0023, Train Acc: 94.44%, Test Loss: 0.0021, Test Acc: 96.19%
Epoch 10: Train Loss: 0.0020, Train Acc: 94.60%, Test Loss: 0.0018, Test Acc: 95.77%
Epoch 11: Train Loss: 0.0018, Train Acc: 94.74%, Test Loss: 0.0016, Test Acc: 95.35%
Epoch 12: Train Loss: 0.0016, Train Acc: 94.88%, Test Loss: 0.0014, Test A

In [14]:
# Show the flattened parameters of the wrapper's network next to rl_model's.
# DQN_net keeps `rl_model.policy.q_net` by reference; `rl_model.q_net` is
# presumably the same module, which would explain identical vectors here.
# NOTE(review): the optimizer above was built from dqn_net.parameters() only,
# so q_net_target is not updated by the supervised training cells — confirm
# whether it should be synced with q_net before rl_model.learn() is called.
dqn_net.q_net.parameters_to_vector(), rl_model.q_net.parameters_to_vector()

(array([-0.00921137,  0.00387826,  0.00025387, ..., -0.04905361,
        -0.13919872,  0.01619057], dtype=float32),
 array([-0.00921137,  0.00387826,  0.00025387, ..., -0.04905361,
        -0.13919872,  0.01619057], dtype=float32))

In [15]:
# rl_model.policy.q_net.load_state_dict(dqn_net.q_net.state_dict())
# rl_model.policy.q_net_target.load_state_dict(dqn_net.q_net.state_dict())

In [15]:
def init_eval_env():
    """Create the five evaluation environments.

    Returns:
        A list of five ``SimKubeEnv-v0`` environments, one per reward file, in
        the order: train_step_3, eval_rur, eval_rbd1, eval_rbd2, eval_ct.
    """
    # One (reward_file, scenario_file) pair per environment, in return order.
    eval_specs = (
        ('train_step_3.py', 'scenario-5l-5m-1000p-10m_unbalanced.csv'),
        ('eval_rur.py', 'scenario-5l-5m-1000p-10m_unbalanced.csv'),
        # NOTE(review): this is the only entry using the 10000p scenario; the
        # others use 1000p — confirm this is intentional and not a typo.
        ('eval_rbd1.py', 'scenario-5l-5m-10000p-10m_unbalanced.csv'),
        ('eval_rbd2.py', 'scenario-5l-5m-1000p-10m_unbalanced.csv'),
        ('eval_ct.py', 'scenario-5l-5m-1000p-10m_unbalanced.csv'),
    )
    return [
        gym.make("SimKubeEnv-v0", reward_file=reward, scenario_file=scenario)
        for reward, scenario in eval_specs
    ]

def eval_model(model, eval_envs):
    """Evaluate ``model`` once on each of the first five environments.

    For every environment a banner line is printed, ``evaluate_policy`` is run
    for a single deterministic episode, and the resulting mean and standard
    deviation are printed and collected.

    Args:
        model: Model passed to ``evaluate_policy``.
        eval_envs: Indexable collection holding at least five environments.

    Returns:
        A flat list ``[mean_0, std_0, mean_1, std_1, ..., mean_4, std_4]``.
    """
    # (banner line, label used in the result line), index-aligned with eval_envs.
    reports = (
        ('Evaluation : train_step_3', 'mean_reward'),
        ('Evaluation : eval_rur', 'mean_reward'),
        ('Evaluation : eval_rbd1', 'mean_reward'),
        ('Evaluation : eval_rbd2', 'mean_reward'),
        ('Episode length :', 'Episode length'),
    )

    ret = []
    for idx, (banner, label) in enumerate(reports):
        print(banner)
        mean_reward, std_reward = evaluate_policy(
            model, eval_envs[idx], n_eval_episodes=1, deterministic=True
        )
        print(f"{label}:{mean_reward:.2f} +/- {std_reward:.2f}")
        ret.extend((mean_reward, std_reward))
    return ret

In [17]:
# rl_model.learn(500000)

In [18]:
# rl_model_untrained.learn(500000)

In [16]:
# Build the evaluation environments once; the evaluation cells below reuse
# this same list for every model.
eval_envs = init_eval_env()

In [None]:
# Evaluate rl_model before any RL training, then alternate 10 rounds of
# 100000 training timesteps with a full evaluation.
# `eval_log` is overwritten on every round, so only the last evaluation
# result remains in the variable; earlier ones exist only as printed output.
eval_log = eval_model(rl_model, eval_envs)

for i in range(10):
    print(f"Training {i}th iteration...")

    rl_model.learn(total_timesteps=100000, log_interval=10000)
    print(f"{i}th training done")
    eval_log = eval_model(rl_model, eval_envs)

In [18]:
# Same evaluate / train / evaluate schedule, applied to rl_model_untrained
# (the model that was not wrapped by DQN_net for supervised training).
# `eval_log` is reused and overwritten here as well, so it no longer holds
# any result from the rl_model run.
eval_log = eval_model(rl_model_untrained, eval_envs)

for i in range(10):
    print(f"Training {i}th iteration...")

    rl_model_untrained.learn(total_timesteps=100000, log_interval=10000)
    print(f"{i}th training done")
    eval_log = eval_model(rl_model_untrained, eval_envs)

Evaluation : train_step_3
mean_reward:-2811.66 +/- 0.00
Evaluation : eval_rur
mean_reward:719.48 +/- 0.00
Evaluation : eval_rbd1
mean_reward:6240.91 +/- 0.00
Evaluation : eval_rbd2
mean_reward:3373.64 +/- 0.00
Episode length :
Episode length:4002.00 +/- 0.00
Training 0th iteration...
0th training done
Evaluation : train_step_3
mean_reward:-544.64 +/- 0.00
Evaluation : eval_rur
mean_reward:1450.18 +/- 0.00
Evaluation : eval_rbd1
mean_reward:11346.44 +/- 0.00
Evaluation : eval_rbd2
mean_reward:1805.85 +/- 0.00
Episode length :
Episode length:1927.00 +/- 0.00
Training 1th iteration...
1th training done
Evaluation : train_step_3
mean_reward:-407.11 +/- 0.00
Evaluation : eval_rur
mean_reward:1437.37 +/- 0.00
Evaluation : eval_rbd1
mean_reward:11729.48 +/- 0.00
Evaluation : eval_rbd2
mean_reward:1638.33 +/- 0.00
Episode length :
Episode length:1725.00 +/- 0.00
Training 2th iteration...
2th training done
Evaluation : train_step_3
mean_reward:-391.28 +/- 0.00
Evaluation : eval_rur
mean_reward:

In [None]:
# Rebind `model` to a fresh DQN with default policy settings (no custom
# policy_kwargs) and train it for 300000 timesteps in one call.
model = sb3.DQN('MlpPolicy', env, verbose=1)

model.learn(300000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.08e+03 |
|    exploration_rate | 0.493     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 3210      |
|    time_elapsed     | 4         |
|    total_timesteps  | 16008     |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    ep_rew_mean      | -2.09e+03 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 3303      |
|    time_elapsed     | 9         |
|    total_timesteps  | 32016     |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 4e+03     |
|    e

<stable_baselines3.dqn.dqn.DQN at 0x7f9a9915f8e0>

In [None]:
# Evaluate the default-configuration DQN on the same evaluation environments;
# the returned list is displayed as the cell output.
eval_model(model, eval_envs)