## Prep

In [1]:
import os, sys

# Make the parent of the notebook's working directory importable for the
# project-local packages used below.
base_path = os.path.join(os.getcwd(), os.pardir)
print(f"Base Path: {base_path}")
sys.path.append(base_path)

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [2]:
import stable_baselines3 as sb3
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.logger import configure

from datetime import datetime

import gym
from gym import spaces

import torch
import torch.nn as nn
import torch.nn.functional as F

from kube_sim_gym.envs import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hand-written probe observations, each shaped (1, 12), for smoke-testing the networks below.
# Layout follows the split used by FE_MM_net: first 10 values = node status, last 2 = pod quota.
sample1 = torch.tensor([
    [0.99, 0.90, 0.80, 0.80, 0.95, 0.95, 0.90, 0.85, 0.0, 0.0, 0.0, 0.0],
])
sample2 = torch.tensor([
    [0.99, 0.90, 0.80, 0.80, 0.95, 0.95, 0.90, 0.85, 0.0, 0.0, 0.6, 0.7],
])
sample3 = torch.tensor([
    [0.99, 0.90, 0.40, 0.40, 0.15, 0.15, 0.90, 0.85, 0.8, 0.8, 0.6, 0.7],
])

## RL Training utils

In [4]:
def test_rl_model(scenario_file, rl_model):
    """Run ``rl_model`` and the default scheduler on the same scenario and compare them.

    Two independent ``SimKubeEnv-v0`` environments are created from
    ``scenario_file`` (both with the ``train_dynamic.py`` reward file). The first
    is driven by ``rl_model.predict``; the second by ``SimHrScheduler`` configured
    with ``default.py``. Each side keeps stepping until its own episode is done.

    Side effects: ``rl_model.set_env`` is called with the first test environment,
    and both environments are closed before returning. Callers must attach a
    training environment again (``set_env``) before calling ``learn``.

    Args:
        scenario_file: scenario file name passed to ``gym.make``.
        rl_model: model exposing ``set_env`` and ``predict`` (e.g. SB3 PPO/DQN).

    Returns:
        ``(acc_rew1, acc_rew2, step1, step2)``: accumulated reward (rounded to 2
        decimals) of the RL model and of the default scheduler, followed by the
        number of steps each one took.
    """
    # Imported lazily, as in the original cell.
    from kube_hr_scheduler.scheduler.sim_hr_scheduler import SimHrScheduler

    test_env1 = gym.make('SimKubeEnv-v0', reward_file='train_dynamic.py', scenario_file=scenario_file)
    test_env2 = gym.make('SimKubeEnv-v0', reward_file='train_dynamic.py', scenario_file=scenario_file)

    try:
        # RL Scheduler
        rl_model.set_env(test_env1)

        # Default Scheduler
        default_scheduler = SimHrScheduler(test_env2, 'default.py')

        obs1 = test_env1.reset()
        test_env2.reset()
        done1 = False
        done2 = False
        step1 = 0
        step2 = 0
        acc_rew1 = 0
        acc_rew2 = 0

        print(f"Testing with {scenario_file} (my model vs. default)")
        while not done1 or not done2:
            if not done1:
                # NOTE(review): predict() is called without deterministic=True, so
                # actions are sampled and results vary run to run - confirm intent.
                action1, _ = rl_model.predict(obs1)
                obs1, reward1, done1, _ = test_env1.step(action1)
                step1 += 1
                acc_rew1 += reward1
            if not done2:
                action2 = default_scheduler.decision(test_env2)
                _, reward2, done2, _ = test_env2.step(action2)
                step2 += 1
                acc_rew2 += reward2
    finally:
        # This function runs several times per training iteration; release the
        # environments instead of leaking two per call.
        test_env1.close()
        test_env2.close()

    acc_rew1 = round(acc_rew1, 2)
    acc_rew2 = round(acc_rew2, 2)

    print(f"Test result(reward): {acc_rew1} vs. {acc_rew2}")
    print(f"Test result(step): {step1} vs. {step2}")

    return acc_rew1, acc_rew2, step1, step2

In [5]:
from IPython.display import clear_output
from notebook.net_arch import *
import glob

def _checkpoint_timestamp(path):
    """Parse the ``_%m%d%Y%H%M`` suffix of a checkpoint path; unparsable names sort first."""
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return datetime.strptime(stem.rsplit('_', 1)[-1], "%m%d%Y%H%M")
    except ValueError:
        return datetime.min


def train_rl_model(json_tracker_fname, num_traces=20):
    """Train (or resume training) a saved SB3 model over a sequence of trace scenarios.

    Progress is kept in the JSON tracker ``training/<json_tracker_fname>``, which
    must provide the keys ``last_idx`` (-1 when nothing has been trained yet),
    ``learning_steps``, ``model_type`` ('DQN' or 'PPO'), ``reward_file`` and
    ``model_fname``.

    The initial weights come from ``net_arch/<model_fname>.zip``. When the tracker
    shows earlier progress and checkpoints matching
    ``training/model/<model_fname>_*`` exist, the newest checkpoint (by the
    timestamp in its file name) is loaded instead.

    For each trace index the model is first evaluated on three fixed scenarios
    (one line is appended to ``training/log/<log_name>/test_result.txt``), then
    trained for ``learning_steps`` timesteps on ``trace2017_100_<idx + 1>.csv``,
    saved to ``training/model/<model_fname>_<timestamp>`` and the tracker is
    updated.

    Args:
        json_tracker_fname: tracker file name inside ``training/``.
        num_traces: training stops once the trace index reaches this value
            (default 20, the previously hard-coded target).

    Returns:
        None. Unknown model types and a missing base model are reported with
        ``print`` and end the function early.
    """
    import json  # local import kept from the original cell

    # Timestamp of *this* run, used as the checkpoint suffix (was a fixed date,
    # which gave every run the same checkpoint name).
    date = datetime.now().strftime("%m%d%Y%H%M")

    log_name = json_tracker_fname.split('.')[0]
    log_path = f'training/log/{log_name}'
    os.makedirs(log_path, exist_ok=True)

    logger = configure(log_path, ['stdout', 'csv', 'tensorboard'])

    # Load the json tracker
    tracker_path = f'training/{json_tracker_fname}'
    with open(tracker_path, 'r') as f:
        json_tracker = json.load(f)

    last_idx = json_tracker['last_idx']
    learning_steps = json_tracker['learning_steps']
    model_type = json_tracker['model_type']
    reward_file = json_tracker['reward_file']
    model_fname = json_tracker['model_fname']

    # Model type : DQN or PPO
    if model_type == 'DQN':
        model_cls = sb3.DQN
    elif model_type == 'PPO':
        model_cls = sb3.PPO
    else:
        print(f"Unknown model type: {model_type}")
        return

    model_fpath = f'net_arch/{model_fname}.zip'
    if not os.path.exists(model_fpath):
        print(f"Model file does not exist: {model_fname}")
        return

    # Resume from the newest checkpoint when earlier progress exists. The
    # %m%d%Y%H%M suffix does not sort chronologically as plain text across years,
    # so compare the parsed timestamps.
    checkpoints = glob.glob(f'training/model/{model_fname}_*')
    if last_idx != -1 and checkpoints:
        model_fpath = max(checkpoints, key=_checkpoint_timestamp)

    print(f"Loading the model from {model_fpath}")
    model = model_cls.load(model_fpath)

    # Checkpoint written after every trace: <model_fname>_<date>
    trained_model_fpath = f'training/model/{model_fname}_{date}'

    model.set_logger(logger)

    current_idx = last_idx + 1  # last_idx is -1 by default
    while current_idx < num_traces:
        print(f"Training with {current_idx}th trace")

        # Test the model first
        a1, a2, a3, a4 = test_rl_model('scenario-5l-5m-1000p-10m_unbalanced.csv', model)
        b1, b2, b3, b4 = test_rl_model('scenario-10l-3m-1000p-10m_unbalanced.csv', model)
        c1, c2, c3, c4 = test_rl_model('scenario-3l-10m-1000p-10m_unbalanced.csv', model)

        with open(f'{log_path}/test_result.txt', 'a') as f:
            f.write(f"{current_idx},{a1},{a2},{a3},{a4},{b1},{b2},{b3},{b4},{c1},{c2},{c3},{c4}\n")

        # Build only the environment needed for this trace (trace files are
        # numbered from 1) instead of creating all of them up front.
        env = gym.make(
            'SimKubeEnv-v0',
            reward_file=reward_file,
            scenario_file=f'trace2017_100_{current_idx + 1}.csv',
        )
        try:
            model.set_env(env)
            model.learn(total_timesteps=learning_steps)
        finally:
            env.close()

        # Save the model
        model.save(trained_model_fpath)

        # Update the json tracker
        json_tracker['last_idx'] = current_idx
        with open(tracker_path, 'w') as f:
            json.dump(json_tracker, f)

        current_idx += 1

        clear_output()

        



### Dataset

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

In [7]:
class Pr_Dataset(Dataset):
    """CSV-backed dataset split into input and label tensors.

    A fixed-seed (42) random 80 % sample of the rows forms the training split;
    the remaining rows form the test split. All columns except the last six are
    the input; the last six are the label, multiplied by 100.
    """

    def __init__(self, csv_path, train=True):
        frame = pd.read_csv(csv_path)
        # Disabled filter kept for reference:
        # drop the rows which have 0 in both the -2 and -3 columns
        # frame = frame.drop(frame[(frame.iloc[:, -2] == 0) & (frame.iloc[:, -3] == 0)].index)

        # The same seeded sample is drawn for either split, so the two never overlap.
        train_rows = frame.sample(frac=0.8, random_state=42)
        subset = train_rows if train else frame.drop(train_rows.index)

        self.data = self.transform(subset)
        self.input, raw_label = self.data[:, :-6], self.data[:, -6:]
        # Multiply by 100
        self.label = raw_label * 100

    def transform(self, data):
        """Convert a DataFrame to a float32 tensor."""
        return torch.tensor(data.values, dtype=torch.float32)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.input[idx], self.label[idx]

In [8]:
import os

# Build the train and test splits from the same CSV and wrap each one in a
# non-shuffling DataLoader with batches of 32.
data_path = os.path.join(base_path, "dataset", "data_2.csv")
train_pr_dataset, test_pr_dataset = (
    Pr_Dataset(data_path, train=is_train) for is_train in (True, False)
)
train_pr_dataloader = DataLoader(dataset=train_pr_dataset, batch_size=32, shuffle=False)
test_pr_dataloader = DataLoader(dataset=test_pr_dataset, batch_size=32, shuffle=False)

In [9]:
# Peek at the first training batch to sanity-check shapes and values.
# The tensors are bound to `inputs` (not `input`) so the built-in input() is not
# shadowed for the rest of the kernel session.
for inputs, labels in train_pr_dataloader:
    print(inputs.shape, labels.shape)
    print(f"input1: {inputs}\nlabels: {labels}")
    break

torch.Size([32, 12]) torch.Size([32, 6])
input1: tensor([[0.4000, 0.2700, 0.8800, 0.6200, 0.8200, 0.3000, 0.0200, 0.6800, 0.1100,
         0.8800, 0.2500, 0.4600],
        [0.3700, 0.1300, 0.0800, 0.5100, 0.6800, 0.5100, 0.6100, 0.2700, 0.1900,
         0.1300, 0.3700, 0.4300],
        [0.3100, 0.2000, 0.7700, 0.0400, 0.2000, 0.5300, 0.4000, 0.4500, 0.8100,
         0.9800, 0.4100, 0.4800],
        [0.6100, 0.4300, 0.0000, 0.8800, 0.1000, 0.7200, 0.6400, 0.2900, 0.4300,
         0.5300, 0.1700, 0.2100],
        [0.6300, 0.7300, 0.1000, 0.3700, 0.5500, 0.0300, 0.8800, 0.9600, 0.6100,
         0.3800, 0.0900, 0.2200],
        [0.0900, 0.9200, 0.5500, 0.1800, 0.1900, 0.2800, 0.4200, 0.8500, 0.3100,
         0.5900, 0.0000, 0.0000],
        [0.5800, 0.2700, 0.2100, 0.5500, 0.5000, 0.4200, 0.5500, 0.4900, 0.7500,
         0.9000, 0.1300, 0.0400],
        [0.3900, 0.0600, 0.0700, 0.6000, 0.4800, 0.1900, 0.0100, 0.7700, 0.3200,
         0.1700, 0.1100, 0.1900],
        [0.5100, 0.9700, 0.8700

## Models

### PPO Multi-modal Dynamic (Untrained + Pretrained)

In [10]:
class FE_MM_net(BaseFeaturesExtractor):
    """Two-branch ("multi-modal") features extractor for a 12-value observation.

    Columns 0-9 (node status) and columns 10-11 (pod quota) are embedded by
    separate two-layer branches (8 features each), concatenated into a
    16-feature vector and passed through two further linear layers.

    Args:
        observation_space: forwarded to ``BaseFeaturesExtractor``.
        features_dim: width of the returned feature vector (default 16).
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 16):
        super().__init__(observation_space, features_dim)
        self.fc1_1 = nn.Linear(10, 16)  # 5 Nodes status (CPU, Memory)
        self.fc1_2 = nn.Linear(2, 16)   # Pod quota (CPU, Memory)
        self.fc2_1 = nn.Linear(16, 8)
        self.fc2_2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(16, 16)    # Concatenated vector (8 + 8)
        # Last layer: its width follows features_dim so the output matches the size
        # reported to the base class (previously hard-coded to 16, which silently
        # ignored any other features_dim).
        self.fc4 = nn.Linear(16, features_dim)

    def forward(self, x):
        """Return the ``(batch, features_dim)`` features for a ``(batch, 12)`` input."""
        x1 = x[:, :10]
        x2 = x[:, 10:]
        x1 = F.relu(self.fc1_1(x1))
        x2 = F.relu(self.fc1_2(x2))
        x1 = F.relu(self.fc2_1(x1))
        x2 = F.relu(self.fc2_2(x2))
        x = torch.cat((x1, x2), dim=1)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)  # no activation on the final features
        return x

In [11]:
# PPO with the multi-modal extractor and separate 80-80 heads for the policy (pi)
# and the value function (vf).
policy_kwargs = {
    "features_extractor_class": FE_MM_net,
    "features_extractor_kwargs": {"features_dim": 16},
    "net_arch": [{"pi": [80, 80], "vf": [80, 80]}],
}

env = gym.make(
    'SimKubeEnv-v0',
    reward_file='train_dynamic.py',
    scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv',
)

rl_model = sb3.PPO('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
# Show the module tree of the policy just built (rendered as the cell output).
rl_model.policy

ActorCriticPolicy(
  (features_extractor): FE_MM_net(
    (fc1_1): Linear(in_features=10, out_features=16, bias=True)
    (fc1_2): Linear(in_features=2, out_features=16, bias=True)
    (fc2_1): Linear(in_features=16, out_features=8, bias=True)
    (fc2_2): Linear(in_features=16, out_features=8, bias=True)
    (fc3): Linear(in_features=16, out_features=16, bias=True)
    (fc4): Linear(in_features=16, out_features=16, bias=True)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=16, out_features=80, bias=True)
      (1): Tanh()
      (2): Linear(in_features=80, out_features=80, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=16, out_features=80, bias=True)
      (1): Tanh()
      (2): Linear(in_features=80, out_features=80, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=80, out_features=6, bias=True)
  (value_net): Linear(in_features=80, ou

In [13]:
# Save the PPO multi-modal model before any pre-training (model_ppo_mm_ut_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_ppo_mm_ut_dynamic'))

In [14]:
# Actor path of a PPO model: features_extractor -> mlp_extractor.policy_net -> action_net
class PPO_MM_Action_net(nn.Module):
    """Expose the action-producing path of ``original_model.policy`` as one module.

    The three sub-modules are referenced, not copied, so optimising this wrapper
    updates the original policy's weights in place.
    """

    def __init__(self, original_model):
        super().__init__()
        policy = original_model.policy
        self.features_extractor = policy.features_extractor
        self.policy_net = policy.mlp_extractor.policy_net
        self.action_net = policy.action_net

    def forward(self, x):
        """Return the raw ``action_net`` outputs for observation batch ``x``."""
        return self.action_net(self.policy_net(self.features_extractor(x)))

In [15]:
# Wrap rl_model's actor path; the wrapper shares (does not copy) the policy's sub-modules.
ppo_mm_action_model = PPO_MM_Action_net(rl_model)

In [16]:
# Smoke test: forward one probe observation through the actor before pre-training.
ppo_mm_action_model(sample1)

tensor([[ 0.0007,  0.0049,  0.0037, -0.0023, -0.0082, -0.0036]],
       grad_fn=<AddmmBackward0>)

In [17]:
# Training
import torch.optim as optim

# Regress the actor's outputs onto the dataset labels with MSE.
criterion = nn.MSELoss()
# Adam updates the wrapper's parameters, which are the policy's own modules.
optimizer = optim.Adam(ppo_mm_action_model.parameters(), lr=0.001)

In [18]:
def train(model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    for state, target in train_loader:
        optimizer.zero_grad()
        output = model(state)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * state.size(0)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    train_loss /= len(train_loader.dataset)
    accuracy = 100. * correct / len(train_loader.dataset)
    return train_loss, accuracy

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for state, target in test_loader:
            output = model(state)
            test_loss += criterion(output, target).item() * state.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

In [19]:
# Supervised pre-training of the PPO actor: at most `epochs` passes, stopping as
# soon as the test accuracy exceeds 98 %.
epochs = 20
test_acc = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(
        model=ppo_mm_action_model,
        train_loader=train_pr_dataloader,
        criterion=criterion,
        optimizer=optimizer,
    )
    test_loss, test_acc = test(
        model=ppo_mm_action_model,
        test_loader=test_pr_dataloader,
        criterion=criterion,
    )
    print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
    if test_acc > 98:
        break

Epoch 1: Train Loss: 182.4380, Train Acc: 42.84%, Test Loss: 66.3190, Test Acc: 63.11%
Epoch 2: Train Loss: 27.5916, Train Acc: 83.25%, Test Loss: 0.9740, Test Acc: 96.92%
Epoch 3: Train Loss: 2.4533, Train Acc: 97.80%, Test Loss: 0.6422, Test Acc: 98.03%


In [20]:
# PPO_MM_Action_net holds references to these very modules of rl_model.policy, so
# each call below loads a module's state dict into itself; rl_model already
# carries the pre-trained weights.
rl_model.policy.features_extractor.load_state_dict(ppo_mm_action_model.features_extractor.state_dict())
rl_model.policy.mlp_extractor.policy_net.load_state_dict(ppo_mm_action_model.policy_net.state_dict())
rl_model.policy.action_net.load_state_dict(ppo_mm_action_model.action_net.state_dict())

<All keys matched successfully>

In [21]:
# Save the pre-trained PPO multi-modal model (model_ppo_mm_pr_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_ppo_mm_pr_dynamic'))

### PPO Single-modal Dynamic (Untrained + Pretrained)

In [26]:
class FE_SM_net(BaseFeaturesExtractor):
    """Single-branch ("single-modal") features extractor for a 12-value observation.

    The whole observation goes through three linear layers (ReLU after the first
    two, none after the last).

    Args:
        observation_space: forwarded to ``BaseFeaturesExtractor``.
        features_dim: width of the returned feature vector (default 16).
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 16):
        super().__init__(observation_space, features_dim)
        self.fc1 = nn.Linear(12, 16)    # Full 12-value observation
        self.fc2 = nn.Linear(16, 16)    # Hidden layer
        # Last layer: its width follows features_dim so the output matches the size
        # reported to the base class (previously hard-coded to 16).
        self.fc3 = nn.Linear(16, features_dim)

    def forward(self, x):
        """Return the ``(batch, features_dim)`` features for a ``(batch, 12)`` input."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)     # (batch_size, features_dim)
        return x

In [27]:
# PPO with the single-modal extractor and separate 80-80 heads for the policy (pi)
# and the value function (vf).
policy_kwargs = {
    "features_extractor_class": FE_SM_net,
    "features_extractor_kwargs": {"features_dim": 16},
    "net_arch": [{"pi": [80, 80], "vf": [80, 80]}],
}

env = gym.make(
    'SimKubeEnv-v0',
    reward_file='train_dynamic.py',
    scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv',
)

rl_model = sb3.PPO('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [28]:
# Show the module tree of the policy just built (rendered as the cell output).
rl_model.policy

ActorCriticPolicy(
  (features_extractor): FE_SM_net(
    (fc1): Linear(in_features=12, out_features=16, bias=True)
    (fc2): Linear(in_features=16, out_features=16, bias=True)
    (fc3): Linear(in_features=16, out_features=16, bias=True)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=16, out_features=80, bias=True)
      (1): Tanh()
      (2): Linear(in_features=80, out_features=80, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=16, out_features=80, bias=True)
      (1): Tanh()
      (2): Linear(in_features=80, out_features=80, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=80, out_features=6, bias=True)
  (value_net): Linear(in_features=80, out_features=1, bias=True)
)

In [29]:
# Save the PPO single-modal model before any pre-training (model_ppo_sm_ut_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_ppo_sm_ut_dynamic'))

In [30]:
# Actor path of a PPO model: features_extractor -> mlp_extractor.policy_net -> action_net
class PPO_SM_Action_net(nn.Module):
    """Expose the action-producing path of ``original_model.policy`` as one module.

    The three sub-modules are referenced, not copied, so optimising this wrapper
    updates the original policy's weights in place.
    """

    def __init__(self, original_model):
        super().__init__()
        policy = original_model.policy
        self.features_extractor = policy.features_extractor
        self.policy_net = policy.mlp_extractor.policy_net
        self.action_net = policy.action_net

    def forward(self, x):
        """Return the raw ``action_net`` outputs for observation batch ``x``."""
        return self.action_net(self.policy_net(self.features_extractor(x)))

In [31]:
# Wrap rl_model's actor path with the single-modal wrapper defined for this section
# (the multi-modal wrapper class was used here by copy-paste). The wrapper shares
# (does not copy) the policy's sub-modules.
ppo_sm_action_model = PPO_SM_Action_net(rl_model)

In [32]:
# Training
import torch.optim as optim

# Regress the actor's outputs onto the dataset labels with MSE.
criterion = nn.MSELoss()
# Adam updates the wrapper's parameters, which are the policy's own modules.
optimizer = optim.Adam(ppo_sm_action_model.parameters(), lr=0.001)

In [33]:
def train(model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    for state, target in train_loader:
        optimizer.zero_grad()
        output = model(state)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * state.size(0)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    train_loss /= len(train_loader.dataset)
    accuracy = 100. * correct / len(train_loader.dataset)
    return train_loss, accuracy

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for state, target in test_loader:
            output = model(state)
            test_loss += criterion(output, target).item() * state.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

In [34]:
# Supervised pre-training of the PPO actor: at most `epochs` passes, stopping as
# soon as the test accuracy exceeds 98 %.
epochs = 20
test_acc = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(
        model=ppo_sm_action_model,
        train_loader=train_pr_dataloader,
        criterion=criterion,
        optimizer=optimizer,
    )
    test_loss, test_acc = test(
        model=ppo_sm_action_model,
        test_loader=test_pr_dataloader,
        criterion=criterion,
    )
    print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
    if test_acc > 98:
        break

Epoch 1: Train Loss: 156.8899, Train Acc: 66.82%, Test Loss: 17.2506, Test Acc: 89.24%
Epoch 2: Train Loss: 8.9965, Train Acc: 93.89%, Test Loss: 7.8074, Test Acc: 96.61%
Epoch 3: Train Loss: 3.0422, Train Acc: 96.85%, Test Loss: 1.3424, Test Acc: 97.11%
Epoch 4: Train Loss: 2.5247, Train Acc: 97.22%, Test Loss: 1.4347, Test Acc: 97.68%
Epoch 5: Train Loss: 1.9221, Train Acc: 97.47%, Test Loss: 1.3788, Test Acc: 97.49%
Epoch 6: Train Loss: 1.3837, Train Acc: 97.68%, Test Loss: 0.6301, Test Acc: 97.82%
Epoch 7: Train Loss: 1.5111, Train Acc: 97.77%, Test Loss: 0.5548, Test Acc: 98.12%


In [35]:
# The action wrapper holds references to these very modules of rl_model.policy, so
# each call below loads a module's state dict into itself; rl_model already
# carries the pre-trained weights.
rl_model.policy.features_extractor.load_state_dict(ppo_sm_action_model.features_extractor.state_dict())
rl_model.policy.mlp_extractor.policy_net.load_state_dict(ppo_sm_action_model.policy_net.state_dict())
rl_model.policy.action_net.load_state_dict(ppo_sm_action_model.action_net.state_dict())

<All keys matched successfully>

In [36]:
# Save the pre-trained PPO single-modal model (model_ppo_sm_pr_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_ppo_sm_pr_dynamic'))

### DQN Multi-modal Dynamic (Untrained + Pretrained)

In [54]:
class FE_MM_net(BaseFeaturesExtractor):
    """Two-branch ("multi-modal") features extractor for a 12-value observation.

    Columns 0-9 (node status) and columns 10-11 (pod quota) are embedded by
    separate two-layer branches (8 features each), concatenated into a
    16-feature vector and passed through two further linear layers.

    Args:
        observation_space: forwarded to ``BaseFeaturesExtractor``.
        features_dim: width of the returned feature vector (default 16).
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 16):
        super().__init__(observation_space, features_dim)
        self.fc1_1 = nn.Linear(10, 16)  # 5 Nodes status (CPU, Memory)
        self.fc1_2 = nn.Linear(2, 16)   # Pod quota (CPU, Memory)
        self.fc2_1 = nn.Linear(16, 8)
        self.fc2_2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(16, 16)    # Concatenated vector (8 + 8)
        # Last layer: its width follows features_dim so the output matches the size
        # reported to the base class (previously hard-coded to 16, which silently
        # ignored any other features_dim).
        self.fc4 = nn.Linear(16, features_dim)

    def forward(self, x):
        """Return the ``(batch, features_dim)`` features for a ``(batch, 12)`` input."""
        x1 = x[:, :10]
        x2 = x[:, 10:]
        x1 = F.relu(self.fc1_1(x1))
        x2 = F.relu(self.fc1_2(x2))
        x1 = F.relu(self.fc2_1(x1))
        x2 = F.relu(self.fc2_2(x2))
        x = torch.cat((x1, x2), dim=1)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)  # no activation on the final features
        return x

In [55]:
# DQN with the multi-modal extractor followed by an 80-80 Q-network.
policy_kwargs = {
    "features_extractor_class": FE_MM_net,
    "features_extractor_kwargs": {"features_dim": 16},
    "net_arch": [80, 80],
}

env = gym.make(
    'SimKubeEnv-v0',
    reward_file='train_dynamic.py',
    scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv',
)

rl_model = sb3.DQN('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [56]:
# Show the module tree of the policy just built (rendered as the cell output).
rl_model.policy

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FE_MM_net(
      (fc1_1): Linear(in_features=10, out_features=16, bias=True)
      (fc1_2): Linear(in_features=2, out_features=16, bias=True)
      (fc2_1): Linear(in_features=16, out_features=8, bias=True)
      (fc2_2): Linear(in_features=16, out_features=8, bias=True)
      (fc3): Linear(in_features=16, out_features=16, bias=True)
      (fc4): Linear(in_features=16, out_features=16, bias=True)
    )
    (q_net): Sequential(
      (0): Linear(in_features=16, out_features=80, bias=True)
      (1): ReLU()
      (2): Linear(in_features=80, out_features=80, bias=True)
      (3): ReLU()
      (4): Linear(in_features=80, out_features=6, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FE_MM_net(
      (fc1_1): Linear(in_features=10, out_features=16, bias=True)
      (fc1_2): Linear(in_features=2, out_features=16, bias=True)
      (fc2_1): Linear(in_features=16, out_features=8, bias=True)
      (fc2_2): Line

In [57]:
# Save the DQN multi-modal model before any pre-training (model_dqn_mm_ut_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_dqn_mm_ut_dynamic'))

In [58]:
# Smoke test: forward one probe observation through the target Q-network.
rl_model.policy.q_net_target(sample1)

tensor([[-0.0613,  0.0038,  0.0910, -0.1583,  0.0962,  0.0991]],
       grad_fn=<AddmmBackward0>)

In [59]:
# Wrapper around the online Q-network (policy.q_net) of a DQN model
class DQN_MM_net(nn.Module):
    """Expose ``original_model.policy.q_net`` as a plain ``nn.Module``.

    The Q-network is referenced, not copied, so optimising this wrapper updates
    the original model's online network in place.
    """

    def __init__(self, original_model):
        super().__init__()
        self.q_net = original_model.policy.q_net

    def forward(self, x):
        """Return the Q-network's outputs for observation batch ``x``."""
        return self.q_net(x)

In [60]:
# Wrap rl_model's online Q-network; the wrapper shares (does not copy) it.
dqn_mm_model = DQN_MM_net(rl_model)

In [61]:
# Training
import torch.optim as optim

# Regress the Q-network's outputs onto the dataset labels with MSE.
criterion = nn.MSELoss()
# Adam updates the wrapper's parameters, i.e. those of rl_model.policy.q_net.
optimizer = optim.Adam(dqn_mm_model.parameters(), lr=0.001)

In [62]:
def train(model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    for state, target in train_loader:
        optimizer.zero_grad()
        output = model(state)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * state.size(0)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    train_loss /= len(train_loader.dataset)
    accuracy = 100. * correct / len(train_loader.dataset)
    return train_loss, accuracy

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for state, target in test_loader:
            output = model(state)
            test_loss += criterion(output, target).item() * state.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

In [None]:
# Supervised pre-training of the DQN Q-network: at most `epochs` passes, stopping
# as soon as the test accuracy exceeds 95 %.
epochs = 50
test_acc = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(
        model=dqn_mm_model,
        train_loader=train_pr_dataloader,
        criterion=criterion,
        optimizer=optimizer,
    )
    test_loss, test_acc = test(
        model=dqn_mm_model,
        test_loader=test_pr_dataloader,
        criterion=criterion,
    )
    print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
    if test_acc > 95:
        break

In [None]:
# DQN_MM_net only has a `q_net` attribute (there is no `features_extractor` on the
# wrapper), and that `q_net` is the very same module object as rl_model.policy.q_net,
# so the online network already holds the pre-trained weights. Only the target
# network needs to be synchronised with it.
rl_model.policy.q_net_target.load_state_dict(dqn_mm_model.q_net.state_dict())

In [None]:
# Save the pre-trained DQN multi-modal model (model_dqn_mm_pr_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_dqn_mm_pr_dynamic'))

### DQN Single-modal Dynamic (Untrained + Pretrained)

In [None]:
class FE_SM_net(BaseFeaturesExtractor):
    """Single-branch ("single-modal") features extractor for a 12-value observation.

    The whole observation goes through two linear layers (ReLU after the first,
    none after the last).

    Args:
        observation_space: forwarded to ``BaseFeaturesExtractor``.
        features_dim: width of the returned feature vector (default 16).
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 16):
        super().__init__(observation_space, features_dim)
        self.fc1 = nn.Linear(12, 16)    # Full 12-value observation
        # Last layer: its width follows features_dim so the output matches the size
        # reported to the base class (previously hard-coded to 16).
        self.fc2 = nn.Linear(16, features_dim)

    def forward(self, x):
        """Return the ``(batch, features_dim)`` features for a ``(batch, 12)`` input."""
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
# DQN with the single-modal extractor followed by an 80-80 Q-network.
policy_kwargs = {
    "features_extractor_class": FE_SM_net,
    "features_extractor_kwargs": {"features_dim": 16},
    "net_arch": [80, 80],
}

env = gym.make(
    'SimKubeEnv-v0',
    reward_file='train_dynamic.py',
    scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv',
)

rl_model = sb3.DQN('MlpPolicy', env, verbose=1, policy_kwargs=policy_kwargs)

In [None]:
# Show the module tree of the policy just built (rendered as the cell output).
rl_model.policy

In [None]:
# Save the DQN single-modal model before any pre-training (model_dqn_sm_ut_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_dqn_sm_ut_dynamic'))

In [None]:
# Smoke test: forward one probe observation through the target Q-network.
rl_model.policy.q_net_target(sample1)

In [None]:
# Wrapper around the online Q-network (policy.q_net) of a DQN model
class DQN_SM_net(nn.Module):
    """Expose ``original_model.policy.q_net`` as a plain ``nn.Module``.

    The Q-network is referenced, not copied, so optimising this wrapper updates
    the original model's online network in place.
    """

    def __init__(self, original_model):
        # Zero-argument super(): the previous `super(DQN_MM_net, self)` named a
        # different class and raised TypeError when a DQN_SM_net was constructed.
        super().__init__()
        self.q_net = original_model.policy.q_net

    def forward(self, x):
        """Return the Q-network's outputs for observation batch ``x``."""
        x = self.q_net(x)
        return x

In [None]:
# Wrap rl_model's online Q-network; the wrapper stores a reference to it, not a copy.
dqn_sm_model = DQN_SM_net(rl_model)

In [None]:
# Training
import torch.optim as optim

# Regress the Q-network's outputs onto the dataset labels with MSE.
criterion = nn.MSELoss()
# Adam updates the wrapper's parameters, i.e. those of rl_model.policy.q_net.
optimizer = optim.Adam(dqn_sm_model.parameters(), lr=0.001)

In [None]:
def train(model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    for state, target in train_loader:
        optimizer.zero_grad()
        output = model(state)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * state.size(0)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    train_loss /= len(train_loader.dataset)
    accuracy = 100. * correct / len(train_loader.dataset)
    return train_loss, accuracy

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for state, target in test_loader:
            output = model(state)
            test_loss += criterion(output, target).item() * state.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.argmax(dim=1, keepdim=True)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

In [None]:
# Supervised pre-training of the DQN Q-network: at most `epochs` passes, stopping
# as soon as the test accuracy exceeds 95 %.
epochs = 50
test_acc = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(
        model=dqn_sm_model,
        train_loader=train_pr_dataloader,
        criterion=criterion,
        optimizer=optimizer,
    )
    test_loss, test_acc = test(
        model=dqn_sm_model,
        test_loader=test_pr_dataloader,
        criterion=criterion,
    )
    print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
    if test_acc > 95:
        break

In [None]:
# DQN_SM_net only has a `q_net` attribute (there is no `features_extractor` on the
# wrapper), and that `q_net` is the very same module object as rl_model.policy.q_net,
# so the online network already holds the pre-trained weights. Only the target
# network needs to be synchronised with it.
rl_model.policy.q_net_target.load_state_dict(dqn_sm_model.q_net.state_dict())

In [None]:
# Save the pre-trained DQN single-modal model (model_dqn_sm_pr_dynamic).
rl_model.save(os.path.join(base_path, 'notebook', 'net_arch', 'model_dqn_sm_pr_dynamic'))