In [4]:
import os, sys

base_path = os.path.join(os.getcwd(), "..")
print(f"Base Path: {base_path}")
sys.path.append(base_path)

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [5]:
# Load gym environment
import gym
from kube_sim_gym import *
from kube_sim_gym.envs.sim_kube_env import SimKubeEnv

In [6]:
from kube_hr_scheduler.scheduler.sim_hr_scheduler import SimHrScheduler
from kube_hr_scheduler.strategies.model.default import Model

In [7]:
import gym
import numpy as np
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.data.types import Transitions

## Expert from dataset

In [8]:
# Test version
# data = np.genfromtxt('../dataset/data_expert.csv', delimiter=',', max_rows=1000000) 

# Final version
data = np.genfromtxt('../dataset/data_expert.csv', delimiter=',') 

In [9]:
# Split the data into obs, acts, infos, next_obs, and dones
obs = data[:, :12]
acts = data[:, 12]
infos = np.empty(len(data), dtype=dict)
next_obs = data[:, 13:25]
dones = data[:, 25].astype(bool)

In [10]:
transitions = Transitions(obs, acts, infos, next_obs, dones)

In [11]:
transitions[13]

{'obs': array([0.91, 0.85, 0.73, 0.76, 0.83, 0.75, 0.82, 0.94, 0.76, 0.84, 0.05,
        0.14]),
 'acts': 1.0,
 'infos': None,
 'next_obs': array([0.86, 0.71, 0.73, 0.76, 0.83, 0.75, 0.82, 0.94, 0.76, 0.84, 0.04,
        0.06]),
 'dones': False}

## Expert from newly trained model

In [12]:
# def train_expert():
#     print("Training a expert.")

#     expert = DQN(
#         policy='MlpPolicy',
#         env=env
#     )

#     expert.learn(10000)  # Note: change this to 100000 to train a decent expert.
#     return expert

In [13]:
# def sample_expert_transitions():
#     expert = train_expert()

#     print("Sampling expert transitions.")
#     rollouts = rollout.rollout(
#         expert,
#         DummyVecEnv([lambda: RolloutInfoWrapper(env)]),
#         rollout.make_sample_until(min_timesteps=None, min_episodes=50),
#         rng=rng,
#     )
#     return rollout.flatten_trajectories(rollouts)

In [11]:
from stable_baselines3.common.evaluation import evaluate_policy

def eval_unit(model, env, num_eval_episodes=2):
    rews, lens = evaluate_policy(model, env, n_eval_episodes=num_eval_episodes, return_episode_rewards=True)
    # Takes the mean of rews elements divided by lens elements
    mean_rew = np.mean([rew / length for rew, length in zip(rews, lens)]).round(2)

    # Takes the difference between the maximum and minimum of reward elements
    std_rew = (np.max(rews) - np.min(rews)) / 2

    mean_rew = round(mean_rew, 2)
    std_rew = round(std_rew, 2)

    return (mean_rew, std_rew)


def eval_set(model):
    eval_rur_env1 = gym.make('SimKubeEnv-v0', reward_file='eval_rur.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_rur_env2 = gym.make('SimKubeEnv-v0', reward_file='eval_rur.py', scenario_file='scenario-10l-3m-1000p-10m_unbalanced.csv')
    eval_rur_env3 = gym.make('SimKubeEnv-v0', reward_file='eval_rur.py', scenario_file='scenario-3l-10m-1000p-10m_unbalanced.csv')

    eval_rbd1_env1 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd1.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_rbd1_env2 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd1.py', scenario_file='scenario-10l-3m-1000p-10m_unbalanced.csv')
    eval_rbd1_env3 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd1.py', scenario_file='scenario-3l-10m-1000p-10m_unbalanced.csv')

    eval_rbd2_env1 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd2.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
    eval_rbd2_env2 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd2.py', scenario_file='scenario-10l-3m-1000p-10m_unbalanced.csv')
    eval_rbd2_env3 = gym.make('SimKubeEnv-v0', reward_file='eval_rbd2.py', scenario_file='scenario-3l-10m-1000p-10m_unbalanced.csv')

    rur1 = eval_unit(model, eval_rur_env1, num_eval_episodes=2)
    rur2 = eval_unit(model, eval_rur_env2, num_eval_episodes=2)
    rur3 = eval_unit(model, eval_rur_env3, num_eval_episodes=2)

    print(rur1, rur2, rur3)

    rbd11 = eval_unit(model, eval_rbd1_env1, num_eval_episodes=2)
    rbd12 = eval_unit(model, eval_rbd1_env2, num_eval_episodes=2)
    rbd13 = eval_unit(model, eval_rbd1_env3, num_eval_episodes=2)

    print(rbd11, rbd12, rbd13)

    rbd21 = eval_unit(model, eval_rbd2_env1, num_eval_episodes=2)
    rbd22 = eval_unit(model, eval_rbd2_env2, num_eval_episodes=2)
    rbd23 = eval_unit(model, eval_rbd2_env3, num_eval_episodes=2)

    print(rbd21, rbd22, rbd23)
    

## Student training

In [14]:
# Initialize environment without rendering
env = gym.make("SimKubeEnv-v0", reward_file='train_step_3.py')
rng = np.random.default_rng(0)

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/..


In [15]:
bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
    rng=rng,
)

In [16]:
eval_set(bc_trainer.policy)

NameError: name 'eval_set' is not defined

In [17]:
# bc_trainer.train(log_interval=100000, n_batches=100000) # Test version 
bc_trainer.train(n_epochs=10, log_interval=100000) # Fianl version

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00179 |
|    entropy        | 1.79     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 88.5     |
|    loss           | 1.79     |
|    neglogp        | 1.79     |
|    prob_true_act  | 0.167    |
|    samples_so_far | 32       |
--------------------------------


99976batch [06:43, 243.57batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 100000    |
|    ent_loss       | -0.000157 |
|    entropy        | 0.157     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 1.43e+03  |
|    loss           | 0.058     |
|    neglogp        | 0.0581    |
|    prob_true_act  | 0.948     |
|    samples_so_far | 3200032   |
---------------------------------


199987batch [13:12, 197.03batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 200000    |
|    ent_loss       | -4.35e-05 |
|    entropy        | 0.0435    |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 2.19e+03  |
|    loss           | 0.0352    |
|    neglogp        | 0.0353    |
|    prob_true_act  | 0.976     |
|    samples_so_far | 6400032   |
---------------------------------


299984batch [19:44, 286.94batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 300000    |
|    ent_loss       | -0.000223 |
|    entropy        | 0.223     |
|    epoch          | 2         |
|    l2_loss        | 0         |
|    l2_norm        | 2.87e+03  |
|    loss           | 0.144     |
|    neglogp        | 0.144     |
|    prob_true_act  | 0.892     |
|    samples_so_far | 9600032   |
---------------------------------


399996batch [26:22, 268.73batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 400000    |
|    ent_loss       | -0.000119 |
|    entropy        | 0.119     |
|    epoch          | 3         |
|    l2_loss        | 0         |
|    l2_norm        | 3.52e+03  |
|    loss           | 0.104     |
|    neglogp        | 0.104     |
|    prob_true_act  | 0.936     |
|    samples_so_far | 12800032  |
---------------------------------


499992batch [34:07, 101.02batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500000    |
|    ent_loss       | -4.93e-05 |
|    entropy        | 0.0493    |
|    epoch          | 4         |
|    l2_loss        | 0         |
|    l2_norm        | 4.14e+03  |
|    loss           | 0.0154    |
|    neglogp        | 0.0155    |
|    prob_true_act  | 0.986     |
|    samples_so_far | 16000032  |
---------------------------------


599977batch [42:58, 199.69batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 600000   |
|    ent_loss       | -0.0001  |
|    entropy        | 0.1      |
|    epoch          | 5        |
|    l2_loss        | 0        |
|    l2_norm        | 4.72e+03 |
|    loss           | 0.0939   |
|    neglogp        | 0.094    |
|    prob_true_act  | 0.943    |
|    samples_so_far | 19200032 |
--------------------------------


699993batch [53:16, 133.26batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 700000    |
|    ent_loss       | -3.47e-05 |
|    entropy        | 0.0347    |
|    epoch          | 6         |
|    l2_loss        | 0         |
|    l2_norm        | 5.26e+03  |
|    loss           | 0.0427    |
|    neglogp        | 0.0427    |
|    prob_true_act  | 0.974     |
|    samples_so_far | 22400032  |
---------------------------------


799996batch [1:02:50, 221.54batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 800000    |
|    ent_loss       | -4.55e-05 |
|    entropy        | 0.0455    |
|    epoch          | 7         |
|    l2_loss        | 0         |
|    l2_norm        | 5.77e+03  |
|    loss           | 0.0163    |
|    neglogp        | 0.0164    |
|    prob_true_act  | 0.985     |
|    samples_so_far | 25600032  |
---------------------------------


899994batch [1:12:35, 175.39batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 900000    |
|    ent_loss       | -0.000142 |
|    entropy        | 0.142     |
|    epoch          | 8         |
|    l2_loss        | 0         |
|    l2_norm        | 6.23e+03  |
|    loss           | 0.0886    |
|    neglogp        | 0.0888    |
|    prob_true_act  | 0.935     |
|    samples_so_far | 28800032  |
---------------------------------


999996batch [1:23:22, 136.59batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000000   |
|    ent_loss       | -4.38e-05 |
|    entropy        | 0.0438    |
|    epoch          | 9         |
|    l2_loss        | 0         |
|    l2_norm        | 6.66e+03  |
|    loss           | 0.0473    |
|    neglogp        | 0.0474    |
|    prob_true_act  | 0.969     |
|    samples_so_far | 32000032  |
---------------------------------


1071450batch [1:28:45, 201.18batch/s]


In [16]:
eval_set(bc_trainer.policy)

(0.88, 0.0) (0.86, 0.0) (0.91, 0.0)
(0.9, 0.0) (0.87, 0.0) (0.93, 0.0)
(0.94, 0.0) (0.93, 0.0) (0.96, 0.0)


In [18]:
from stable_baselines3.common.callbacks import EvalCallback
eval_env1 = gym.make("SimKubeEnv-v0", reward_file='eval_rur.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
eval_env2 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd1.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')
eval_env3 = gym.make("SimKubeEnv-v0", reward_file='eval_rbd2.py', scenario_file='scenario-5l-5m-1000p-10m_unbalanced.csv')

# Test version
# eval_callback1 = EvalCallback(eval_env1, eval_freq=100000, n_eval_episodes=3, deterministic=True, render=False)
# eval_callback2 = EvalCallback(eval_env2, eval_freq=100000, n_eval_episodes=3, deterministic=True, render=False)
# eval_callback3 = EvalCallback(eval_env3, eval_freq=100000, n_eval_episodes=3, deterministic=True, render=False)

# Final verison
eval_callback1 = EvalCallback(eval_env1, eval_freq=10000, n_eval_episodes=3, deterministic=True, render=False, log_path="results/poc_1/rur")
eval_callback2 = EvalCallback(eval_env2, eval_freq=10000, n_eval_episodes=3, deterministic=True, render=False, log_path="results/poc_1/rbd1")
eval_callback3 = EvalCallback(eval_env3, eval_freq=10000, n_eval_episodes=3, deterministic=True, render=False, log_path="results/poc_1/rbd2")

In [19]:
model1 = DQN('MlpPolicy', env, verbose=1)
model2 = DQN('MlpPolicy', env, verbose=1)
model3 = DQN('MlpPolicy', env, verbose=1)

model1.policy = bc_trainer.policy
model2.policy = bc_trainer.policy
model3.policy = bc_trainer.policy

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
# Test version
# model1.learn(total_timesteps=5e5, 
#             log_interval=50, 
#             # progress_bar=True, 
#             callback=eval_callback1
#            )

# Final version
model1.learn(total_timesteps=1e6, 
            log_interval=50, 
            # progress_bar=True, 
            callback=eval_callback1
           )

Eval num_timesteps=100000, episode_reward=1423.02 +/- 0.00
Episode length: 1609.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | 1.42e+03 |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 100000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0417   |
|    n_updates        | 12499    |
----------------------------------
New best mean reward!
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.22e+03 |
|    ep_rew_mean      | -620     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 50       |
|    fps              | 1201     |
|    time_elapsed     | 92       |
|    total_timesteps  | 111039   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss  

<stable_baselines3.dqn.dqn.DQN at 0x7fbcfb2e66d0>

In [20]:
# Test version
# model2.learn(total_timesteps=5e5, 
#             log_interval=50, 
#             # progress_bar=True, 
#             callback=eval_callback1
#            )

# Final version
model2.learn(total_timesteps=1e6, 
            log_interval=50, 
            # progress_bar=True, 
            callback=eval_callback2
           )



Eval num_timesteps=10000, episode_reward=1458.67 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | 1.46e+03 |
| rollout/            |          |
|    exploration_rate | 0.905    |
| time/               |          |
|    total_timesteps  | 10000    |
----------------------------------
New best mean reward!
Eval num_timesteps=20000, episode_reward=1458.67 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | 1.46e+03 |
| rollout/            |          |
|    exploration_rate | 0.81     |
| time/               |          |
|    total_timesteps  | 20000    |
----------------------------------
Eval num_timesteps=30000, episode_reward=1458.67 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    

<stable_baselines3.dqn.dqn.DQN at 0x7fbe6512ff40>

In [21]:
# Test version
# model3.learn(total_timesteps=5e5, 
#             log_interval=50, 
#             # progress_bar=True, 
#             callback=eval_callback1
#            )

# Final version
model3.learn(total_timesteps=1e6, 
            log_interval=50, 
            # progress_bar=True, 
            callback=eval_callback3
           )

Eval num_timesteps=10000, episode_reward=1527.46 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | 1.53e+03 |
| rollout/            |          |
|    exploration_rate | 0.905    |
| time/               |          |
|    total_timesteps  | 10000    |
----------------------------------
New best mean reward!
Eval num_timesteps=20000, episode_reward=1527.46 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | 1.53e+03 |
| rollout/            |          |
|    exploration_rate | 0.81     |
| time/               |          |
|    total_timesteps  | 20000    |
----------------------------------
Eval num_timesteps=30000, episode_reward=1527.46 +/- 0.00
Episode length: 1606.00 +/- 0.00
----------------------------------
| eval/               |          |
|    

<stable_baselines3.dqn.dqn.DQN at 0x7fbe65142640>

In [None]:
rur_default = [0.88, 0.86, 0.87, 0.87, 0.89, 0.88, 0.88, 0.86, 0.86, 0.87]
rur_random = [0.81, 0.82, 0.84, 0.79, 0.81, 0.83, 0.78, 0.83, 0.82, 0.84]
rur_antcolony = [0.86, 0.85, 0.89, 0.88, 0.89, 0.90, 0.85, 0.86, 0.88, 0.90]

rur_st_ut = [0.87, 0.88, 0.89, 0.89, 0.88, 0.88, 0.88, 0.89, 0.88, 0.89]
rur_st_pt = [0.91, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.91,0.91, 0.92]
rur_dy_ut = [0.92, 0.91, 0.91, 0.91, 0.9, 0.91, 0.9, 0.9, 0.9, 0.9]
rur_dy_pt = [0.91, 0.92, 0.91, 0.93, 0.93, 0.93, 0.93, 0.93, 0.93, 0.92]

data = [rur_default, rur_random, rur_antcolony, rur_st_ut, rur_st_pt, rur_dy_ut, rur_dy_pt]

plt.style.use('_mpl-gallery')

fig, ax = plt.subplots(figsize=(4, 2))


VP = ax.boxplot(data, positions=[2, 4, 6, 8, 10, 12, 14], widths=1.5, patch_artist=True,
                showmeans=False, showfliers=False,
                medianprops={"color": "#8C8C8C", "linewidth": 0.5},
                boxprops={"facecolor": "#BFBFBF", "edgecolor": "white",
                          "linewidth": 0.5},
                whiskerprops={"color": "#8C8C8C", "linewidth": 1.5},
                capprops={"color": "#8C8C8C", "linewidth": 1.5})

# Draw horizontal line at 0.9
ax.axhline(y=0.9, color='#8C8C8C', linestyle='--', linewidth=1)

# Change x axis labels tilt
for tick in ax.get_xticklabels():
    tick.set_rotation(45)

# Change x axis labels
ax.set_xticklabels(['RUR', 'RBD1', 'RBD2', 'ST-UT', 'ST-PT', 'DY-UT', 'DY-PT'])
# Set x axos fontsize smaller
ax.tick_params(axis='x', labelsize=8)


# Move y axis closer to plot
ax.tick_params(axis='y', pad=0)

# Make plot starts from 0 in x axis
ax.set_xlim(left=0)

# y axis round to 2 decimal places
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# Show *100 of y axis but not show % symbol
vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0%}'.format(x) for x in vals])

# ylabel fontsize
ax.set_ylabel('Average Reward', fontsize=8)


# Highlight DY-PT
VP['boxes'][6].set_facecolor('#FFC000')

# Set labels
ax.set_xlabel('Algorithm')
ax.set_ylabel('Average Reward')

ax.grid(False)
plt.show()

In [None]:
a = ['1', '2', '3']
# Change all to elements in a to integer
a = list(map(int, a))