# Prep

In [1]:
import os, sys

base_path = os.path.join(os.getcwd(), "../..")
print(f"Base Path: {base_path}")
sys.path.append(base_path)

Base Path: /Users/swkim/Documents/coding/thesis/PROMES_colab/notebook/dqfd/../..


In [2]:
import stable_baselines3 as sb3

import gym
from gym import spaces

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from kube_sim_gym.envs import *

# Deep Q Learning from Demonstrations

## Expert Data

In [55]:
import importlib

class ExpertData:
    def __init__(self, expert_data_path='expert_data/ed1.csv'):
        self.expert_data_path = expert_data_path
        self.expert_data = self.load_expert_data()
        
    def load_expert_data(self): # csv file
        with open(self.expert_data_path, 'r') as f:
            self.expert_data = np.loadtxt(f, delimiter=',')
        return self.expert_data
    
    def generate_expert_data(self, reward_path, scenario_path, file_path, expert_fname):
        env = gym.make('SimKubeEnv-v0', reward_file=reward_path, scenario_file=scenario_path)
        state = env.reset()

        from kube_hr_scheduler.scheduler.sim_hr_scheduler import SimHrScheduler
        expert = SimHrScheduler(env, expert_fname)

        with open(file_path, 'a') as f:
            done = False
            while not done:
                state = env.get_state()
                action = expert.decision(env)

                _, reward, done, info = env.step(action)

                next_state = env.get_state()

                expert_data = np.concatenate((state, next_state, [action,reward, done]))
                # Round to 2 decimal places
                expert_data = np.round(expert_data, 2)
                f.write(','.join([str(x) for x in expert_data]) + '\n')

                state = next_state

        return self.load_expert_data()

    def sample(self, batch_size):
        if self.expert_data is None:
            self.load_expert_data()
        idx = np.random.randint(0, len(self.expert_data), batch_size)
        return self.expert_data[idx]

In [76]:
expert_data = ExpertData()

In [57]:
expert_data.generate_expert_data('train_dynamic.py', 'random', 'expert_data/ed1.csv', 'default.py')

array([[ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.5 ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.5 ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  4.  ,  0.5 ,  0.  ],
       ...,
       [ 0.9 ,  0.96,  0.95, ...,  0.  ,  0.5 ,  0.  ],
       [ 0.75,  0.82,  0.95, ...,  1.  , -0.01,  0.  ],
       [ 0.84,  0.93,  0.95, ...,  5.  ,  0.5 ,  1.  ]])

## Critic & Policy

In [104]:
import torch

class Critic:

    def __init__(self, env):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.critic = torch.nn.Sequential(
            torch.nn.Linear(self.state_dim + 1, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 1),
        )

        self.optimizer = torch.optim.Adam(self.critic.parameters())

    def forward(self, state, action):
        q_value = self.critic(torch.cat([state, action], dim=1))
        return q_value

    def update(self, batch_state, batch_action, batch_reward, batch_done):

        q_values = self.critic(batch_state, batch_action)
        target_q_values = batch_reward + self.gamma * q_values.clone() * (1 - batch_done)

        self.optimizer.zero_grad()
        loss = torch.nn.MSELoss()(q_values, target_q_values)
        loss.backward()
        self.optimizer.step()

class Policy:

    def __init__(self, env):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.policy = torch.nn.Sequential(
            torch.nn.Linear(self.state_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, self.action_dim),
            torch.nn.Softmax(dim=1),
        )

        self.optimizer = torch.optim.Adam(self.policy.parameters())

    def forward(self, state):
        probs = self.policy(state)
        return probs

    def update(self, batch_state, batch_reward, batch_done):

        probs = self.policy(batch_state)
        log_probs = torch.log(probs)
        entropy = -torch.sum(probs * log_probs, dim=1)

        advantage = batch_reward + self.gamma * entropy * (1 - batch_done) - log_probs

        self.optimizer.zero_grad()
        loss = -torch.sum(advantage * probs, dim=1)
        loss.backward()
        self.optimizer.step()


## DQfD

In [105]:
import torch

class DQfD:

    def __init__(self, 
                #  env,
                 policy, 
                 critic, 
                 expert_data, 
                 batch_size, 
                 gamma, 
                 tau):
        # self.env = env
        self.policy = policy
        self.critic = critic
        self.expert_data = expert_data
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.num_iterations = 100

    def learn(self):
        for i in range(self.num_iterations):
            # Sample a batch of transitions from the expert data.
            # batch = self.sample_batch()
            # print(batch)

            batch_state, batch_next_state, batch_action, batch_reward, batch_done = self.sample_batch()

            # batch_state = torch.tensor(batch[0], dtype=torch.float32)
            # batch_next_state = torch.tensor(batch[1][:, 12:24], dtype=torch.float32)
            # batch_action = torch.tensor(batch[:, 24:25], dtype=torch.int64)
            # batch_reward = torch.tensor(batch[:, 25:26], dtype=torch.float32)
            # batch_done = torch.tensor(batch[:, 26:27], dtype=torch.float32)
            

            # Calculate the Q-values for the expert actions.
            q_values = self.critic(torch.cat((batch_state, batch_action), dim=1))

            # Update the critic.
            self.critic.update(batch_state, batch_action, batch_reward, batch_done)

            # Update the policy.
            self.policy.update(batch_state, batch_reward, batch_done)

    def sample_batch(self):
        batch = []
        for _ in range(self.batch_size):
            data = self.expert_data.sample(self.batch_size)
            data = torch.tensor(data, dtype=torch.float32)
            state = data[:, :12]
            next_state = data[:, 12:24]
            action = data[:, 24:25]
            reward = data[:, 25:26]
            done = data[:, 26:27]
        return state, next_state, action, reward, done

In [106]:
env = gym.make('SimKubeEnv-v0', reward_file='train_dynamic.py', scenario_file='random')
policy_net = Policy(env)
critic_net = Critic(env)
expert_data = ExpertData()
batch_size = 32
gamma = 0.99
tau = 0.001

dqfd = DQfD(policy_net, critic_net, expert_data, batch_size, gamma, tau)

In [107]:
dqfd.learn()

TypeError: 'Critic' object is not callable

In [103]:
dqfd.sample_batch()[:]

(tensor([[0.9600, 0.9600, 0.8400, 0.9200, 0.8600, 0.9500, 0.9800, 0.7800, 0.6800,
          0.8900, 0.0700, 0.1300],
         [0.9900, 0.8900, 0.9400, 0.8100, 0.7700, 0.9900, 0.7600, 0.9600, 0.9800,
          0.8400, 0.1000, 0.0600],
         [1.0000, 1.0000, 0.9300, 0.8400, 0.9600, 0.7900, 0.9000, 0.9600, 0.9600,
          0.9900, 0.1100, 0.1300],
         [0.9400, 0.9600, 0.9600, 0.9300, 0.7500, 0.8600, 0.9500, 0.8600, 0.9800,
          0.9200, 0.0700, 0.0100],
         [0.7600, 0.9100, 0.8400, 0.9200, 0.8600, 0.9500, 1.0000, 0.8400, 0.6800,
          0.8900, 0.1500, 0.1300],
         [0.9400, 0.7100, 0.9600, 0.9700, 0.6800, 0.9200, 0.7400, 0.8900, 0.9900,
          0.9100, 0.1400, 0.1500],
         [0.8900, 0.9800, 0.9200, 0.9400, 0.9100, 0.9100, 0.7100, 1.0000, 0.8700,
          0.8600, 0.1100, 0.1200],
         [0.8600, 0.8700, 0.9000, 0.7000, 0.8600, 0.8200, 1.0000, 0.7700, 0.9700,
          0.9600, 0.1500, 0.1000],
         [0.7700, 0.9200, 0.8100, 0.9700, 0.8800, 0.9800, 0.8200