In [1]:
#set path to folder above this
import sys
sys.path.append("..")
import numpy as np
from torch.distributions.categorical import Categorical
from utils import live_plot
from environments.ElevatorEnvironment import ElevatorEnvironment
from sentence_transformers import SentenceTransformer

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import tqdm
import gym

  from pyRDDLGym.Visualizer.MovieGenerator import MovieGenerator
  from tqdm.autonotebook import tqdm, trange


## Define Language based DQN network

1. `all-mpnet-base-v2` as backbone to transform text state into latent state
2. self-attention transformer layers to relevant attention information (since the embedding is not being trained)
3. FFC to get q-values vector.

In [35]:
class DQNLM(nn.Module):
    def __init__(self, 
                 encoder=None,
                 embedding_dim=768,
                 action_space_n=4,
                 num_transformer_layers=1,
                 fc_hidden_dims=[512, 256],
                 ):
        super(DQNLM, self).__init__()
        
        self.encoder = encoder
                
        self.embedding_dim = embedding_dim
        
        # Transformer encoder layer (self-attention)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.embedding_dim, nhead=8, dim_feedforward=2048
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_transformer_layers)
        
        # Configurable fully connected layers for Q-values
        fc_layers = []
        input_dim = self.embedding_dim
        for hidden_dim in fc_hidden_dims:
            fc_layers.append(nn.Linear(input_dim, hidden_dim))
            fc_layers.append(nn.ReLU())
            input_dim = hidden_dim
        fc_layers.append(nn.Linear(input_dim, action_space_n))  # Output layer for Q-values
        self.fc = nn.Sequential(*fc_layers)
    
    def forward(self, input_texts):
        # Pass input texts through the SentenceTransformer encoder to get embeddings
        # If encoder is None, assume input_texts are already embeddings
        if self.encoder is not None and isinstance(input_texts, list):
            with torch.no_grad():  # Freeze encoder during training
                embeddings = self.encoder.encode(input_texts, convert_to_tensor=True)  # Shape: [batch_size, embedding_dim]
        else:
            embeddings = input_texts
        
        assert isinstance(embeddings, torch.Tensor), f"Embeddings must be a PyTorch tensor, got: {type(embeddings)}, check encoder"
        assert embeddings.shape[1] == self.embedding_dim, f"Embeddings must have shape [batch_size, embedding_dim], got: {embeddings.shape}"
        
        # Reshape embeddings for the transformer encoder (add a sequence length of 1 for compatibility)
        embeddings = embeddings.unsqueeze(0)  # Shape: [1, batch_size, embedding_dim]
        
        # Pass embeddings through the transformer encoder layer
        transformer_output = self.transformer_encoder(embeddings)  # Shape: [1, batch_size, embedding_dim]
        
        # Remove the sequence length dimension
        pooled_output = transformer_output.squeeze(0)  # Shape: [batch_size, embedding_dim]
        
        # Output Q-values
        q_values = self.fc(pooled_output)  # Shape: [batch_size, num_actions]
        return q_values
        

In [8]:
# test inference
model = DQNLM(action_space_n=4).to("cuda")
sentence_encoder = SentenceTransformer("all-mpnet-base-v2")
input = ["Hello, how are you?", "I am fine, thank you."]
input_embeddings = sentence_encoder.encode(input, convert_to_tensor=True).to("cuda")
output = model(input_embeddings)
print(output)



tensor([[ 0.0232,  0.1813,  0.0450,  0.0037],
        [-0.1898,  0.1318, -0.1481, -0.0193]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


## Generate pre-training data using expert agent

In [3]:
#auto reload
%load_ext autoreload
%autoreload 2

In [None]:
# def create_env():
#     env = ElevatorEnvironment()
#     env = gym.wrappers.RecordEpisodeStatistics(env)
    
#     return env

# batch_size = 8
# envs = gym.vector.SyncVectorEnv([lambda: create_env() for _ in range(batch_size)])
# envs.reset()

c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples\manifest.csv
Available example environment(s):
CartPole_continuous -> A simple continuous state-action MDP for the classical cart-pole system by Rich Sutton, with actions that describe the continuous force applied to the cart.
CartPole_discrete -> A simple continuous state MDP for the classical cart-pole system by Rich Sutton, with discrete actions that apply a constant force on either the left or right side of the cart.
Elevators -> The Elevator domain models evening rush hours when people from different floors in a building want to go down to the bottom floor using elevators.
HVAC -> Multi-zone and multi-heater HVAC control problem
MarsRover -> Multi Rover Navigation, where a group of agent needs to harvest mineral.
MountainCar -> A simple continuous MDP for the classical mountain car control problem.
NewLanguage -> Example with

(OrderedDict([('num-person-waiting___f0',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('num-person-waiting___f1',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('num-person-waiting___f2',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('num-person-waiting___f3',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('num-person-waiting___f4',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('num-person-in-elevator___e0',
               array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)),
              ('elevator-dir-up___e0',
               array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
              ('elevator-closed___e0',
               array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
              ('elevator-at-floor___e0__f0',
               array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
              ('elevator-at-floor___e0__f1',
               arr

In [12]:
env = ElevatorEnvironment()

c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples\manifest.csv
Available example environment(s):
CartPole_continuous -> A simple continuous state-action MDP for the classical cart-pole system by Rich Sutton, with actions that describe the continuous force applied to the cart.
CartPole_discrete -> A simple continuous state MDP for the classical cart-pole system by Rich Sutton, with discrete actions that apply a constant force on either the left or right side of the cart.
Elevators -> The Elevator domain models evening rush hours when people from different floors in a building want to go down to the bottom floor using elevators.
HVAC -> Multi-zone and multi-heater HVAC control problem
MarsRover -> Multi Rover Navigation, where a group of agent needs to harvest mineral.
MountainCar -> A simple continuous MDP for the classical mountain car control problem.
NewLanguage -> Example with

In [None]:


# get expert agent
from agents.elevator_expert import ElevatorExpertPolicyAgent

agent = ElevatorExpertPolicyAgent()

# generate expert trajectories
num_trajectories = 100000

trajectories = []

state, _ = env.reset()

num_episodes = 0

for _ in tqdm.tqdm(range(num_trajectories)):
    state_text = env.state_to_text(state)
    action = agent.act(state)
    next_state, reward, done, _, _ = env.step(action)
    next_state_text = env.state_to_text(next_state)
    
    trajectories.append((num_episodes, state_text, action, reward, next_state_text, done))
    
    if done:
        state, _ = env.reset()
        num_episodes += 1
    else:
        state = next_state
        
# save trajectories
import pickle

with open("trajectories/expert_trajectories.pkl", "wb") as f:
    pickle.dump(trajectories, f)    

100%|██████████| 100000/100000 [03:47<00:00, 440.12it/s]


In [23]:
# load trajectories
trajectories = pickle.load(open("trajectories/expert_trajectories.pkl", "rb"))
trajectories[0]

# we first pretrain the DQN model to follow the expert policy
# we need state as input action as output

states = [t[1] for t in trajectories]
actions = [t[2] for t in trajectories]

sentence_encoder = sentence_encoder.to("cuda")
states_tensor = sentence_encoder.encode(states, batch_size=128, convert_to_tensor=True, show_progress_bar=True)
states_tensor.shape

Batches: 100%|██████████| 782/782 [05:12<00:00,  2.50it/s]


torch.Size([100000, 768])

In [24]:
# encode action into one-hot and convert to tensor
actions_tensor = torch.tensor(actions, dtype=torch.long).to("cuda")
actions_tensor = F.one_hot(actions_tensor, num_classes=4).float()
actions_tensor.shape    # [num_trajectories, num_actions]

torch.Size([100000, 4])

In [25]:
# concat state action pairs and save to tensor_data/
torch.save((states_tensor, actions_tensor), "tensor_data/expert_state_action_pairs.pt")

In [None]:
# create dataset and dataloader
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(states_tensor, actions_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# create model
model = DQNLM(encoder=sentence_encoder, action_space_n=4).to("cuda")

# create optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# create loss function for multi-class classification
loss_fn = nn.CrossEntropyLoss()


In [34]:


# train model for 1 epoch
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    pbar = tqdm.tqdm(dataloader, total=len(dataloader))
    
    for states_batch, actions_batch in pbar:
        optimizer.zero_grad()
        print(states_batch.shape, actions_batch.shape)
        q_values = model(states_batch)
        loss = loss_fn(q_values, actions_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        pbar.set_description(f"Loss: {total_loss:.4f}")
        
        data_to_plot = {
            "loss": loss.item()
        }
        
        break
        
        live_plot(data_to_plot)
        
        
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

  0%|          | 0/3125 [00:01<?, ?it/s]

torch.Size([32, 768]) torch.Size([32, 4])





TypeError: len() of a 0-d tensor