In [3]:
import sys
# Add the parent directory to the module search path — presumably so the
# `agents` and `environments` packages imported in the next cell resolve when
# the notebook is run from its own sub-directory (confirm against repo layout).
sys.path.append('..')

In [5]:
import matplotlib.pyplot as plt
import random
import numpy as np
from agents.elevator_expert import ElevatorExpertPolicyAgent
from agents.random_agent import RandomAgent
from agents.llmzero import LLMTransitionModel, LLMRewardModel
from environments.ElevatorEnvironment import ElevatorEnvironment

import os
import dotenv
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by the LLM
# models — confirm which variables are expected.
dotenv.load_dotenv()

# Environment instance used by every rollout and state-to-text conversion below.
env = ElevatorEnvironment()

# LLM-backed reward model under evaluation, with debug output enabled.
llmzero_reward_model = LLMRewardModel(
    debug=True,
    env_params=dict(
        system_prompt_path="../prompts/prompt_elevator_reward.txt",
        # Intended to take only the first match, confined to a single line.
        extract_reward_regex=r"TOTAL_REWARD_FINAL = (.*)\n",
        # NOTE(review): the only fallback pattern is identical to the primary
        # one, so it cannot match anything the primary missed — confirm intent.
        extract_reward_regex_fallback=[r"TOTAL_REWARD_FINAL = (.*)\n"],
    ),
)

c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples\manifest.csv
Available example environment(s):
CartPole_continuous -> A simple continuous state-action MDP for the classical cart-pole system by Rich Sutton, with actions that describe the continuous force applied to the cart.
CartPole_discrete -> A simple continuous state MDP for the classical cart-pole system by Rich Sutton, with discrete actions that apply a constant force on either the left or right side of the cart.
Elevators -> The Elevator domain models evening rush hours when people from different floors in a building want to go down to the bottom floor using elevators.
HVAC -> Multi-zone and multi-heater HVAC control problem
MarsRover -> Multi Rover Navigation, where a group of agent needs to harvest mineral.
MountainCar -> A simple continuous MDP for the classical mountain car control problem.
NewLanguage -> Example with

<op> is one of {<=, <, >=, >}
<rhs> is a deterministic function of non-fluents or constants only.
>> ( sum_{?f: floor} [ elevator-at-floor(?e, ?f) ] ) == 1


In [6]:
# Single seed shared by the global RNGs, the random agent, and the env resets.
SEED = 117

# Seed both global generators (the two calls are independent of each other).
random.seed(SEED)
np.random.seed(SEED)

In [7]:
def _collect_trajectory(agent, environment, seed):
    """Run one episode of `agent` in `environment` and record every step.

    Args:
        agent: Object exposing ``act(state)`` that returns an action.
        environment: Object exposing ``reset(seed)`` (returning a 2-tuple whose
            first item is the initial state) and ``step(action)`` (returning a
            5-tuple).
        seed: Value forwarded to ``environment.reset``.

    Returns:
        list of ``(state, action, reward, next_state, done)`` tuples, one per
        step, in the order the steps were taken.
    """
    state, _ = environment.reset(seed)
    trajectory = []
    done = False

    while not done:
        action = agent.act(state)
        # NOTE(review): only the third return value ends the loop. If the
        # environment follows the Gymnasium convention, the fourth value is
        # `truncated` and is ignored here — confirm the wrapper folds episode
        # truncation into `done`, otherwise this loop may never terminate.
        next_state, reward, done, _, _ = environment.step(action)
        trajectory.append((state, action, reward, next_state, done))
        state = next_state

    return trajectory


random_agent = RandomAgent(env, seed=SEED)
expert_agent = ElevatorExpertPolicyAgent()

# random agent
random_agent_trajectory = _collect_trajectory(random_agent, env, SEED)

# expert agent — use a different seed for the expert agent
expert_agent_trajectory = _collect_trajectory(expert_agent, env, SEED + 1)

In [8]:
# Evaluation set: all random-agent steps followed by all expert-agent steps.
trajectories_combined = [*random_agent_trajectory, *expert_agent_trajectory]

In [9]:
# Display which LLM identifier the reward model is configured with.
llmzero_reward_model.llm_model

'open-mixtral-8x22b'

In [None]:
def test_reward_model(model, trajectories):
    gt_rewards = []
    predicted_rewards = []
    
    for trajectory in trajectories:
        state, action, reward, next_state, done = trajectory
        
        state_text = env.state_to_text(state)
        
        predicted_reward = model.get_reward(state_text)
        
        gt_rewards.append(reward)
        predicted_rewards.append(predicted_reward)
        
    return out