In [1]:
import sys
sys.path.append('..')

In [2]:
%load_ext autoreload
%autoreload 2

In [18]:
import matplotlib.pyplot as plt
import random
import numpy as np
from agents.elevator_expert import ElevatorExpertPolicyAgent
from agents.random_agent import RandomAgent
from agents.llmzero import LLMTransitionModel, LLMRewardModel
from environments.ElevatorEnvironment import ElevatorEnvironment

import os
import dotenv
dotenv.load_dotenv()

env = ElevatorEnvironment()

c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples c:\Users\ianch\miniconda3\envs\aiplanning\Lib\site-packages\pyRDDLGym\Examples\manifest.csv
Available example environment(s):
CartPole_continuous -> A simple continuous state-action MDP for the classical cart-pole system by Rich Sutton, with actions that describe the continuous force applied to the cart.
CartPole_discrete -> A simple continuous state MDP for the classical cart-pole system by Rich Sutton, with discrete actions that apply a constant force on either the left or right side of the cart.
Elevators -> The Elevator domain models evening rush hours when people from different floors in a building want to go down to the bottom floor using elevators.
HVAC -> Multi-zone and multi-heater HVAC control problem
MarsRover -> Multi Rover Navigation, where a group of agent needs to harvest mineral.
MountainCar -> A simple continuous MDP for the classical mountain car control problem.
NewLanguage -> Example with

<op> is one of {<=, <, >=, >}
<rhs> is a deterministic function of non-fluents or constants only.
>> ( sum_{?f: floor} [ elevator-at-floor(?e, ?f) ] ) == 1


In [None]:
llmzero_reward_model = LLMRewardModel(
    debug=False,
    env_params={
                    "system_prompt_path": "../prompts/prompt_elevator_reward.txt",
                    "extract_reward_regex": r"TOTAL_REWARD_FINAL = (.*)\n", # only use the first match, same line
                    "extract_reward_regex_fallback": [r"TOTAL_REWARD_FINAL = (.*)\n"],
                    "extract_done_regex": r"done: (.*)",
                    "extract_done_regex_fallback": [r"done: (.*)"],
                },
    load_prompt_buffer_path="../prompt_buffer/elevator_reward_20241110_014634.pkl",
    prompt_buffer_prefix="../prompt_buffer/elevator_reward",
)

In [20]:
SEED = 117
np.random.seed(SEED)
random.seed(SEED)

In [21]:
random_agent = RandomAgent(env, seed=SEED)
expert_agent = ElevatorExpertPolicyAgent()

# random agent
state, _ = env.reset(SEED)
done = False

random_agent_trajectory = []

while not done:
    action = random_agent.act(state)
    next_state, reward, done, _, _ = env.step(action)
    random_agent_trajectory.append((state, action, reward, next_state, done))
    state = next_state
    
# expert agent
state, _ = env.reset(SEED + 1)  # use a different seed for the expert agent
done = False

expert_agent_trajectory = []

while not done:
    action = expert_agent.act(state)
    next_state, reward, done, _, _ = env.step(action)
    expert_agent_trajectory.append((state, action, reward, next_state, done))
    state = next_state

In [22]:
trajectories_combined = random_agent_trajectory + expert_agent_trajectory

In [23]:
llmzero_reward_model.llm_model

'mistral-large-2407'

In [33]:
import tqdm

def test_reward_model(model, trajectories):
    gt_rewards = []
    predicted_rewards = []
    squared_errors = []
    status_list = []
    
    pbar = tqdm.tqdm(trajectories)
    
    for trajectory in pbar:
        state, action, reward, next_state, done = trajectory
        
        state_text = env.state_to_text(state)
        action_text = env.action_to_text(action)
        
        predicted_reward, status = model.get_reward(state_text, action_text)
        
        gt_rewards.append(reward)
        predicted_rewards.append(predicted_reward)
        squared_errors.append((reward - predicted_reward) ** 2 / (reward + 1.0))
        status_list.append(status)
        
        pbar.set_description(f"Squared error: {np.mean(squared_errors)}")
        
    return gt_rewards, predicted_rewards, status_list

In [34]:
gt_rewards, predicted_rewards, status_list = test_reward_model(llmzero_reward_model, trajectories_combined)

Squared error: -93.54598436216806:  68%|██████▊   | 274/400 [01:01<00:48,  2.58it/s]   

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -93.1799607114933:  71%|███████▏  | 285/400 [02:42<09:42,  5.06s/it] 

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -92.04032326695396:  74%|███████▍  | 295/400 [05:09<34:55, 19.95s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -91.46837960003316:  76%|███████▋  | 305/400 [06:56<17:24, 10.99s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -90.20826998083032:  79%|███████▉  | 315/400 [08:18<11:06,  7.84s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -88.7088428629399:  81%|████████▏ | 325/400 [10:09<13:04, 10.46s/it] 

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -87.32342350197447:  84%|████████▍ | 335/400 [12:34<12:02, 11.12s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -86.19945649444837:  86%|████████▋ | 345/400 [15:41<13:43, 14.98s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -85.47329849744098:  89%|████████▉ | 355/400 [17:33<10:20, 13.79s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -86.06216330469893:  91%|█████████▏| 365/400 [19:12<05:41,  9.77s/it]

Saving prompt buffer to ../prompt_buffer/elevator_reward_20241110_014634.pkl


Squared error: -85.26172570916157:  93%|█████████▎| 371/400 [20:32<01:36,  3.32s/it]

Error: No match found with fallback regex, using full response as reward





TypeError: unsupported operand type(s) for -: 'float' and 'str'