# Simulating an environment in pyRDDLGym with a custom policy.

This follow-up example illustrates how to define a custom policy to interact with an environment.

First install the required packages:

In [1]:
pip install --quiet --upgrade pip pyRDDLGym rddlrepository

Note: you may need to restart the kernel to use updated packages.


Import the required packages:

In [2]:
import warnings
warnings.filterwarnings('ignore')
import sys
import numpy as np

import pyRDDLGym
from pyRDDLGym.core.policy import RandomAgent, BaseAgent

We will first attempt to run the Elevators domain with the built-in random policy:

In [3]:
# Build instance '0' of the Elevators domain with action-constraint
# enforcement switched on.
env = pyRDDLGym.make('Elevators', '0', enforce_action_constraints=True)

# Built-in random policy, configured from the environment's action space
# and its maximum number of allowed actions.
agent = RandomAgent(
    action_space=env.action_space,
    num_actions=env.max_allowed_actions,
)

# Any failure raised during the rollout is reported on stderr as a one-line
# message instead of a full traceback, so the notebook keeps running.
try:
    return_stats = agent.evaluate(env, episodes=20)
except Exception as err:
    print(err, file=sys.stderr)

Precondition 0 is not satisfied for actions {'move-current-dir': array([False,  True]), 'open-door': array([False,  True]), 'close-door': array([False, False])}.
>> ( forall_{?e: elevator} [ ( ( open-door(?e) + close-door(?e) ) + move-current-dir(?e) ) <= 1 ] )


pyRDDLGym provides some support for parsing bounds on action constraints, but unfortunately it cannot parse arbitrarily complex non-box constraints. This means the built-in random policy will sample actions that do not satisfy the required constraints. In this case, the action-precondition requires that, for each elevator, at most one non-noop action can be chosen.

To solve this, we will create a subclass of `BaseAgent` that respects the constraint on each elevator during sampling:

In [4]:
class ElevatorsPolicy(BaseAgent):
    """Random policy that selects at most one action per elevator.

    For each elevator a single uniform draw picks one of four outcomes:
    move, open the door, close the door, or do nothing. Because only one
    action fluent is ever set per elevator, the sampled action never sets
    more than one of the three fluents for the same elevator.
    """

    def __init__(self, elevators):
        """Store the elevator object names that actions are sampled for."""
        self.elevators = elevators

    def sample_action(self, state):
        """Sample an action dict mapping grounded action names to True.

        `state` is accepted as part of the method signature but is not
        consulted. Elevators whose draw selects "do nothing" contribute
        no entry to the returned dict.
        """
        action = {}
        for e in self.elevators:
            draw = np.random.uniform()
            if draw >= 0.75:
                # top quarter of the draw: leave this elevator idle
                continue
            if draw < 0.25:
                # first quarter: move in the current direction
                action[f'move-current-dir___{e}'] = True
            elif draw < 0.5:
                # second quarter: open the door
                action[f'open-door___{e}'] = True
            else:
                # third quarter: close the door
                action[f'close-door___{e}'] = True
        return action

Next we will create an instance of this policy by passing in the objects of type "elevator":

In [5]:
# Look up the objects registered under the RDDL type 'elevator' in the
# environment's model; ElevatorsPolicy uses each one as the suffix of a
# grounded action name.
elevators = env.model.type_to_objects['elevator']
# Rebinds `agent`, replacing the RandomAgent created in the earlier cell.
agent = ElevatorsPolicy(elevators)

We are now ready to simulate! Let's simulate one trial and check that the actions sampled respect the constraints in the RDDL:

In [6]:
# Shorten the episode so the verbose trace below stays readable.
# This sets the attribute on `env` itself, so the value persists for later cells.
env.horizon = 10   # let's just see the first 10 steps
# Roll out a single episode with verbose output; as the last expression in
# the cell, the value returned by evaluate() is displayed as the cell output.
agent.evaluate(env, episodes=1, verbose=True)

initial state = 
         num-person-waiting___f0 = 0          num-person-waiting___f1 = 0          num-person-waiting___f2 = 0     
         num-person-waiting___f3 = 0          num-person-waiting___f4 = 0      num-person-in-elevator___e0 = 0     
     num-person-in-elevator___e1 = 0             elevator-dir-up___e0 = True          elevator-dir-up___e1 = True  
            elevator-closed___e0 = True          elevator-closed___e1 = True    elevator-at-floor___e0__f0 = True  
      elevator-at-floor___e0__f1 = False   elevator-at-floor___e0__f2 = False   elevator-at-floor___e0__f3 = False 
      elevator-at-floor___e0__f4 = False   elevator-at-floor___e1__f0 = True    elevator-at-floor___e1__f1 = False 
      elevator-at-floor___e1__f2 = False   elevator-at-floor___e1__f3 = False   elevator-at-floor___e1__f4 = False 
    
------------------------------------------------------------------------------------------------------------------------
step   = 0
action = 
     open-door___e0 = Tr

{'mean': np.float64(-201.0),
 'median': np.float64(-201.0),
 'min': np.float64(-201.0),
 'max': np.float64(-201.0),
 'std': np.float64(0.0)}