In [1]:
import torch
import numpy as np
import json

In [2]:
# Load the environment description from JSON. JSON text is UTF-8, so the
# encoding is passed explicitly instead of relying on the platform's default
# locale encoding.
with open('./envs/5x5.json', 'r', encoding='utf-8') as f:
    env = json.load(f)

In [3]:
# Multiplier applied to the probability of repeating the previous action, so
# that walks tend to continue in a straight line.
repeat_bias_factor=2

# Container for generated walks; nothing is appended to it in the cells shown here.
walks = []

In [4]:
# Start an empty walk; each step appended later is [location, observation, action].
new_walk = []

In [5]:
# Alias, not a copy: appending to `walk` also mutates `new_walk`.
walk = new_walk

In [6]:
# Number of steps recorded so far; 0 means the next step is the first one.
len(walk)

0

In [7]:
# The first location is chosen at random: np.random.randint(n) draws an integer
# from [0, n), which is used below as an index into env['locations'].
new_location_pos = np.random.randint(env['n_locations'])
new_location_pos

23

## Grid

```
(0.1,0.1) - (0.3,0.1) - (0.5,0.1) - (0.7,0.1) - (0.9,0.1)
    |           |           |           |           |
(0.1,0.3) - (0.3,0.3) - (0.5,0.3) - (0.7,0.3) - (0.9,0.3)
    |           |           |           |           |
(0.1,0.5) - (0.3,0.5) - (0.5,0.5) - (0.7,0.5) - (0.9,0.5)
    |           |           |           |           |
(0.1,0.7) - (0.3,0.7) - (0.5,0.7) - (0.7,0.7) - (0.9,0.7)
    |           |           |           |           |
(0.1,0.9) - (0.3,0.9) - (0.5,0.9) - (0.7,0.9) - (0.9,0.9)
```

## Action Types by ID

- 0: Stay in place / No movement
- 1: Move up (north); a move that is impossible from a location typically has probability 0 there
- 2: Move right (east)
- 3: Move down (south)
- 4: Move left (west)

In [8]:
# Fetch the location dictionary at the randomly drawn position.
new_location = env['locations'][new_location_pos]

# Print every field on its own line; the 'actions' list is expanded into one
# "id: probability" line per action instead of dumping the full dictionaries.
for field, value in new_location.items():
    if field == 'actions':
        print('actions:')
        for entry in value:
            print(f'\t{entry["id"]}: {entry["probability"]}')
        continue
    print(f'{field:<13}: {value}')

id           : 23
observation  : 24
x            : 0.7
y            : 0.9
in_locations : [18, 22, 23, 24]
in_degree    : 4
out_locations: [18, 22, 23, 24]
out_degree   : 4
actions:
	0: 0.25
	1: 0.25
	2: 0.25
	3: 0
	4: 0.25


In [9]:
# Find sensory observation for new state, and store it as one-hot vector.
new_observation = np.eye(env['n_observations'])[new_location['observation']]

# Create a new observation by converting the new observation to a torch tensor.
new_observation = torch.tensor(new_observation, dtype=torch.float).view((new_observation.shape[0]))
 
new_observation, torch.nonzero( new_observation ), new_observation.shape

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([[24]]),
 torch.Size([45]))

In [10]:
# Build policy from action probability of each action of provided location dictionary.
policy = np.array( [action['probability'] for action in new_location['actions']] )
policy

array([0.25, 0.25, 0.25, 0.  , 0.25])

In [11]:
# Indexing with an empty list selects no elements, so the result is an empty array.
policy[ [] ]

array([], dtype=float64)

In [12]:
# In-place scaling through an empty index touches no element, so policy is
# left unchanged. This is the no-op used for the first step of a walk.
policy[ [] ] *= repeat_bias_factor
policy

array([0.25, 0.25, 0.25, 0.  , 0.25])

In [13]:
# Favour repeating the previous action so the walk tends to go in straight
# lines. The bias applies only when there is a previous step and that step
# changed location (i.e. the previous action was an actual move).
if len(walk) != 0 and new_location['id'] != walk[-1][0]['id']:
    policy[walk[-1][2]] *= repeat_bias_factor

# Renormalise so the probabilities sum to 1 again; an all-zero policy is left
# untouched, which also avoids dividing by zero.
policy_total = sum(policy)
if policy_total > 0:
    policy = policy / policy_total
policy

array([0.25, 0.25, 0.25, 0.  , 0.25])

In [14]:
# Draw a uniform random number in [0, 1) and compare it with the cumulative
# policy: the first True marks the action picked by inverse-CDF sampling.
_some = np.random.rand()

np.cumsum(policy), _some, np.cumsum(policy) > _some

(array([0.25, 0.5 , 0.75, 0.75, 1.  ]),
 0.42612732283085897,
 array([False,  True,  True,  True,  True]))

In [15]:
# Indices of all actions whose cumulative probability exceeds the draw; the
# first of them is the sampled action.
np.flatnonzero( np.cumsum(policy) > _some )

array([1, 2, 3, 4])

In [16]:
# Select action in new state
new_action = int(np.flatnonzero(np.cumsum(policy)>_some)[0])
new_action

1

In [17]:
# Append location, observation, and action to the walk.
# new_location is the actual location from the env file.
# new_observation is the one-hot-vector for the given observation.
# new_action is the index of the first action whose cumulative probability
# exceeds a uniform random number (inverse-CDF sampling of the policy).
walk.append([new_location, new_observation, new_action])

# Next step in the walk (next walk actually)

In [18]:
# Action stored in the most recent step (index 2 of [location, observation, action]).
walk[-1][2]

1

In [19]:
# Unpack the most recent step: index 0 is its location dictionary and index 2
# is the action chosen there.
prev_location = walk[-1][0]
prev_action_chosen = walk[-1][2]

In [20]:
# Look up the chosen action on the previous location; its 'transition' list is
# what the following cells sample the next location from.
prev_location['actions'][prev_action_chosen]

{'id': 1,
 'transition': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 'probability': 0.25}

In [21]:
# Same inverse-CDF comparison as for the action, now over the chosen action's
# transition list: entries are True from the sampled location index onwards.
_some = np.random.rand()

np.cumsum(prev_location['actions'][prev_action_chosen]['transition']) > _some

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True,  True])

In [22]:
# Indices whose cumulative transition probability exceeds the draw; the first
# one is the location the walk moves to.
np.flatnonzero(
    np.cumsum(
        prev_location['actions'][prev_action_chosen]['transition'],
    )>_some,
)

array([18, 19, 20, 21, 22, 23, 24])

In [23]:
# Sample the next location by inverse-CDF sampling over the transition list of
# the action chosen in the previous step: take the first index whose
# cumulative probability exceeds a fresh uniform random draw.
# TODO: for a one-hot transition like the one shown above this is an awkward
# way of writing np.flatnonzero(transition)[0].
transition = prev_location['actions'][prev_action_chosen]['transition']
above_draw = np.flatnonzero(np.cumsum(transition) > np.random.rand())
chosen_location_id = int(above_draw[0])
print(f'location chosen: {chosen_location_id}')

# Replace the current location with the sampled one and print its fields,
# expanding 'actions' into one "id: probability" line per action.
new_location = env['locations'][chosen_location_id]
for field, value in new_location.items():
    if field == 'actions':
        print('actions:')
        for entry in value:
            print(f'\t{entry["id"]}: {entry["probability"]}')
    else:
        print(f'{field:<13}: {value}')

location chosen: 18
id           : 18
observation  : 13
x            : 0.7
y            : 0.7
in_locations : [13, 17, 18, 19, 23]
in_degree    : 5
out_locations: [13, 17, 18, 19, 23]
out_degree   : 5
actions:
	0: 0.2
	1: 0.2
	2: 0.2
	3: 0.2
	4: 0.2


In [24]:
def get_observation(env, new_location):
    """Return the one-hot encoding of a location's sensory observation.

    Args:
        env: Environment dictionary; ``env['n_observations']`` is the length
            of the returned vector.
        new_location: Location dictionary; ``new_location['observation']`` is
            the index that is set to 1.

    Returns:
        torch.Tensor: 1-D float32 tensor of length ``n_observations`` that is
        0 everywhere except for a single 1 at the observation index.

    Raises:
        IndexError: If the observation index is outside the vector.
    """
    # Fill a zero vector directly instead of building an n x n identity matrix
    # only to read a single row from it.
    new_observation = torch.zeros(env['n_observations'], dtype=torch.float)
    new_observation[new_location['observation']] = 1.0
    return new_observation

In [25]:
# One-hot observation for the newly sampled location, shown together with the
# index of its non-zero entry and its shape.
new_observation = get_observation(env, new_location)
new_observation, torch.nonzero( new_observation ), new_observation.shape

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([[13]]),
 torch.Size([45]))

In [26]:
def get_action(env, new_location, walk, repeat_bias_factor=2):
    """Sample the next action for ``new_location`` from its biased policy.

    The policy starts as the per-action probabilities stored on the location.
    If the walk already has a step and that step was taken at a different
    location (i.e. the previous action actually moved), the probability of
    repeating that action is multiplied by ``repeat_bias_factor``. The policy
    is then renormalised and an action is drawn by inverse-CDF sampling.

    Args:
        env: Environment dictionary. Not used here; kept so existing callers
            keep working.
        new_location: Location dictionary with an ``'id'`` and an ``'actions'``
            list whose entries carry a ``'probability'``.
        walk: Steps taken so far, each ``[location, observation, action]``.
            May be empty for the first step.
        repeat_bias_factor: Multiplier for the previous action's probability.

    Returns:
        int: Index of the chosen action in ``new_location['actions']``.

    Raises:
        IndexError: If every action has probability 0, so nothing can be drawn.
    """
    # Force a float array: with all-integer probabilities an int array would
    # silently truncate a fractional repeat_bias_factor on in-place scaling.
    policy = np.array(
        [action['probability'] for action in new_location['actions']],
        dtype=float,
    )
    print(f'init policy: {policy}')

    if len(walk) == 0:
        # First step: there is no previous step to bias towards or to report.
        print(f"{new_location['id']=} (first step: no previous step)")
    else:
        prev_location, prev_action = walk[-1][0], walk[-1][2]
        # Bias towards repeating the previous action only if it was a move,
        # i.e. the location changed between the previous step and this one.
        if new_location['id'] != prev_location['id']:
            policy[prev_action] *= repeat_bias_factor
        print(f"{new_location['id']=} {walk[-1][0]['id']=}")
        print(f"{walk[-1][2]=}")
    print(f'policy after bias: {policy}')

    # Renormalise so the probabilities sum to 1 again; an all-zero policy is
    # left untouched, which also avoids dividing by zero.
    total = policy.sum()
    if total > 0:
        policy = policy / total
    print(f'normalized policy after bias: {policy}')

    # Inverse-CDF sampling: pick the first action whose cumulative probability
    # exceeds a uniform draw from [0, 1).
    _some = np.random.rand()
    new_action = int(np.flatnonzero(np.cumsum(policy) > _some)[0])
    print(f'rand number chosen: {_some}')
    return new_action

In [27]:
# Choose the action for the new location; the last step already stored in
# `walk` determines whether the repeat bias applies.
new_action = get_action(env, new_location, walk)
new_action

init policy: [0.2 0.2 0.2 0.2 0.2]
new_location['id']=18 walk[-1][0]['id']=23
walk[-1][2]=1
policy after bias: [0.2 0.4 0.2 0.2 0.2]
normalized policy after bias: [0.16666667 0.33333333 0.16666667 0.16666667 0.16666667]
rand number chosen: 0.02609080446562695


0

In [28]:
# Append location, observation, and action to the walk.
# new_location is the actual location from the env file.
# new_observation is the one-hot-vector for the given observation.
# new_action is the index of the first action whose cumulative probability
# exceeds a uniform random number (inverse-CDF sampling of the policy).
walk.append([new_location, new_observation, new_action])

In [29]:
# Number of steps recorded so far.
len(walk)

2

In [30]:
# Display the whole walk. Each step embeds the full location dictionary
# (including every action's transition list), so the output is long.
walk

[[{'id': 23,
   'observation': 24,
   'x': 0.7,
   'y': 0.9,
   'in_locations': [18, 22, 23, 24],
   'in_degree': 4,
   'out_locations': [18, 22, 23, 24],
   'out_degree': 4,
   'actions': [{'id': 0,
     'transition': [0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      1,
      0],
     'probability': 0.25},
    {'id': 1,
     'transition': [0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      1,
      0,
      0,
      0,
      0,
      0,
      0],
     'probability': 0.25},
    {'id': 2,
     'transition': [0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      1]

# Non-shiny walks

A walk is composed of

1. new_location is the actual location from the env file.
2. new_observation is the one-hot-vector for the observation in the chosen location.
3. new_action is the index of the first action whose cumulative probability exceeds a uniform random number.

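The way the action is chosen mimics inverse-CDF sampling - the `cumsum()` essentially builds a CDF from the action probabilities, and the first entry that exceeds a uniform random number is selected. The next location is drawn the same way from the chosen action's transition probabilities.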

The first location is chosen at random.
All other locations are chosen based on the action from the previous step/walk - the authors call each step a walk.

Observations are stored in the walk as one-hot-encoded vectors.