In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [169]:
# https://gymnasium.farama.org/introduction/basic_usage/
# https://gymnasium.farama.org/environments/box2d/

import gymnasium as gym
import keyboard
import numpy as np
import torch
from gymnasium.wrappers import FlattenObservation

import gymnasium_env
from utils import *
from behavior_cloing_model import *

#### Generate data for training BC imitation learning model

 - Data generation functions are defined in `utils.py`.

In [131]:
# Create the GridWorld environment with an on-screen ("human") renderer,
# reset it, and display the first observation.
env = gym.make("gymnasium_env/GridWorld-v0", render_mode="human")
observation, info = env.reset()
observation

{'agent': array([2, 2]), 'target': array([1, 4])}

In [185]:
# Hand-picked state used by the walkthrough cells below.
# Alternative state: agent at [2, 1], target at [1, 0].
observation = {
    "agent": np.array([0, 3]),
    "target": np.array([4, 0]),
}


In [186]:
%%time
# generate actions
actions=compute_min_steps_to_win(observation, shuffle=False, print_values=True)
actions

Agent: [0 3] Target: [4 0]
Difference: [ 4 -3]
XY Actions: [3, 1]
CPU times: total: 0 ns
Wall time: 0 ns


[3, 3, 3, 3, 1, 1, 1]

In [148]:
%%time
# generate positions
positions=generate_agent_target_positions(observation, actions)
positions

CPU times: total: 0 ns
Wall time: 0 ns


{0: {'agent': array([2, 1]), 'target': array([1, 0]), 'action': -1},
 1: {'agent': array([1, 1]), 'target': array([1, 0]), 'action': 2},
 2: {'agent': array([1, 0]), 'target': array([1, 0]), 'action': 1}}

In [159]:
%%time
# reconstruct grid per position
# ts=2
# agent=positions[ts]["agent"]
# target=positions[ts]["target"]
# reconstruct_grid(agent, target, observation=None, size=5)
grid_data=generate_grid_action_dict(positions, size=5)
print(grid_data)

[{'grid': [0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'action': -1}, {'grid': [0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'action': 2}, {'grid': [0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'action': 1}]
CPU times: total: 0 ns
Wall time: 0 ns


In [None]:
# Rewards are not necessary for the BC model. Is this because it assumes the data is consistently good,
# i.e. coming from an expert? (However, suboptimal training data does not resemble expert behavior.)

In [164]:
# Build the training set: N=100 iterations with action shuffling disabled.
training_data = generate_training_data(
    env,
    N=100,
    shuffle=False,
    print_values=False,
    size=5,
)

100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


In [254]:
# Same generation call as for the unshuffled set, but with shuffle=True.
training_data_shuffled = generate_training_data(
    env,
    N=100,
    shuffle=True,
    print_values=False,
    size=5,
)

100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


In [190]:
# training_data[0]

#### Train basic BC model

#### TODO: close environments after generating data, training, etc.

In [282]:
# Wrap the shuffled training set for mini-batch training.
# (Pass `training_data` instead to train on the unshuffled set.)
dataset = GridDataset(training_data_shuffled)
dataloader = DataLoader(dataset, shuffle=True, batch_size=16)

In [283]:
# for grids, actions in dataloader:
#     break

In [284]:
# Set up the model, optimizer and loss for training.

# Seed torch's global RNG so that everything drawing from it after this point
# (presumably the model's weight initialisation and the DataLoader shuffling)
# repeats across runs instead of changing on every execution.
SEED = 0
torch.manual_seed(SEED)

# Grids are generated with size=5, and each sample is the flattened
# GRID_SIZE x GRID_SIZE grid, i.e. 25 inputs.
# Data-driven alternative: len(training_data[0][0]["grid"])
GRID_SIZE = 5
input_size = GRID_SIZE * GRID_SIZE
hidden_size = 128
num_actions = 4  # action ids 0-3
model = BehaviorCloningModel(input_size, hidden_size, num_actions)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [285]:
%%time
# train
train_bc_model(model, dataloader, optimizer, criterion, epochs=1000)

Epoch 1/1000, Loss: 29.1350
Epoch 2/1000, Loss: 29.0240
Epoch 3/1000, Loss: 28.9222
Epoch 4/1000, Loss: 28.8285
Epoch 5/1000, Loss: 28.7247
Epoch 6/1000, Loss: 28.6323
Epoch 7/1000, Loss: 28.5434
Epoch 8/1000, Loss: 28.4542
Epoch 9/1000, Loss: 28.3598
Epoch 10/1000, Loss: 28.2745
Epoch 11/1000, Loss: 28.1906
Epoch 12/1000, Loss: 28.1085
Epoch 13/1000, Loss: 28.0179
Epoch 14/1000, Loss: 27.9410
Epoch 15/1000, Loss: 27.8473
Epoch 16/1000, Loss: 27.7666
Epoch 17/1000, Loss: 27.6853
Epoch 18/1000, Loss: 27.5930
Epoch 19/1000, Loss: 27.5173
Epoch 20/1000, Loss: 27.4364
Epoch 21/1000, Loss: 27.3499
Epoch 22/1000, Loss: 27.2679
Epoch 23/1000, Loss: 27.1790
Epoch 24/1000, Loss: 27.1078
Epoch 25/1000, Loss: 27.0178
Epoch 26/1000, Loss: 26.9377
Epoch 27/1000, Loss: 26.8530
Epoch 28/1000, Loss: 26.7661
Epoch 29/1000, Loss: 26.6840
Epoch 30/1000, Loss: 26.6043
Epoch 31/1000, Loss: 26.5216
Epoch 32/1000, Loss: 26.4373
Epoch 33/1000, Loss: 26.3591
Epoch 34/1000, Loss: 26.2733
Epoch 35/1000, Loss: 26

In [None]:
# Evaluate the model in the environment.
# Close any environment left over from earlier cells before rebinding `env`;
# otherwise its "human" render window is never released.
if "env" in globals():
    env.close()
env = gym.make("gymnasium_env/GridWorld-v0", render_mode="human", size=5)
try:
    true_actions, pred_actions, observations = evaluate_bc_model(
        model, env, max_steps=10, print_steps=True
    )
finally:
    # Release the evaluation environment even if evaluation raises or is interrupted.
    env.close()


Initial state: {'agent': array([0, 3]), 'target': array([4, 2])}
1 {'agent': array([0, 4]), 'target': array([4, 2])} 0
2 {'agent': array([0, 4]), 'target': array([4, 2])} 0
3 {'agent': array([0, 4]), 'target': array([4, 2])} 0
4 {'agent': array([0, 4]), 'target': array([4, 2])} 0
5 {'agent': array([0, 4]), 'target': array([4, 2])} 0
6 {'agent': array([0, 4]), 'target': array([4, 2])} 0
7 {'agent': array([0, 4]), 'target': array([4, 2])} 0
8 {'agent': array([0, 4]), 'target': array([4, 2])} 0
9 {'agent': array([0, 4]), 'target': array([4, 2])} 0
10 {'agent': array([0, 4]), 'target': array([4, 2])} 0


: 

In [230]:
# Hand-picked state for the single-state prediction check below.
observation = {
    "agent": np.array([2, 1]),
    "target": np.array([4, 3]),
}
observation

{'agent': array([2, 1]), 'target': array([4, 3])}

In [226]:
# Action list computed for the hand-picked observation (no shuffling, no debug printing).
compute_min_steps_to_win(observation, print_values=False, shuffle=False)

[3, 3, 0, 0]

In [229]:
# Action id -> x/y step. Defined here because this cell reads ACTIONS, and the only
# other definition visible in the notebook sits in a later cell; running top to bottom
# would otherwise raise NameError (unless a star import happens to provide the name).
ACTIONS = {
    3: np.array([1, 0]),   # right
    1: np.array([0, -1]),  # up
    2: np.array([-1, 0]),  # left
    0: np.array([0, 1]),   # down
}

# Predict the action for one hand-picked observation.
grid = reconstruct_grid(observation=observation, size=5)
# unsqueeze(0) adds a leading dimension so the single grid is passed as a batch of one.
grid_tensor = torch.tensor(grid, dtype=torch.float32).unsqueeze(0)
with torch.no_grad():  # inference only, no gradients needed
    predicted_action = model(grid_tensor).argmax(dim=1).item()
print(predicted_action)
print(ACTIONS[predicted_action])

1
[ 0 -1]


In [201]:
# Hand-picked state: agent at [0, 3], target at [4, 0].
observation = {
    "agent": np.array([0, 3]),
    "target": np.array([4, 0]),
}

{'agent': array([0, 3]), 'target': array([4, 0])}

In [228]:
# Lookup table from action id to the x/y step it represents.
ACTIONS = {
    3: np.array([1, 0]),   # right
    1: np.array([0, -1]),  # up
    2: np.array([-1, 0]),  # left
    0: np.array([0, 1]),   # down
}

[3, 3, 3, 3, 1, 1, 1]

#### Testing env

In [6]:
# Start a fresh environment for manual testing.
# Close any environment left over from earlier cells before rebinding `env`;
# otherwise its "human" render window is never released.
if "env" in globals():
    env.close()
env = gym.make("gymnasium_env/GridWorld-v0", render_mode="human")
observation, info = env.reset()

# Disabled manual keyboard-control loop, kept for reference.
# NOTE(review): the labels below call action 0 "Up" and action 1 "Down", while the ACTIONS
# table in this notebook maps 1 -> [0, -1] (up) and 0 -> [0, 1] (down) — confirm the
# mapping before re-enabling.
# NOTE(review): exit(0) inside a notebook likely shuts down the kernel, and the loop only
# checks `done`, never `truncated` — confirm both are intended.
# done = False
# print("Controls: W (up), S (down), A (left), D (right), Q (quit)")
# while not done:
#     env.render()

#     action = None
#     while action is None:
#         if keyboard.is_pressed("w"):
#             action = 0  # Up
#         elif keyboard.is_pressed("s"):
#             action = 1  # Down
#         elif keyboard.is_pressed("a"):
#             action = 2  # Left
#         elif keyboard.is_pressed("d"):
#             action = 3  # Right
#         elif keyboard.is_pressed("q"):
#             print("Exiting...")
#             env.close()
#             exit(0)

#     observation, reward, done, truncated, info = env.step(action)
#     print(f"Reward: {reward}")

# env.close()

#### Scratch code

In [15]:
# Take one randomly sampled action and unpack the five-element step result.
(observation, reward, terminated, truncated, info) = env.step(
    env.action_space.sample()
)

In [21]:
# Take another randomly sampled action and display the raw step tuple
# (observation, reward, terminated, truncated, info).
env.step(env.action_space.sample())

({'agent': array([0, 1]), 'target': array([2, 4])},
 0,
 False,
 False,
 {'distance': 5.0})

In [19]:
# example env wrapper
wrapped_env = FlattenObservation(env)
print(wrapped_env.reset())     # E.g.  [3 0 3 3], {}

(array([3, 1, 4, 4]), {'distance': 4.0})


In [25]:
# Reset the (unwrapped) environment and keep the new initial observation and info dict.
observation, info = env.reset()

In [26]:
# Display the observation returned by the reset above.
observation

{'agent': array([4, 0]), 'target': array([0, 2])}