In [53]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO

class FifteenPuzzleEnv(gym.Env):
    def __init__(self,env_config):
        self.grid_size = 4
        self.action_space = spaces.Discrete(4)  # 0: left, 1: up, 2: right, 3: down
        self.observation_space = spaces.Box(low=0, high=self.grid_size - 1, shape=(self.grid_size, self.grid_size), dtype=np.int32)
        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.state = np.arange(self.grid_size ** 2)
        self.np_random.shuffle(self.state)
        self.state = self.state.reshape((self.grid_size, self.grid_size))
        self.zero_pos = np.argwhere(self.state == 0)[0]
        assert self.observation_space.contains(self.state), "Invalid initial state!"
        print(self.state)
        return self.state

    def step(self, action):
        self.move(action)
        done = self.is_solved()
        reward = 1.0 if done else 0.0
        return self.state, reward, done, {}

    def move(self, action):
        new_zero_pos = np.array(self.zero_pos)
        if action == 0:  # left
            new_zero_pos[1] -= 1
        elif action == 1:  # up
            new_zero_pos[0] -= 1
        elif action == 2:  # right
            new_zero_pos[1] += 1
        elif action == 3:  # down
            new_zero_pos[0] += 1

        if (0 <= new_zero_pos[0] < self.grid_size) and (0 <= new_zero_pos[1] < self.grid_size):
            self.state[self.zero_pos[0], self.zero_pos[1]], self.state[new_zero_pos[0], new_zero_pos[1]] = (
                self.state[new_zero_pos[0], new_zero_pos[1]],
                self.state[self.zero_pos[0], self.zero_pos[1]],
            )
            self.zero_pos = new_zero_pos

    def is_solved(self):
        return np.array_equal(self.state, np.arange(self.grid_size ** 2).reshape((self.grid_size, self.grid_size)))

def train_agent(config, stop_criteria):
    ray.init(ignore_reinit_error=True, log_to_driver=False)
    trainer = PPO(config=config, env=FifteenPuzzleEnv)
    tune.run(trainer, stop=stop_criteria)
    ray.shutdown()



In [54]:

config = {
        "framework": "torch",
        "num_workers": 1,
        "env_config": {},
        "model": {
            "fcnet_hiddens": [64, 64],
        },
        "num_sgd_iter": 10,
        "gamma": 0.99,
        "lambda": 0.95,
        "clip_param": 0.2,
        "lr": 5e-4,
        "sgd_minibatch_size": 128,
        "train_batch_size": 2048,
        "rollout_fragment_length": 256,
        "monitor": True,
    }
stop_criteria = {
        "episode_reward_mean": 1.0,  # Stop when the mean reward reaches 1.0 (solved)
        "time_total_s": 600,  # Maximum training time (in seconds)
    }
train_agent(config, stop_criteria)

2023-07-19 22:24:58,966	INFO worker.py:1474 -- Calling ray.init() again after it has already been called.


2023-07-19 22:25:01,086	ERROR actor_manager.py:507 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=113045, ip=192.168.0.121, actor_id=31285a927f780c7b10b58f9d01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f7cb7e2d5a0>)
  File "/home/spacefarers/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 609, in __init__
    self.env = env_creator(copy.deepcopy(self.env_context))
  File "/home/spacefarers/.local/lib/python3.10/site-packages/ray/rllib/env/utils.py", line 133, in _gym_env_creator
    env = env_descriptor(env_context)
  File "/tmp/ipykernel_88377/1908796418.py", line 15, in __init__
  File "/tmp/ipykernel_88377/1908796418.py", line 26, in reset
AssertionError: Invalid initial state!
2023-07-19 22:25:01,086	ERROR actor_manager.py:507 -- Ray error, taking actor 2 out of service. The actor died because of an error rais

AssertionError: Invalid initial state!

[2m[36m(RolloutWorker pid=113046)[0m     self.env = env_creator(copy.deepcopy(self.env_context))
[2m[36m(RolloutWorker pid=113046)[0m   File "/home/spacefarers/.local/lib/python3.10/site-packages/ray/rllib/env/utils.py", line 133, in _gym_env_creator
[2m[36m(RolloutWorker pid=113046)[0m     env = env_descriptor(env_context)
[2m[36m(RolloutWorker pid=113046)[0m   File "/tmp/ipykernel_88377/1908796418.py", line 26, in reset
[2m[36m(RolloutWorker pid=113046)[0m AssertionError: Invalid initial state!
[2m[36m(RolloutWorker pid=112868)[0m ValueError: The observation collected from env.reset() was not contained within your env's observation space. It is possible that there was a type mismatch, or that one of the sub-observations was out of bounds:[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=112868)[0m  [ 4 11  0  8]] (int32)[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=112868)[0m  (sub-)observation space: Box(0, 3, (4, 4), int32) 