In [14]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from gymcts.gymcts_agent import GymctsAgent
from gymcts.gymcts_deepcopy_wrapper import DeepCopyMCTSGymEnvWrapper

In [17]:

class TwentyFortyEightEnv(gym.Env):
    """Gymnasium environment for the sliding-tile game 2048.

    The board is a ``board_size x board_size`` ``np.int32`` array in which 0
    marks an empty cell. Actions are ``0 = up, 1 = down, 2 = left, 3 = right``.

    Args:
        render_mode: Stored on the instance; ``render()`` always prints to stdout.
        log_reward: If true, a merge producing tile ``v`` is worth ``log2(v)``;
            otherwise it is worth ``v``.
        negative_reward: Reward returned by ``step()`` when the chosen action
            leaves the board unchanged.
        stop_stationary: Stored on the instance. ``step()`` does not currently
            consult it (a no-op move never ends the episode).
    """

    metadata = {"render_modes": ["human"], "render_fps": 4}

    def __init__(self, render_mode=None, log_reward=True, negative_reward=-10, stop_stationary=True):
        super().__init__()
        self.board_size = 4
        self.log_reward = log_reward
        self.negative_reward = negative_reward
        self.stop_stationary = stop_stationary
        self.render_mode = render_mode

        # Actions: 0 = up, 1 = down, 2 = left, 3 = right
        self.action_space = spaces.Discrete(4)
        # NOTE(review): the declared shape has a leading axis of size 1, but
        # _get_obs() returns a (board_size, board_size) array. Left unchanged
        # because callers may depend on either one - confirm which is intended.
        self.observation_space = spaces.Box(
            low=0,
            high=2**16,
            shape=(1, self.board_size, self.board_size),
            dtype=np.int32,
        )

        # The environment is usable straight after construction.
        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new game with two random tiles.

        Args:
            seed: Optional seed. Forwarded to gymnasium and, when given, also
                used to seed the module-level ``random`` generator.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        # Tiles are spawned with the module-level `random` generator (see
        # _add_tile), which gymnasium's seeding does not cover. Seed it as well
        # so that reset(seed=...) actually makes the spawned tiles reproducible.
        if seed is not None:
            random.seed(seed)

        self.board = np.zeros((self.board_size, self.board_size), dtype=np.int32)
        self.score = 0

        self._add_tile()
        self._add_tile()

        return self._get_obs(), {}

    def get_mask(self):
        """Report which actions would be no-ops on the current board.

        Returns:
            A list ``[up, down, left, right]`` of booleans; ``True`` means the
            action would leave the board unchanged. The board and score are not
            modified.
        """
        return [
            self._move_up(test_is_masked=True),
            self._move_down(test_is_masked=True),
            self._move_left(test_is_masked=True),
            self._move_right(test_is_masked=True),
        ]

    def step(self, action):
        """Apply one action.

        A move that changes the board earns the merge reward and spawns a new
        tile. A move that leaves the board unchanged earns ``negative_reward``
        and spawns nothing.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``terminated`` is true once no move can change the board,
            ``truncated`` is always ``False`` and ``info`` carries the running
            ``"score"``.
        """
        # A real copy: slicing (`board[:][:]`) would only produce a view.
        board_before = self.board.copy()

        # Any other action value falls through, leaves the board unchanged and
        # is penalised below like an ineffective move.
        if action == 0:
            reward = self._move_up()
        elif action == 1:
            reward = self._move_down()
        elif action == 2:
            reward = self._move_left()
        elif action == 3:
            reward = self._move_right()

        if np.array_equal(self.board, board_before):
            # Invalid move (board unchanged).
            reward = self.negative_reward
        else:
            # Only add a tile after a valid move.
            self._add_tile()
        done = not self._moves_available()

        return self._get_obs(), reward, done, False, {"score": self.score}

    # -------- Rendering -------- #

    def render(self):
        """Print the score and an ASCII drawing of the board to stdout."""
        # Each cell is 5 characters wide plus one separator, plus a closing bar.
        rule = "-" * (6 * self.board_size + 1)
        print("\nScore:", self.score)
        print(rule)
        for row in self.board:
            print("|" + "|".join(f"{num:^5}" if num != 0 else "     " for num in row) + "|")
            print(rule)

    # -------- Helper Methods -------- #

    def _get_obs(self):
        """Return a copy of the board so callers cannot alias internal state."""
        return self.board.copy()

    def _add_tile(self):
        """Place a 2 (or, with probability 0.1, a 4) on a random empty cell.

        Does nothing when the board is full.
        """
        empty = list(zip(*np.where(self.board == 0)))
        if not empty:
            return
        i, j = random.choice(empty)
        self.board[i, j] = 4 if random.random() < 0.1 else 2

    def _moves_available(self):
        """Return True if a cell is empty or two adjacent cells hold equal tiles."""
        board = self.board
        if np.any(board == 0):
            return True
        horizontal_pair = np.any(board[:, :-1] == board[:, 1:])
        vertical_pair = np.any(board[:-1, :] == board[1:, :])
        return bool(horizontal_pair or vertical_pair)

    # -------- Movement Logic -------- #

    def _compress(self, row):
        """Return a new row with the non-zero tiles packed to the left."""
        tiles = row[row != 0]
        padding = np.zeros(len(row) - len(tiles), dtype=row.dtype)
        return np.concatenate([tiles, padding])

    def _merge(self, row):
        """Merge equal neighbours left-to-right, in place, on a compressed row.

        Returns:
            ``(row, score_gain)``. A merged pair leaves a zero behind, so the
            caller compresses again; that zero also prevents a freshly merged
            tile from merging a second time in the same move.
        """
        score_gain = 0
        for i in range(len(row) - 1):
            if row[i] != 0 and row[i] == row[i + 1]:
                row[i] *= 2
                row[i + 1] = 0
                # Keep the running gain a plain Python number.
                score_gain += float(np.log2(row[i])) if self.log_reward else int(row[i])
        return row, score_gain

    def _slide_left(self, board):
        """Slide and merge every row of ``board`` to the left.

        ``board`` is not modified. Returns ``(new_board, total_gain)``.
        """
        shifted = np.zeros(board.shape, dtype=board.dtype)
        total_gain = 0
        for i in range(board.shape[0]):
            merged, gain = self._merge(self._compress(board[i]))
            shifted[i] = self._compress(merged)
            total_gain += gain
        return shifted, total_gain

    def _apply_move(self, to_left, from_left, test_is_masked):
        """Shared implementation of the four moves.

        ``to_left`` re-orients the board so the move becomes a slide to the
        left and ``from_left`` undoes that orientation.

        Returns:
            With ``test_is_masked`` true: whether the move would leave the board
            unchanged (nothing is modified). Otherwise the board and score are
            updated and the gained reward is returned as a float.
        """
        oriented = to_left(self.board)
        shifted, total_gain = self._slide_left(oriented)
        if test_is_masked:
            return np.array_equal(oriented, shifted)
        # Store an owning, C-contiguous array rather than a flipped/transposed view.
        self.board = np.ascontiguousarray(from_left(shifted))
        self.score += total_gain
        return float(total_gain)

    def _move_left(self, test_is_masked=False):
        """Slide left; see ``_apply_move`` for the return value."""
        return self._apply_move(lambda b: b, lambda b: b, test_is_masked)

    def _move_right(self, test_is_masked=False):
        """Slide right; see ``_apply_move`` for the return value."""
        return self._apply_move(np.fliplr, np.fliplr, test_is_masked)

    def _move_up(self, test_is_masked=False):
        """Slide up; see ``_apply_move`` for the return value."""
        return self._apply_move(np.transpose, np.transpose, test_is_masked)

    def _move_down(self, test_is_masked=False):
        """Slide down; see ``_apply_move`` for the return value."""
        return self._apply_move(
            lambda b: np.fliplr(b.T),
            lambda b: np.fliplr(b).T,
            test_is_masked,
        )


In [None]:
import gymnasium as gym



# Build the 2048 environment and wrap it with gymcts' deep-copy wrapper.
env = DeepCopyMCTSGymEnvWrapper(TwentyFortyEightEnv())
agent = GymctsAgent(
    env=env,
    number_of_simulations_per_step=100,
    clear_mcts_tree_after_step=True,
)

# Play one episode: ask the agent for an action, apply it, and repeat until the
# environment reports termination or truncation.
step = 0
terminal = False
while not terminal:
    action, _ = agent.perform_mcts_step()
    obs, rew, term, trun, info = env.step(action)
    step += 1
    terminal = term or trun

    # Log every step to the console.
    print(f"step {step}: Reward {rew} state is \n {obs}")


step 1: Reward 2.0 state is 
 [[0 0 0 0]
 [0 0 2 0]
 [0 0 0 0]
 [0 0 0 4]]


KeyboardInterrupt: 