<a href="https://colab.research.google.com/github/sarah-mokhtar/RL-Project-2048/blob/main/2048PPODiffTweaks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install stable-baselines3[extra] gymnasium numpy


Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m187.2/187.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces


class Game2048Env(gym.Env):


    metadata = {"render_modes": ["ansi"], "render_fps": 60}

    def __init__(self, render_mode=None, target_tile=2048):
        super().__init__()

        self.board_size = 4
        self.target_tile = target_tile
        self.observation_space = spaces.Box(
            low=0,
            high=15,
            shape=(self.board_size, self.board_size),
            dtype=np.int32,
        )
        self.action_space = spaces.Discrete(4)

        self.render_mode = render_mode
        self.board = np.zeros((self.board_size, self.board_size), dtype=np.int32)
        self.score = 0
        self.rng = np.random.default_rng()

    def _slide_and_merge_line(self, line):
        """
        line: 1D np.array of exponents (0 = empty)
        Returns: (new_line, reward_from_merges)
        """
        non_zero = line[line != 0].tolist()
        new = []
        reward = 0
        i = 0
        while i < len(non_zero):
            if i + 1 < len(non_zero) and non_zero[i] == non_zero[i + 1]:
                exp = non_zero[i] + 1
                new.append(exp)
                reward += 2 ** exp
                i += 2
            else:
                new.append(non_zero[i])
                i += 1
        # pad with zeros
        new += [0] * (len(line) - len(new))
        return np.array(new, dtype=np.int32), reward

    def _add_random_tile(self):
        empty_positions = list(zip(*np.where(self.board == 0)))
        if not empty_positions:
            return
        row, col = empty_positions[self.rng.integers(len(empty_positions))]

        if self.rng.random() < 0.9:
            self.board[row, col] = 1
        else:
            self.board[row, col] = 2

    def _can_move(self):
        # If any cell empty -> can move
        if np.any(self.board == 0):
            return True
        # If any horizontal merge possible
        for i in range(self.board_size):
            for j in range(self.board_size - 1):
                if self.board[i, j] == self.board[i, j + 1]:
                    return True
        # If any vertical merge possible
        for j in range(self.board_size):
            for i in range(self.board_size - 1):
                if self.board[i, j] == self.board[i + 1, j]:
                    return True
        return False

    def _get_max_tile(self):
        exp = int(self.board.max())
        return 0 if exp == 0 else 2 ** exp

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.rng = np.random.default_rng(seed)

        self.board[:] = 0
        self.score = 0

        self._add_random_tile()
        self._add_random_tile()

        observation = self.board.copy()
        info = {"score": self.score, "max_tile": self._get_max_tile()}
        return observation, info

    def step(self, action):
        assert self.action_space.contains(action), "Invalid action"

        old_board = self.board.copy()
        reward = 0


        if action == 0:  # up
            for col in range(self.board_size):
                line = self.board[:, col]
                new_line, r = self._slide_and_merge_line(line)
                self.board[:, col] = new_line
                reward += r
        elif action == 1:  # down
            for col in range(self.board_size):
                line = self.board[:, col][::-1]
                new_line, r = self._slide_and_merge_line(line)
                self.board[:, col] = new_line[::-1]
                reward += r
        elif action == 2:  # left
            for row in range(self.board_size):
                line = self.board[row, :]
                new_line, r = self._slide_and_merge_line(line)
                self.board[row, :] = new_line
                reward += r
        elif action == 3:  # right
            for row in range(self.board_size):
                line = self.board[row, :][::-1]
                new_line, r = self._slide_and_merge_line(line)
                self.board[row, :] = new_line[::-1]
                reward += r

        moved = not np.array_equal(old_board, self.board)

        if not moved:
            reward -= 1.0
        else:
            self._add_random_tile()

        self.score += reward

        max_tile = self._get_max_tile()
        terminated = False
        if not self._can_move():
            terminated = True
        if max_tile >= self.target_tile:
            terminated = True

        truncated = False

        observation = self.board.copy()
        info = {"score": self.score, "max_tile": max_tile}

        return observation, reward, terminated, truncated, info

    def render(self):
        if self.render_mode == "ansi":
            return self._board_to_string()
        else:
            print(self._board_to_string())

    def _board_to_string(self):
        display = []
        for row in self.board:
            display_row = []
            for exp in row:
                if exp == 0:
                    display_row.append(".")
                else:
                    display_row.append(str(2 ** int(exp)))
            display.append("\t".join(display_row))
        return "\n".join(display)


In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv

# We already defined Game2048Env above


def make_env():
    def _init():
        env = Game2048Env()
        return env
    return _init


# Create one env to check API
env = Game2048Env()
check_env(env, warn=True)

# Vectorized env for PPO
vec_env = DummyVecEnv([make_env()])

model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=256,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
)

# üîÅ Training ‚Äì you can increase timesteps later
model.learn(total_timesteps=2000000)

model.save("ppo_2048")
print("Model saved as ppo_2048.zip")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    total_timesteps      | 1433600      |
| train/                  |              |
|    approx_kl            | 0.0018848357 |
|    clip_fraction        | 0.0085       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.546       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 9.44e+03     |
|    n_updates            | 6990         |
|    policy_gradient_loss | -0.00285     |
|    value_loss           | 2.21e+04     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 584          |
|    iterations           | 701          |
|    time_elapsed         | 2457         |
|    total_timesteps      | 1435648      |
| train/                  |              |
|    approx_kl            | 0.0014606611 |
|    clip_fraction        | 0.00

In [None]:
import time
from stable_baselines3 import PPO

# Load model
model = PPO.load("ppo_2048")

env = Game2048Env(render_mode="ansi")

obs, info = env.reset()
done = False
step = 0

print("Initial board:")
print(env._board_to_string())

while not done:
    # stochastic actions so you can see exploration / variability
    action, _ = model.predict(obs, deterministic=False)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    print(f"\nStep {step}, action={action}, reward={reward}, score={info['score']}")
    print(env._board_to_string())
    time.sleep(0.1)
    step += 1

print("\nEpisode finished.")
print(f"Final score: {info['score']}, max tile: {info['max_tile']}")


Initial board:
.	.	.	.
2	.	.	2
.	.	.	.
.	.	.	.

Step 0, action=0, reward=0, score=0
2	2	.	2
.	.	.	.
.	.	.	.
.	.	.	.

Step 1, action=3, reward=4, score=4
.	.	2	4
.	.	.	.
.	.	2	.
.	.	.	.

Step 2, action=1, reward=4, score=8
.	.	.	.
.	.	.	.
.	2	.	.
.	.	4	4

Step 3, action=0, reward=0, score=8
.	2	4	4
.	.	.	.
.	2	.	.
.	.	.	.

Step 4, action=3, reward=8, score=16
.	.	2	8
.	.	.	.
2	.	.	2
.	.	.	.

Step 5, action=3, reward=4, score=20
.	.	2	8
2	.	.	.
.	.	.	4
.	.	.	.

Step 6, action=1, reward=0, score=20
4	.	.	.
.	.	.	.
.	.	.	8
2	.	2	4

Step 7, action=0, reward=0, score=20
4	.	2	8
2	.	.	4
.	.	.	.
2	.	.	.

Step 8, action=1, reward=4, score=24
.	.	.	.
.	.	.	2
4	.	.	8
4	.	2	4

Step 9, action=2, reward=0, score=24
.	.	.	.
2	2	.	.
4	8	.	.
4	2	4	.

Step 10, action=0, reward=8, score=32
2	2	4	.
8	8	.	.
.	2	.	2
.	.	.	.

Step 11, action=3, reward=24, score=56
.	.	4	4
.	.	.	16
.	.	.	4
2	.	.	.

Step 12, action=1, reward=0, score=56
.	.	.	.
.	.	.	4
.	.	2	16
2	.	4	4

Step 13, action=2, reward=8, score=64
2	

In [None]:
!git config --global user.email "sarah04@mit.edu"
!git config --global user.name "sarah-mokhtar"


In [None]:
!git clone https://github.com/sarah-mokhtar/RL-Project-2048.git



fatal: destination path 'RL-Project-2048' already exists and is not an empty directory.


In [None]:

!cp 2048\ RL.ipynb /content/RL-Project-2048/




cp: cannot stat '2048 RL.ipynb': No such file or directory


In [None]:
%cd RL-Project-2048

!git add .
!git commit -m "PPO"
!git push


[Errno 2] No such file or directory: 'RL-Project-2048'
/content/RL-Project-2048
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
# Clone a team/organization repo

from getpass import getpass

# Get your Personal Access Token
print("Get token from: https://github.com/settings/tokens")
print("Make sure 'repo' scope is checked!")
token = getpass('Paste your GitHub Personal Access Token: ')

# Team/Organization repo details
org_or_username = "team-name-or-org"  # The organization/team name
repo_name = "repo-name"  # The repository name

# Clone with authentication
!git clone https://{token}@github.com/{org_or_username}/{repo_name}.git

# Navigate into repo
%cd {repo_name}

# Configure your git identity (important for team repos)
!git config user.email "your-email@example.com"
!git config user.name "Your Name"

print("‚úÖ Team repo cloned successfully!")

In [None]:
!git clone https://github.com/Ali-Backour/2048_RL.git

Cloning into '2048_RL'...
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces


class Game2048EnvV2(gym.Env):
    """
    Version 2 of the 2048 environment.
    Same logic as V1, but you can modify reward shaping etc. later.

    Normalized observation: exponents / 15.0 ‚Üí float32 in [0,1]
    """

    metadata = {"render_modes": ["ansi"], "render_fps": 60}

    def __init__(self, render_mode=None, target_tile=2048):
        super().__init__()
        self.board_size = 4
        self.target_tile = target_tile

        self.observation_space = spaces.Box(
            low=0.0,
            high=1.0,
            shape=(4, 4),
            dtype=np.float32
        )
        self.action_space = spaces.Discrete(4)

        self.render_mode = render_mode
        self.board = np.zeros((4, 4), dtype=np.int32)
        self.score = 0.0
        self.prev_max_tile = 0
        self.rng = np.random.default_rng()

    # ---------- Helpers ----------
    def _get_obs(self):
        return self.board.astype(np.float32) / 15.0

    def _slide_and_merge_line(self, line):
        non_zero = line[line != 0].tolist()
        new = []
        reward = 0
        i = 0
        while i < len(non_zero):
            if i+1 < len(non_zero) and non_zero[i] == non_zero[i+1]:
                exp = non_zero[i] + 1
                new.append(exp)
                reward += 2**exp
                i += 2
            else:
                new.append(non_zero[i])
                i += 1
        new += [0]*(len(line)-len(new))
        return np.array(new, dtype=np.int32), reward

    def _add_random_tile(self):
        empty = list(zip(*np.where(self.board == 0)))
        if not empty:
            return
        r, c = empty[self.rng.integers(len(empty))]
        self.board[r,c] = 1 if self.rng.random()<0.9 else 2

    def _can_move(self):
        if np.any(self.board==0): return True
        for r in range(4):
            for c in range(3):
                if self.board[r,c]==self.board[r,c+1]:
                    return True
        for c in range(4):
            for r in range(3):
                if self.board[r,c]==self.board[r+1,c]:
                    return True
        return False

    def _get_max_tile(self):
        return 0 if self.board.max()==0 else 2**int(self.board.max())

    # ---------- Gym API ----------
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.rng = np.random.default_rng(seed)

        self.board[:] = 0
        self.score = 0.0
        self.prev_max_tile = 0

        self._add_random_tile()
        self._add_random_tile()

        return self._get_obs(), {"score":0, "max_tile":self._get_max_tile()}

    def step(self, action):
        old_board = self.board.copy()
        reward = 0

        if action == 0:
            for c in range(4):
                new_line, r = self._slide_and_merge_line(self.board[:,c])
                self.board[:,c] = new_line
                reward += r
        elif action == 1:
            for c in range(4):
                new_line, r = self._slide_and_merge_line(self.board[:,c][::-1])
                self.board[:,c] = new_line[::-1]
                reward += r
        elif action == 2:
            for r in range(4):
                new_line, rwd = self._slide_and_merge_line(self.board[r])
                self.board[r] = new_line
                reward += rwd
        elif action == 3:
            for r in range(4):
                new_line, rwd = self._slide_and_merge_line(self.board[r][::-1])
                self.board[r] = new_line[::-1]
                reward += rwd

        moved = not np.array_equal(old_board, self.board)

        if not moved:
            reward -= 2.0
        else:
            self._add_random_tile()

        max_tile = self._get_max_tile()

        # reward shaping
        if max_tile > self.prev_max_tile:
            reward += 0.5
        self.prev_max_tile = max_tile

        reward += 0.01 * np.sum(self.board==0)
        self.score += reward

        terminated = (not self._can_move()) or (max_tile >= self.target_tile)
        truncated = False

        return self._get_obs(), reward, terminated, truncated, {
            "score": self.score,
            "max_tile": max_tile
        }

    def _board_to_string(self):
        rows = []
        for row in self.board:
            r = []
            for exp in row:
                r.append("." if exp==0 else str(2**exp))
            rows.append("\t".join(r))
        return "\n".join(rows)


In [None]:
from stable_baselines3 import PPO

env_v2 = Game2048EnvV2()

policy_kwargs_v2 = dict(
    net_arch=[dict(pi=[256, 256, 256],
                   vf=[256, 256, 256])]
)

model_v2 = PPO(
    "MlpPolicy",
    env_v2,
    policy_kwargs=policy_kwargs_v2,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=512,
    n_epochs=10,
    gamma=0.99,
    clip_range=0.15,
    ent_coef=0.05,      # üîº higher entropy ‚Üí more stochastic policy
    target_kl=0.02,
    verbose=1,
)

model_v2.learn(total_timesteps=300_000)

model_v2.save("ppo_2048_v2_simple")
print("Saved model ppo_2048_v2_simple.zip")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 149      |
|    ep_rew_mean     | 1.16e+03 |
| time/              |          |
|    fps             | 608      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 147           |
|    ep_rew_mean          | 1.13e+03      |
| time/                   |               |
|    fps                  | 592           |
|    iterations           | 2             |
|    time_elapsed         | 6             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00044112414 |
|    clip_fraction        | 0             |
|    clip_range           | 0.15          |
|    entropy_loss         | -1.39         |
|    explained_variance   | -5.89e-05     |


In [None]:
import time
from stable_baselines3 import PPO

model_v2 = PPO.load("ppo_2048_v2_simple")

env_eval = Game2048EnvV2(render_mode="ansi")
obs, info = env_eval.reset()
done = False
step = 0

print("Initial board:")
print(env_eval._board_to_string())

while not done:
    # üîΩ IMPORTANT: try deterministic=False to see variety
    action, _ = model_v2.predict(obs, deterministic=False)
    obs, reward, terminated, truncated, info = env_eval.step(int(action))
    done = terminated or truncated

    print(f"\nStep {step}, action={int(action)}, reward={reward:.2f}, score={info['score']:.2f}")
    print(env_eval._board_to_string())
    time.sleep(0.1)
    step += 1

print("\nEpisode finished.")
print(f"Final score: {info['score']}, max tile: {info['max_tile']}")


Initial board:
.	.	.	4
.	.	.	.
.	.	2	.
.	.	.	.

Step 0, action=1, reward=0.63, score=0.63
.	.	.	2
.	.	.	.
.	.	.	.
.	.	2	4

Step 1, action=2, reward=0.12, score=0.75
2	.	2	.
.	.	.	.
.	.	.	.
2	4	.	.

Step 2, action=0, reward=4.12, score=4.87
4	4	2	.
.	.	.	.
.	.	.	.
.	.	4	.

Step 3, action=0, reward=0.11, score=4.98
4	4	2	.
.	.	4	.
.	.	.	.
.	.	.	2

Step 4, action=1, reward=0.10, score=5.08
.	.	.	.
.	.	.	2
.	.	2	.
4	4	4	2

Step 5, action=0, reward=4.10, score=9.18
4	4	2	4
.	.	4	.
2	.	.	.
.	.	.	.

Step 6, action=3, reward=8.60, score=17.78
.	8	2	4
.	.	2	4
.	.	.	2
.	.	.	.

Step 7, action=2, reward=0.09, score=17.87
8	2	4	.
2	4	.	.
2	.	.	2
.	.	.	.

Step 8, action=0, reward=4.09, score=21.96
8	2	4	2
4	4	.	.
.	.	.	.
.	.	2	.

Step 9, action=1, reward=0.08, score=22.04
.	.	2	.
.	.	.	.
8	2	4	.
4	4	2	2

Step 10, action=3, reward=12.09, score=34.13
.	.	.	2
2	.	.	.
.	8	2	4
.	.	8	4

Step 11, action=0, reward=8.09, score=42.22
2	8	2	2
.	2	8	8
.	.	.	.
.	.	.	.

Step 12, action=1, reward=0.08, score=42.30

In [None]:
import numpy as np
from stable_baselines3 import PPO

# ---- Settings ----
NUM_EPISODES = 30
MODEL_PATH = "ppo_2048_v2_simple"  # change if your model name is different

# ---- Helper: run one episode with a given policy ----
def run_episode_with_policy(env, policy_fn, render=False):
    obs, info = env.reset()
    done = False
    total_reward = 0.0
    final_score = 0.0
    max_tile = 0

    while not done:
        action = policy_fn(obs, env)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        total_reward += reward
        final_score = info["score"]
        max_tile = info["max_tile"]

        if render:
            print(env._board_to_string())
            print(f"Action: {action}, Reward: {reward:.2f}, Score: {final_score:.2f}")
            print("-" * 40)

    return total_reward, final_score, max_tile

# ---- Random policy ----
def random_policy(obs, env):
    return env.action_space.sample()

# ---- PPO policy ----
print("Loading PPO model...")
ppo_model = PPO.load(MODEL_PATH)

def ppo_policy(obs, env):
    action, _ = ppo_model.predict(obs, deterministic=True)
    return int(action)

# ---- Evaluate both ----
def evaluate_policy(name, policy_fn, num_episodes=NUM_EPISODES, render=False):
    rewards = []
    scores = []
    max_tiles = []

    for ep in range(num_episodes):
        env = Game2048EnvV2()  # new fresh env each episode
        total_r, score, max_tile = run_episode_with_policy(env, policy_fn, render=False)
        rewards.append(total_r)
        scores.append(score)
        max_tiles.append(max_tile)

    print(f"\n=== {name} over {num_episodes} episodes ===")
    print(f"Avg total reward: {np.mean(rewards):.2f} ¬± {np.std(rewards):.2f}")
    print(f"Avg final score:  {np.mean(scores):.2f} ¬± {np.std(scores):.2f}")
    print(f"Avg max tile:     {np.mean(max_tiles):.1f}")
    print(f"Max of max tiles: {np.max(max_tiles)}")

    return rewards, scores, max_tiles

# Run comparison
rand_rewards, rand_scores, rand_tiles = evaluate_policy("Random policy", random_policy)
ppo_rewards,  ppo_scores,  ppo_tiles  = evaluate_policy("PPO policy",    ppo_policy)

print("\nDone.")


Loading PPO model...

=== Random policy over 30 episodes ===
Avg total reward: 991.10 ¬± 475.12
Avg final score:  991.10 ¬± 475.12
Avg max tile:     104.5
Max of max tiles: 256


KeyboardInterrupt: 

In [2]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class Game2048EnvV3(gym.Env):
    """
    Simple, stable 2048 env for PPO.

    - Obs: 4x4 grid, exponents / 15.0 in [0, 1]
    - Action: 0=up,1=down,2=left,3=right
    - Reward:
        * (sum of merged tile values) / 32.0
        * -1 for invalid move (no board change)
    """

    metadata = {"render_modes": ["ansi"], "render_fps": 60}

    def __init__(self, render_mode=None, target_tile=2048):
        super().__init__()
        self.board_size = 4
        self.target_tile = target_tile

        self.observation_space = spaces.Box(
            low=0.0,
            high=1.0,
            shape=(self.board_size, self.board_size),
            dtype=np.float32,
        )
        self.action_space = spaces.Discrete(4)

        self.render_mode = render_mode
        self.board = np.zeros((self.board_size, self.board_size), dtype=np.int32)
        self.score = 0.0
        self.rng = np.random.default_rng()

    # ---------- Helpers ----------
    def _get_obs(self):
        return self.board.astype(np.float32) / 15.0

    def _slide_and_merge_line(self, line):
        non_zero = line[line != 0].tolist()
        new = []
        merged_value = 0
        i = 0
        while i < len(non_zero):
            if i + 1 < len(non_zero) and non_zero[i] == non_zero[i+1]:
                exp = non_zero[i] + 1
                new.append(exp)
                merged_value += 2 ** exp
                i += 2
            else:
                new.append(non_zero[i])
                i += 1
        new += [0] * (len(line) - len(new))
        return np.array(new, dtype=np.int32), merged_value

    def _add_random_tile(self):
        empties = list(zip(*np.where(self.board == 0)))
        if not empties:
            return
        r, c = empties[self.rng.integers(len(empties))]
        self.board[r, c] = 1 if self.rng.random() < 0.9 else 2  # 2 or 4

    def _can_move(self):
        if np.any(self.board == 0):
            return True
        for r in range(self.board_size):
            for c in range(self.board_size - 1):
                if self.board[r, c] == self.board[r, c+1]:
                    return True
        for c in range(self.board_size):
            for r in range(self.board_size - 1):
                if self.board[r, c] == self.board[r+1, c]:
                    return True
        return False

    def _get_max_tile(self):
        exp = int(self.board.max())
        return 0 if exp == 0 else 2 ** exp

    # ---------- Gym API ----------
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.rng = np.random.default_rng(seed)

        self.board[:] = 0
        self.score = 0.0

        self._add_random_tile()
        self._add_random_tile()

        return self._get_obs(), {"score": self.score, "max_tile": self._get_max_tile()}

    def step(self, action):
        assert self.action_space.contains(action), "Invalid action"

        old_board = self.board.copy()
        merged_value = 0

        if action == 0:  # up
            for c in range(self.board_size):
                new_line, mv = self._slide_and_merge_line(self.board[:, c])
                self.board[:, c] = new_line
                merged_value += mv
        elif action == 1:  # down
            for c in range(self.board_size):
                new_line, mv = self._slide_and_merge_line(self.board[:, c][::-1])
                self.board[:, c] = new_line[::-1]
                merged_value += mv
        elif action == 2:  # left
            for r in range(self.board_size):
                new_line, mv = self._slide_and_merge_line(self.board[r])
                self.board[r] = new_line
                merged_value += mv
        elif action == 3:  # right
            for r in range(self.board_size):
                new_line, mv = self._slide_and_merge_line(self.board[r][::-1])
                self.board[r] = new_line[::-1]
                merged_value += mv

        moved = not np.array_equal(old_board, self.board)

        reward = 0.0
        if moved:
            self._add_random_tile()
            # scale down raw 2048 reward to keep PPO stable
            reward += merged_value / 32.0
        else:
            # strong penalty for useless move
            reward -= 1.0

        self.score += merged_value  # human-style score for reporting

        max_tile = self._get_max_tile()
        terminated = (not self._can_move()) or (max_tile >= self.target_tile)
        truncated = False

        obs = self._get_obs()
        info = {"score": self.score, "max_tile": max_tile}
        return obs, reward, terminated, truncated, info

    def _board_to_string(self):
        rows = []
        for row in self.board:
            r = []
            for exp in row:
                r.append("." if exp == 0 else str(2 ** exp))
            rows.append("\t".join(r))
        return "\n".join(rows)


In [3]:
from stable_baselines3 import PPO

env = Game2048EnvV3()

policy_kwargs = dict(
    net_arch=[dict(pi=[256, 256, 256],
                   vf=[256, 256, 256])]
)

model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    learning_rate=1e-4,   # smaller LR
    n_steps=4096,         # more rollout per update
    batch_size=512,
    n_epochs=20,          # reuse data more
    gamma=0.99,
    clip_range=0.1,
    ent_coef=0.1,         # stronger exploration
    target_kl=0.02,
    verbose=1,
)

model.learn(total_timesteps=2_000_000)  # 2M instead of 500k
model.save("ppo_2048_v3_big")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss           | 8.07         |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 121          |
|    ep_rew_mean          | 18.5         |
| time/                   |              |
|    fps                  | 553          |
|    iterations           | 252          |
|    time_elapsed         | 1865         |
|    total_timesteps      | 1032192      |
| train/                  |              |
|    approx_kl            | 0.0016753317 |
|    clip_fraction        | 0.0377       |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.3         |
|    explained_variance   | 0.461        |
|    learning_rate        | 0.0001       |
|    loss                 | 3.19         |
|    n_updates            | 5020         |
|    policy_gradient_loss | -0.00353     |
|    value_loss           | 7.36

In [None]:
import torch
from stable_baselines3.common.preprocessing import preprocess_obs

env_dbg = Game2048EnvV3()
obs, info = env_dbg.reset()

for i in range(5):
    obs_tensor = torch.as_tensor([obs], device=model.device)
    obs_tensor = preprocess_obs(obs_tensor, env_dbg.observation_space)

    dist = model.policy.get_distribution(obs_tensor)
    probs = dist.distribution.probs.detach().cpu().numpy()[0]

    print(f"Probs on state {i}: {probs}")

    # create new state
    action = env_dbg.action_space.sample()
    obs, _, done, _, _ = env_dbg.step(action)
    if done:
        obs, info = env_dbg.reset()


Probs on state 0: [0.26067576 0.22560897 0.22320904 0.29050624]
Probs on state 1: [0.18664004 0.33731905 0.13716367 0.33887723]
Probs on state 2: [0.14637853 0.40725422 0.15866132 0.2877059 ]
Probs on state 3: [0.11983611 0.4621226  0.31354755 0.10449384]
Probs on state 4: [0.10891565 0.47009867 0.3245981  0.09638763]


  obs_tensor = torch.as_tensor([obs], device=model.device)


In [None]:
env = Game2048EnvV3()
obs, info = env.reset()

print("Initial obs:", obs.flatten())

for i in range(5):
    action = env.action_space.sample()
    obs, reward, term, trunc, info = env.step(action)
    print(f"Step {i}, action={action}, obs:", obs.flatten())


Initial obs: [0.06666667 0.         0.         0.         0.         0.
 0.         0.         0.06666667 0.         0.         0.
 0.         0.         0.         0.        ]
Step 0, action=2, obs: [0.06666667 0.         0.         0.         0.         0.
 0.         0.         0.06666667 0.         0.         0.
 0.         0.         0.         0.        ]
Step 1, action=1, obs: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.06666667
 0.13333334 0.         0.         0.        ]
Step 2, action=0, obs: [0.13333334 0.06666667 0.         0.06666667 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]
Step 3, action=2, obs: [0.13333334 0.13333334 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.06666667 0.         0.         0.        ]
Step 4, action=0, obs: [0.13333334 0.13333334 0.         0.        

In [6]:
import time
from stable_baselines3 import PPO

# Load the trained model
model = PPO.load("ppo_2048_v3_big")

# Create a fresh environment for playing
env = Game2048EnvV3(render_mode="ansi")

obs, info = env.reset()
done = False
step = 0

print("Initial Board:")
print(env._board_to_string())

while not done:
    # deterministic=True = best move according to PPO
    # deterministic=False = allows exploration
    action, _ = model.predict(obs, deterministic=False)

    obs, reward, terminated, truncated, info = env.step(int(action))
    done = terminated or truncated

    print(f"\nStep {step} | Action: {int(action)} | Reward: {reward:.2f} | Score: {info['score']:.1f}")
    print(env._board_to_string())
    print("-" * 40)

    step += 1
    time.sleep(0.15)  # Slow down for visual effect (optional)

print("\n=== Episode Finished ===")
print(f"Final Score: {info['score']}")
print(f"Max Tile: {info['max_tile']}")


Initial Board:
.	.	.	.
.	2	.	.
.	.	2	.
.	.	.	.

Step 0 | Action: 0 | Reward: 0.00 | Score: 0.0
.	2	2	.
.	.	4	.
.	.	.	.
.	.	.	.
----------------------------------------

Step 1 | Action: 1 | Reward: 0.00 | Score: 0.0
.	.	.	.
.	.	.	.
.	2	2	.
.	2	4	.
----------------------------------------

Step 2 | Action: 1 | Reward: 0.12 | Score: 4.0
.	.	.	.
.	.	.	.
.	.	2	2
.	4	4	.
----------------------------------------

Step 3 | Action: 1 | Reward: 0.00 | Score: 4.0
.	.	.	2
.	.	.	.
.	.	2	.
.	4	4	2
----------------------------------------

Step 4 | Action: 0 | Reward: 0.12 | Score: 8.0
.	4	2	4
.	.	4	.
.	.	.	.
.	2	.	.
----------------------------------------

Step 5 | Action: 0 | Reward: 0.00 | Score: 8.0
.	4	2	4
.	2	4	4
.	.	.	.
.	.	.	.
----------------------------------------

Step 6 | Action: 2 | Reward: 0.25 | Score: 16.0
4	2	4	.
2	8	.	.
.	.	.	.
.	.	.	2
----------------------------------------

Step 7 | Action: 1 | Reward: 0.00 | Score: 16.0
.	.	.	.
.	.	.	.
4	2	.	2
2	8	4	2
------------------------

In [7]:
import numpy as np
from stable_baselines3 import PPO

NUM_EPISODES = 30

# Load model
ppo_model = PPO.load("ppo_2048_v3_big")

def run_episode(env, policy_fn):
    obs, info = env.reset()
    done = False
    total_reward = 0.0
    final_score = 0.0
    max_tile = 0

    while not done:
        action = policy_fn(obs, env)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        total_reward += reward
        final_score = info["score"]
        max_tile = info["max_tile"]

    return total_reward, final_score, max_tile

def random_policy(obs, env):
    return env.action_space.sample()

def ppo_policy(obs, env):
    action, _ = ppo_model.predict(obs, deterministic=True)
    return int(action)

def evaluate(name, policy_fn, num_episodes=NUM_EPISODES):
    rewards, scores, tiles = [], [], []
    for _ in range(num_episodes):
        env = Game2048EnvV3()
        R, S, T = run_episode(env, policy_fn)
        rewards.append(R)
        scores.append(S)
        tiles.append(T)
    print(f"\n=== {name} over {num_episodes} episodes ===")
    print(f"Avg total_reward: {np.mean(rewards):.2f} ¬± {np.std(rewards):.2f}")
    print(f"Avg final score:  {np.mean(scores):.2f} ¬± {np.std(scores):.2f}")
    print(f"Avg max tile:     {np.mean(tiles):.1f}")
    print(f"Max of max tiles: {np.max(tiles)}")
    return rewards, scores, tiles

rand_r, rand_s, rand_t = evaluate("Random", random_policy)
ppo_r,  ppo_s,  ppo_t  = evaluate("PPO",    ppo_policy,num_episodes=2)

print("\nDone.")



=== Random over 30 episodes ===
Avg total_reward: 9.68 ¬± 10.65
Avg final score:  962.40 ¬± 466.98
Avg max tile:     89.6
Max of max tiles: 128


KeyboardInterrupt: 