In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import gym

np.bool8 = bool
np.bool = np.bool_
np.float32 = float

# Define Deep Q-Network (DQN) Model
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [4]:
class Game2048Env(gym.Env):
    """
    OpenAI Gym environment for the 2048 game.
    Observation: 4x4 grid of integers (exponents of 2; 0 means empty).
    Actions: 0=up, 1=down, 2=left, 3=right.
    Reward: sum of all merged tile values in the move.
    Episode ends when no moves are possible.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(Game2048Env, self).__init__()
        self.size = 4
        # Observation space: grid of integers from 0 up to, say, 16 (2^16=65536)
        self.observation_space = spaces.Box(low=0, high=16, shape=(self.size, self.size), dtype=np.int32)
        # Action space: 4 directions
        self.action_space = spaces.Discrete(4)
        self.reset()

    def reset(self):
        # Start with empty grid and add two tiles
        self.board = np.zeros((self.size, self.size), dtype=np.int32)
        self._add_tile()
        self._add_tile()
        self.score = 0
        return self.get_state()

    def step(self, action):
        assert self.action_space.contains(action), "Invalid action"
        board_before = self.board.copy()
        reward = self._move(action)
        moved = not np.array_equal(board_before, self.board)
        if moved:
            self._add_tile()
        self.score += reward
        done = not self._can_move()
        info = {'score': self.score}
        return self.get_state(), reward, done, info

    def render(self, mode='human'):
        # Simple console render
        def fmt(x): return f"{2**x:5d}" if x>0 else "    ."
        print("+-----"*self.size + "+")
        for row in self.board:
            print("|" + "|".join(fmt(x) for x in row) + "|")
            print("+-----"*self.size + "+")

    def _add_tile(self):
        empties = list(zip(*np.where(self.board == 0)))
        if not empties:
            return
        i, j = random.choice(empties)
        # 90% 2 (exponent=1), 10% 4 (exp=2)
        self.board[i, j] = 1 if random.random() < 0.9 else 2

    def _move(self, direction):
        """
        Slide and merge tiles in given direction.
        Returns total merge reward (sum of merged tile values, e.g. merging two '3's gives 2^4=16).
        """
        reward = 0
        for idx in range(self.size):
            if direction == 0 or direction == 1:  # up/down => work on columns
                line = self.board[:, idx]
            else:  # left/right => work on rows
                line = self.board[idx, :]

            if direction == 1 or direction == 3:  # down or right => reverse
                line = line[::-1]

            merged_line, gained = self._merge_line(line)
            reward += gained

            if direction == 1 or direction == 3:
                merged_line = merged_line[::-1]

            if direction == 0 or direction == 1:
                self.board[:, idx] = merged_line
            else:
                self.board[idx, :] = merged_line
        return reward

    @staticmethod
    def _merge_line(line):
        """Compresses one row/col and merges equal tiles. Returns new line and reward."""
        new = [x for x in line if x != 0]
        reward = 0
        i = 0
        while i < len(new) - 1:
            if new[i] == new[i+1]:
                new[i] += 1
                reward += 2**new[i]
                del new[i+1]
                i += 1
            else:
                i += 1
        # pad with zeros
        new += [0] * (len(line) - len(new))
        return np.array(new, dtype=np.int32), reward

    def _can_move(self):
        # If any empty, can move
        if np.any(self.board == 0):
            return True
        # If any adjacent equal tiles, can move
        for i in range(self.size):
            for j in range(self.size-1):
                if self.board[i, j] == self.board[i, j+1]:
                    return True
                if self.board[j, i] == self.board[j+1, i]:
                    return True
        return False

    def close(self):
        pass

    def get_state(self):
        obs = np.zeros((1, self.size * self.size), dtype=np.int32)
        for y in range(self.size):
            for x in range(self.size):
                obs[0][y * self.size + x] = self.board[y][x]
        return obs

In [2]:
# Define DQN Agent with Experience Replay Buffer
class DQNAgent:
    def __init__(self, state_dim, action_dim, lr, gamma, epsilon, epsilon_decay, buffer_size):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)
        self.model = DQN(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_dim)
        q_values = self.model(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def choose(self, state):
        q_values = self.model(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(torch.tensor(next_state, dtype=torch.float32))).item()
            target_f = self.model(torch.tensor(state, dtype=torch.float32))
            target_f[action] = target
            self.optimizer.zero_grad()
            loss = nn.MSELoss()(torch.tensor(target_f), self.model(torch.tensor(state, dtype=torch.float32)))
            loss.backward()
            self.optimizer.step()
        if self.epsilon > 0.01:
            self.epsilon *= self.epsilon_decay

In [3]:
# Initialize environment and agent with Experience Replay Buffer
env = gym.make('CartPole-v1')
state_dim = 4
action_dim = 2
agent = DQNAgent(state_dim, action_dim, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, buffer_size=10000)

# Train the DQN agent with Experience Replay Buffer
batch_size = 32
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        agent.replay(batch_size)
    print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

  deprecation(
  deprecation(


Episode: 1, Total Reward: 21.0
Episode: 2, Total Reward: 11.0


  loss = nn.MSELoss()(torch.tensor(target_f), self.model(torch.tensor(state, dtype=torch.float32)))


Episode: 3, Total Reward: 28.0
Episode: 4, Total Reward: 26.0
Episode: 5, Total Reward: 15.0
Episode: 6, Total Reward: 19.0
Episode: 7, Total Reward: 15.0
Episode: 8, Total Reward: 21.0
Episode: 9, Total Reward: 14.0
Episode: 10, Total Reward: 17.0
Episode: 11, Total Reward: 13.0
Episode: 12, Total Reward: 14.0
Episode: 13, Total Reward: 10.0
Episode: 14, Total Reward: 51.0
Episode: 15, Total Reward: 36.0
Episode: 16, Total Reward: 41.0
Episode: 17, Total Reward: 57.0
Episode: 18, Total Reward: 85.0
Episode: 19, Total Reward: 58.0
Episode: 20, Total Reward: 48.0
Episode: 21, Total Reward: 51.0
Episode: 22, Total Reward: 60.0
Episode: 23, Total Reward: 45.0
Episode: 24, Total Reward: 28.0
Episode: 25, Total Reward: 34.0
Episode: 26, Total Reward: 62.0
Episode: 27, Total Reward: 48.0
Episode: 28, Total Reward: 85.0
Episode: 29, Total Reward: 79.0
Episode: 30, Total Reward: 77.0
Episode: 31, Total Reward: 119.0
Episode: 32, Total Reward: 169.0
Episode: 33, Total Reward: 130.0
Episode: 34,

In [7]:
agent.epsilon = 0
state = env.reset()
total_reward = 0
done = False
while not done:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward
print(f"Total Reward: {total_reward}")

Total Reward: 500.0
