In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
class ShapePlacementEnv:
    """
    Grid environment in which rectangular shapes are placed one per step.

    A grid cell holds 0 while it is free and 1 once a shape covers it. Every
    call to step() attempts to place the next entry of `shapes`; the episode
    is over once each shape has been attempted exactly once.
    """

    def __init__(self, grid_size, shapes):
        """
        Initialize the environment.
        :param grid_size: Tuple (n, m) representing the size of the grid.
        :param shapes: List of tuples (extent along axis 0, extent along axis 1)
                       describing the rectangles to place, in order.
        """
        self.grid_size = grid_size
        self.shapes = shapes
        self.reset()

    def reset(self):
        """
        Reset the environment for a new episode.
        :return: The initial (all-zero) state as a copy of the internal grid.
        """
        self.grid = np.zeros(self.grid_size)
        self.current_shape_index = 0
        # Hand out a copy so states stored by the caller are not mutated by
        # later placements.
        return self.grid.copy()

    def step(self, action):
        """
        Take an action in the environment.
        :param action: Tuple (x, y) representing where to place the current shape
                       (x indexes axis 0 of the grid, y indexes axis 1).
        :return: (next_state, reward, done) where next_state is a copy of the grid,
                 reward is 1 for a successful placement and -1 otherwise.
        :raises IndexError: If called after every shape has already been attempted.
        """
        if self.current_shape_index >= len(self.shapes):
            raise IndexError(
                "step() called after the episode ended; call reset() first"
            )

        shape = self.shapes[self.current_shape_index]
        if self._can_place_shape(action, shape):
            self._place_shape(action, shape)
            reward = 1  # Example reward for successful placement
        else:
            reward = -1  # Example penalty for unsuccessful placement

        # The shape is consumed whether or not the placement succeeded.
        self.current_shape_index += 1
        done = self.current_shape_index >= len(self.shapes)

        return self.grid.copy(), reward, done

    def _can_place_shape(self, position, shape):
        """
        Check if a shape can be placed at the given position.
        :return: True when the shape lies fully inside the grid and covers no
                 occupied cell, False otherwise.
        """
        x, y = position
        shape_width, shape_height = shape

        # Negative values would otherwise be accepted by the upper-bound test
        # and silently wrap around through NumPy's negative indexing.
        if x < 0 or y < 0 or shape_width < 0 or shape_height < 0:
            return False
        if x + shape_width > self.grid_size[0] or y + shape_height > self.grid_size[1]:
            return False

        # Check for overlap on the whole target rectangle at once.
        region = self.grid[x:x + shape_width, y:y + shape_height]
        return not np.any(region == 1)

    def _place_shape(self, position, shape):
        """
        Place a shape on the grid at the given position.

        The caller is expected to have validated the placement with
        _can_place_shape() first.
        """
        x, y = position
        shape_width, shape_height = shape
        self.grid[x:x + shape_width, y:y + shape_height] = 1

In [None]:
class PolicyNetwork(nn.Module):
    """
    Convolutional policy over an n x m single-channel grid.

    Three 3x3 convolutions (with two 2x2 max-pooling steps in between) feed two
    fully connected layers that emit n * m logits. forward() keeps only the
    first `empty_cells_count` logits and turns them into probabilities.
    """

    def __init__(self, n, m):
        """
        :param n: Grid extent along the first spatial axis.
        :param m: Grid extent along the second spatial axis.
        """
        super().__init__()
        self.n = n
        self.m = m

        # Assuming input grid is n x m and single-channel
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)

        # The padded convolutions keep the spatial size; each of the two 2x2
        # pooling layers in forward() halves it (rounding down).
        conv_output_size = (n // 4) * (m // 4) * 64

        # Fully connected layers
        self.fc1 = nn.Linear(conv_output_size, 128)
        # The layer always emits n * m logits (the maximum possible number of
        # actions); forward() slices them down to the requested count.
        self.fc2 = nn.Linear(128, n * m)

    def forward(self, x, empty_cells_count):
        """
        Compute action probabilities for a batch of grids.
        :param x: Float tensor of shape (batch, 1, n, m).
        :param empty_cells_count: Number of leading logits to keep.
        :return: Tensor of shape (batch, empty_cells_count) whose rows sum to 1.
        """
        # `torch.nn.functional` is reached through the imported `nn` module;
        # the file has no `F` alias in scope.
        fn = nn.functional

        x = fn.relu(self.conv1(x))
        x = fn.max_pool2d(x, kernel_size=2, stride=2)
        x = fn.relu(self.conv2(x))
        x = fn.max_pool2d(x, kernel_size=2, stride=2)
        x = fn.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = fn.relu(self.fc1(x))

        # Keep only as many logits as there are empty cells.
        # NOTE(review): which grid cell each kept index refers to is decided by
        # the caller — confirm the intended action encoding.
        action_logits = self.fc2(x)[:, :empty_cells_count]
        return fn.softmax(action_logits, dim=1)

# Example usage: build a policy for a square 10 x 10 grid together with the
# Adam optimizer that trains it.
n = m = 10  # Grid size
policy_net = PolicyNetwork(n, m)
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)  # Example learning rate

In [None]:
def run_episode(env, policy_net):
    """
    Roll out one episode, sampling every action from the policy.

    :param env: Environment whose reset() returns a 2-D grid (0 = free cell) and
                whose step((x, y)) returns (next_state, reward, done).
    :param policy_net: Callable taking (state_tensor, empty_cells_count) and
                       returning probabilities of shape (1, empty_cells_count).
    :return: (log_probs, rewards) — one shape-(1,) log-probability tensor and one
             reward per step taken.
    """
    state = env.reset()
    log_probs = []
    rewards = []
    done = False
    while not done:
        # Coordinates of all free cells in row-major order.
        empty_cells = np.argwhere(np.asarray(state) == 0)
        empty_cells_count = len(empty_cells)
        if empty_cells_count == 0:
            # No free cell left: there is nothing to sample from.
            break

        # The convolutional policy needs a float tensor shaped
        # (batch, channel, n, m), not the raw NumPy grid.
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        state_tensor = state_tensor.reshape(1, 1, *state_tensor.shape)
        action_probs = policy_net(state_tensor, empty_cells_count)

        # Sample an action
        distribution = torch.distributions.Categorical(probs=action_probs.squeeze(0))
        action_index = distribution.sample()
        # Keep a 1-D tensor so the per-step terms can be concatenated later.
        log_prob = distribution.log_prob(action_index).reshape(1)

        # NOTE(review): index i is interpreted as the i-th empty cell in
        # row-major order, since the policy emits one probability per empty
        # cell — confirm this is the intended action encoding.
        x, y = empty_cells[action_index.item()]
        next_state, reward, done = env.step((int(x), int(y)))  # Perform the action in the environment

        log_probs.append(log_prob)
        rewards.append(reward)
        state = next_state
    return log_probs, rewards


In [None]:
def compute_returns(rewards, gamma=0.99):
    """
    Compute the discounted return for every step of an episode.

    The return at step t is rewards[t] + gamma * (return at step t + 1), with
    the return after the final step taken to be 0.

    :param rewards: Sequence of per-step rewards in chronological order.
    :param gamma: Discount factor applied to future rewards.
    :return: List of discounted returns, aligned with `rewards`.
    """
    returns = []
    running_return = 0
    # Walk backwards so each return reuses the one that follows it; append and
    # reverse once instead of inserting at the front on every iteration.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()
    return returns


In [None]:
def update_policy(log_probs, returns, opt=None):
    """
    Apply one policy-gradient step using the loss sum(-log_prob * return).

    :param log_probs: List of log-probability tensors, one per step; scalar
                      (0-dim) and 1-D tensors are both accepted.
    :param returns: List of returns aligned with `log_probs`.
    :param opt: Optimizer to step. When omitted, the module-level `optimizer`
                is used.
    """
    # Reduce each term to a scalar first: torch.cat cannot join 0-dim tensors,
    # whereas stacking scalars works for any per-step shape.
    step_losses = [(-log_prob * R).sum() for log_prob, R in zip(log_probs, returns)]
    if not step_losses:
        # Nothing was collected this episode, so there is no gradient to apply.
        return

    if opt is None:
        opt = optimizer

    policy_loss = torch.stack(step_losses).sum()

    opt.zero_grad()
    policy_loss.backward()
    opt.step()


In [None]:
num_episodes = 1000  # Example number of episodes

# ShapePlacementEnv expects (grid_size, shapes). The grid has to match the size
# the policy network was built for, so it is taken from the network itself.
# NOTE(review): the shape list below is only an example — replace it with the
# real shapes to be placed.
example_shapes = [(2, 2), (3, 1), (1, 3), (2, 3)]
env = ShapePlacementEnv((policy_net.n, policy_net.m), example_shapes)

for episode in range(num_episodes):
    log_probs, rewards = run_episode(env, policy_net)
    returns = compute_returns(rewards)
    update_policy(log_probs, returns)

    if episode % 100 == 0:
        # Guard against an episode that produced no steps.
        average_reward = sum(rewards) / len(rewards) if rewards else 0.0
        print(f"Episode {episode}: Average Reward: {average_reward}")
