In [52]:
# Environment check: show which Python interpreter this kernel is running on.
import sys
print(sys.executable)

/usr/local/opt/python@3.10/bin/python3.10


In [53]:
# Environment check: record the NumPy version visible to this kernel.
import numpy as np
print(np.__version__)

1.26.4


In [54]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

In [55]:
def flatten_obs(obs):
    """Collapse an observation into a single dimension.

    Args:
        obs: Any object exposing a ``flatten()`` method (for example a
            NumPy array); the call is delegated to that method unchanged.

    Returns:
        Whatever ``obs.flatten()`` returns. For a NumPy array that is a
        one-dimensional copy holding every element in row-major order.
    """
    flat = obs.flatten()
    return flat

In [56]:
class PolicyNetwork(nn.Module):
    """Two-layer perceptron that maps a feature vector to action probabilities.

    Architecture: ``Linear -> ReLU -> Linear -> Softmax`` where the softmax is
    taken over the last dimension, so each output row sums to 1.

    Args:
        input_size: Number of input features (default 300).
        hidden_size: Width of the single hidden layer (default 128).
        output_size: Number of output probabilities (default 100).
    """

    def __init__(self, input_size=300, hidden_size=128, output_size=100):
        super().__init__()

        # The linear layers are created in forward order so that parameter
        # initialisation draws from the RNG in the same sequence as the
        # positions they occupy in the Sequential container.
        to_hidden = nn.Linear(input_size, hidden_size)
        to_scores = nn.Linear(hidden_size, output_size)

        self.layers = nn.Sequential(
            to_hidden,
            nn.ReLU(),
            to_scores,
            nn.Softmax(dim=-1),  # normalise scores into a probability distribution
        )

    def forward(self, x):
        """Return the probability distribution produced for input ``x``."""
        probabilities = self.layers(x)
        return probabilities


In [57]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent.

    Usage per episode: call ``choose_action`` once per step, append the reward
    received for that step to ``episode_rewards``, then call ``update_policy``
    once the episode is over.

    Attributes:
        gamma: Discount factor applied to future rewards.
        policy: The ``PolicyNetwork`` being trained.
        optimizer: Adam optimizer over the policy's parameters.
        log_probs: Log-probabilities of the actions sampled this episode.
        episode_rewards: Rewards collected this episode (filled by the caller).
    """

    def __init__(self, lr=0.001, gamma=0.99):
        """Create the policy network and its optimizer.

        Args:
            lr: Learning rate for Adam.
            gamma: Discount factor used by ``compute_returns``.
        """
        self.gamma = gamma
        self.policy = PolicyNetwork()
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.log_probs = []
        self.episode_rewards = []

    def choose_action(self, state):
        """Sample an action for ``state`` and remember its log-probability.

        Args:
            state: A single observation (array-like). It is copied into a
                float32 tensor and flattened, so both the raw multi-dimensional
                observation and an already-flat vector are accepted.

        Returns:
            The sampled action index as a Python int.
        """
        # torch.tensor copies the data, so later in-place changes to the
        # caller's array cannot disturb the values autograd saved for backward.
        state = torch.tensor(state, dtype=torch.float32).reshape(-1)

        # Forward pass -> probability distribution over the actions.
        probs = self.policy(state)

        dist = Categorical(probs)
        action = dist.sample()

        self.log_probs.append(dist.log_prob(action))

        return action.item()

    def compute_returns(self):
        """Compute the discounted return for every step of the episode.

        Returns:
            A float32 tensor with one entry per reward in ``episode_rewards``.
            When the episode has more than one step the returns are
            standardised (zero mean, unit sample standard deviation). Shorter
            episodes are returned unnormalised, because the sample standard
            deviation of a single value is NaN and would poison the update.
        """
        discounted = []
        running = 0.0

        # Walk the rewards backwards, then flip once at the end; this avoids
        # the quadratic cost of inserting at the front of the list each step.
        for reward in reversed(self.episode_rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()

        returns = torch.tensor(discounted, dtype=torch.float32)

        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        return returns

    def update_policy(self):
        """Apply one REINFORCE gradient step and clear the episode buffers.

        Does nothing except clear the buffers when no action or no reward was
        recorded, since there is no loss to differentiate in that case.
        """
        if not self.log_probs or not self.episode_rewards:
            self.log_probs = []
            self.episode_rewards = []
            return

        returns = self.compute_returns()

        # Pair each log-probability with its return; if the two buffers differ
        # in length, only the common prefix is used.
        steps = min(len(self.log_probs), returns.numel())
        log_probs = torch.stack(self.log_probs[:steps])

        # Minimising -log_prob * G performs gradient ascent on expected return.
        loss = -(log_probs * returns[:steps]).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.log_probs = []
        self.episode_rewards = []

In [58]:
# Reward value for each event key; this mapping is handed to the environment
# through the ``reward_dictionary`` argument of ``gym.make``.
reward_dict = {
    "win": 100,
    "missed": 0,
    "hit": 5,
    "proximal_hit": 20,
    "repeat_missed": -1,
    "repeat_hit": -1,
    "sunk_ship_bonus": 5.0,
    "touched": 5,           # mirrors the value used for "hit"
    "repeat_touched": -1,   # mirrors the value used for "repeat_hit"
}

In [59]:
# Build the environment with the custom reward table and a fresh agent.
# NOTE(review): "Battleship-v0" has to be registered with gymnasium before this
# call; presumably an earlier cell imports the package that does so — confirm.
env = gym.make("Battleship-v0", reward_dictionary=reward_dict)
agent = REINFORCEAgent(lr=1e-3, gamma=0.99)

num_episodes = 300  
# Notebook-level list; separate from the agent's own ``episode_rewards`` buffer.
# Presumably filled with per-episode totals by the training loop — confirm.
episode_rewards = []


In [60]:
# Sanity check: reset the environment and inspect the first observation.
# The second value returned by reset() is discarded here.
obs, _ = env.reset()
print("Obs shape:", obs.shape)
# Prints the whole array, so the output is long; the shape line above is the key fact.
print(obs)

Obs shape: (3, 10, 10)
[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 