In [47]:
import random

class EaterEnvironment:
    """Grid world in which an "eater" splits consumption between two goals.

    A "mover" walks a ``grid_size`` x ``grid_size`` grid following the fixed
    ``mover_policy`` (a mapping from ``(x, y)`` position to a mover action).
    At every step the eater picks one of ``self.actions``, a pair
    ``(share_from_g1, share_from_g2)`` that is subtracted from the banana
    stock of goal 1 and goal 2 respectively.

    Cells are numbered row-major starting at 1:
    ``state = y * grid_size + x + 1``.

    Mover actions, as interpreted by ``update_position`` and ``step``:
    0 -> y + 1, 1 -> y - 1, 2 -> x - 1, 3 -> x + 1, 4 -> stop (episode ends).
    """

    # Number of mover actions that actually move the mover (actions 0..3).
    NUM_MOVE_ACTIONS = 4
    # Mover action that ends the episode in step().
    STOP_ACTION = 4

    def __init__(self, mover_policy, grid_size=7, g1_state=37, g2_state=49, initial_bananas_g1=100, initial_bananas_g2=100):
        """Create the environment.

        Args:
            mover_policy: mapping ``(x, y) -> mover action`` used in ``step``.
            grid_size: side length of the square grid.
            g1_state: 1-based state index of goal 1.
            g2_state: 1-based state index of goal 2.
            initial_bananas_g1: banana stock at goal 1 at episode start.
            initial_bananas_g2: banana stock at goal 2 at episode start.
        """
        self.mover_policy = mover_policy
        self.grid_size = grid_size
        self.g1 = self.state_to_position(g1_state)
        self.g2 = self.state_to_position(g2_state)
        # Remember the starting stock so reset() can restore it.
        self.initial_bananas_g1 = initial_bananas_g1
        self.initial_bananas_g2 = initial_bananas_g2
        self.bananas_g1 = initial_bananas_g1
        self.bananas_g2 = initial_bananas_g2
        # Eater actions: (amount taken from g1, amount taken from g2).
        self.actions = [
            (1, 0), (0, 1), (0.5, 0.5), (0.2, 0.8), (0.6, 0.4),
            (0.8, 0.2), (0.4, 0.6), (0.7, 0.3), (0.3, 0.7)
        ]
        self.initial_mover_state = 1
        self.current_mover_state = self.initial_mover_state
        self.current_mover_action = 0
        self.done = False
        self.action_history = []
        self.min_distance_to_g1 = float('inf')
        self.min_distance_to_g2 = float('inf')
        self.deception_detected = False
        self.assumed_true_goal = None  # Assumed true goal after deception is detected

    def reset(self):
        """Start a new episode and return the initial ``(mover_state, mover_action)``.

        The second element of the returned state is a *mover* action, so it is
        drawn from the movement actions 0..3 rather than from the (larger) set
        of eater actions; states outside that range would never be produced by
        ``step`` for a non-terminal transition. Banana stocks are restored to
        their initial values so episodes do not leak into each other.
        """
        self.current_mover_state = self.initial_mover_state
        self.current_mover_action = random.randrange(self.NUM_MOVE_ACTIONS)
        self.done = False
        self.action_history.clear()
        self.min_distance_to_g1 = float('inf')
        self.min_distance_to_g2 = float('inf')
        self.deception_detected = False
        self.assumed_true_goal = None
        self.bananas_g1 = self.initial_bananas_g1
        self.bananas_g2 = self.initial_bananas_g2
        return (self.current_mover_state, self.current_mover_action)

    def update_position(self, position, action):
        """Return the position reached from ``position`` by mover ``action``.

        Moves that would leave the grid, and any action other than 0..3,
        leave the position unchanged.
        """
        x, y = position
        if action == 0 and y < self.grid_size - 1:
            y += 1
        elif action == 1 and y > 0:
            y -= 1
        elif action == 2 and x > 0:
            x -= 1
        elif action == 3 and x < self.grid_size - 1:
            x += 1
        return (x, y)

    def position_to_state(self, position):
        """Convert an ``(x, y)`` position to its 1-based row-major state index."""
        x, y = position
        return y * self.grid_size + x + 1

    def detect_deception(self, previous_distance, current_distance, goal_position):
        """Return True when the mover steps away from a goal it was next to.

        Deception is flagged when the previous distance was below 2 and the
        new distance is above 1. ``goal_position`` is accepted for interface
        compatibility but is not used by the rule.
        """
        return previous_distance < 2 and current_distance > 1

    def step(self, eater_state, action_index):
        """Advance the mover one step and apply the eater's consumption.

        Args:
            eater_state: ``(mover_state, mover_action)``; only the mover state
                is read.
            action_index: index into ``self.actions`` chosen by the eater.

        Returns:
            ``(next_state, reward, done)`` where ``next_state`` is
            ``(next_mover_state, mover_action_taken)``.
        """
        self.current_mover_state, _ = eater_state
        mover_position = self.state_to_position(self.current_mover_state)
        self.current_mover_action = self.mover_policy[mover_position]
        self.action_history.append(self.current_mover_action)  # Record action
        mover_next_position = self.update_position(mover_position, self.current_mover_action)
        next_mover_state = self.position_to_state(mover_next_position)

        previous_distance_to_g1 = self.calculate_distance(mover_position, self.g1)
        previous_distance_to_g2 = self.calculate_distance(mover_position, self.g2)

        current_distance_to_g1 = self.calculate_distance(mover_next_position, self.g1)
        current_distance_to_g2 = self.calculate_distance(mover_next_position, self.g2)

        # Once set, the flag stays set until reset(): walking away from one
        # goal is taken as evidence that the other goal is the real target.
        if self.detect_deception(previous_distance_to_g1, current_distance_to_g1, self.g1):
            self.deception_detected = True
            self.assumed_true_goal = self.g2
        elif self.detect_deception(previous_distance_to_g2, current_distance_to_g2, self.g2):
            self.deception_detected = True
            self.assumed_true_goal = self.g1

        # The reward is scored on the action the eater actually chose, even if
        # the consumption below is overridden after deception was detected.
        reward = self.calculate_reward(current_distance_to_g1, current_distance_to_g2, action_index)

        if self.deception_detected:
            action_index = self.maximize_consumption(self.assumed_true_goal)
        goal1_consume, goal2_consume = self.actions[action_index]
        self.bananas_g1 -= goal1_consume
        self.bananas_g2 -= goal2_consume

        next_state = (next_mover_state, self.current_mover_action)
        if self.current_mover_action == self.STOP_ACTION:
            self.done = True
        return next_state, reward, self.done

    def maximize_consumption(self, assumed_true_goal):
        """Return the eater action index that takes everything from the assumed goal."""
        if assumed_true_goal == self.g1:
            return 0  # Maximum consumption from g1
        return 1  # Maximum consumption from g2

    def calculate_reward(self, distance_to_g1, distance_to_g2, action_index):
        """Score how well the chosen split matches the distance-based ideal.

        The ideal share for goal 1 is ``d2 / (d1 + d2)`` (the closer the mover
        is to goal 1, the larger the share), and the ideal share for goal 2 is
        its complement. The reward is ``1 - (|g1 error| + |g2 error|)``.
        """
        total_distance = distance_to_g1 + distance_to_g2
        ideal_g1_ratio = distance_to_g2 / total_distance if total_distance != 0 else 0
        ideal_g2_ratio = 1 - ideal_g1_ratio
        # Use the single source of truth for the eater's action table.
        actual_g1_ratio, actual_g2_ratio = self.actions[action_index]
        diff_g1 = abs(actual_g1_ratio - ideal_g1_ratio)
        diff_g2 = abs(actual_g2_ratio - ideal_g2_ratio)
        return 1 - (diff_g1 + diff_g2)

    def state_to_position(self, state):
        """Convert a 1-based row-major state index to an ``(x, y)`` position."""
        x = (state - 1) % self.grid_size
        y = (state - 1) // self.grid_size
        return (x, y)

    def calculate_distance(self, pos1, pos2):
        """Return the Manhattan distance between two ``(x, y)`` positions."""
        return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

    def predict_future_path(self):
        """Predict the mover's next state by repeating its last action.

        Returns:
            The predicted 1-based state index. With no recorded history this
            is simply the current mover state.
        """
        if not self.action_history:
            return self.current_mover_state  # No history, return current state
        last_action = self.action_history[-1]
        current_position = self.state_to_position(self.current_mover_state)
        predicted_next_position = self.update_position(current_position, last_action)
        return self.position_to_state(predicted_next_position)


In [48]:
# Fixed mover policy: maps each (x, y) grid cell to a mover action.
# Each inner tuple below is one x column (x = 0..6); its entries are the
# actions for y = 0..6 in that column.
mover_policy = {
    (x, y): action
    for x, column in enumerate((
        (0, 0, 0, 0, 0, 3, 1),  # x = 0
        (2, 1, 2, 2, 0, 3, 1),  # x = 1
        (2, 0, 0, 2, 0, 3, 2),  # x = 2
        (0, 0, 0, 0, 0, 0, 3),  # x = 3
        (2, 0, 0, 2, 2, 0, 3),  # x = 4
        (2, 1, 3, 3, 0, 0, 3),  # x = 5
        (1, 1, 3, 0, 0, 2, 4),  # x = 6
    ))
    for y, action in enumerate(column)
}

In [49]:
import random
# Smoke test: drive the environment with uniformly random eater actions for
# at most 10 steps and print every non-terminal transition.
env = EaterEnvironment(mover_policy)
next_state = (1, 0)
for i in range(10):
    action = random.choice(range(len(env.actions)))
    next_state, reward, done = env.step(next_state, action)
    if done:
        # The terminal transition is intentionally not printed.
        break
    print(next_state, reward, done)


(8, 0) 0.7749999999999999 False
(15, 0) 0.8285714285714285 False
(22, 0) -0.10000000000000009 False
(29, 0) 1.0 False
(36, 0) 0.25 False
(37, 3) -0.19999999999999996 False
(38, 3) 0.33333333333333326 False
(39, 3) 0.6666666666666667 False
(46, 0) 0.6000000000000001 False
(47, 3) 0.46666666666666656 False


In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Define the neural network model
class QNetwork(nn.Module):
    """Fully connected network producing one value per action for a state vector.

    Architecture: state_dim -> 16 -> 128 -> action_dim, with ReLU after the
    first two linear layers and no activation on the output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Layers are created in forward order (fc1, fc2, fc3).
        self.fc1 = nn.Linear(state_dim, 16)
        self.fc2 = nn.Linear(16, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        """Return the raw (unnormalised) per-action outputs for input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)



class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random."""
        return random.sample(self.buffer, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [51]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming QNetwork and ReplayBuffer are defined above

# --- Hyper-parameters -------------------------------------------------------
state_dim = 2                   # a state is the pair (mover_state, mover_action)
action_dim = len(env.actions)   # one network output per eater action
buffer_capacity = 10000         # maximum number of stored transitions
batch_size = 10                 # transitions per gradient step
gamma = 0.99                    # discount factor for the bootstrap target
epsilon = 0.1                   # probability of picking a random action
total_episodes = 5000           # number of training episodes

# --- Learner objects --------------------------------------------------------
model = QNetwork(state_dim, action_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)
replay_buffer = ReplayBuffer(buffer_capacity)

def compute_loss(batch, q_network=None, discount=None):
    """Return the mean-squared TD error of a batch of transitions.

    Args:
        batch: iterable of ``(state, action, reward, next_state, done)``
            tuples, as stored by the replay buffer.
        q_network: network to evaluate; defaults to the notebook-level
            ``model``.
        discount: discount factor; defaults to the notebook-level ``gamma``.

    Returns:
        Scalar tensor ``MSE(Q(s, a), r + discount * max_a' Q(s', a'))`` where
        the bootstrap term is zero for terminal transitions.
    """
    net = model if q_network is None else q_network
    discount_factor = gamma if discount is None else discount

    states, actions, rewards, next_states, dones = zip(*batch)
    states = torch.tensor(states, dtype=torch.float32)
    next_states = torch.tensor(next_states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int64)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.bool)

    # Q-value of the action that was actually taken in each transition.
    current_q_values = net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The TD target is treated as a constant: building it under no_grad keeps
    # gradients from flowing through the next-state estimate.
    with torch.no_grad():
        next_q_values = net(next_states).max(1)[0]
        next_q_values = next_q_values.masked_fill(dones, 0.0)  # no bootstrap past terminal states
        target_q_values = rewards + discount_factor * next_q_values

    return nn.MSELoss()(current_q_values, target_q_values)

# Most recent training loss. Initialised explicitly so that a stale value left
# in the kernel by an earlier run of this cell can never be reported.
loss = None

for epoch in range(total_episodes):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection; no gradient is needed for acting.
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            q_values = model(state_tensor)
        action = torch.argmax(q_values).item() if random.random() > epsilon else random.randint(0, action_dim - 1)
        next_state, reward, done = env.step(state, action)

        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state

        # Regular update if enough samples are collected
        if len(replay_buffer) >= batch_size:
            batch = replay_buffer.sample(batch_size)
            loss = compute_loss(batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Update at the end of the episode using all available samples if fewer than batch_size
    if len(replay_buffer) < batch_size:
        batch = replay_buffer.sample(len(replay_buffer))  # Take all that is available
        loss = compute_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print("Epoch:", epoch + 1, "Loss:", loss.item() if loss is not None else 'No update')
print("Final loss:", loss.item() if loss is not None else 'No final update')


Epoch: 1 Loss: 4.440133094787598
Epoch: 11 Loss: 0.4337710738182068
Epoch: 21 Loss: 0.26174014806747437
Epoch: 31 Loss: 0.05129067972302437
Epoch: 41 Loss: 0.1471439152956009
Epoch: 51 Loss: 0.15136206150054932
Epoch: 61 Loss: 0.2854503393173218
Epoch: 71 Loss: 0.12240271270275116
Epoch: 81 Loss: 0.09675641357898712
Epoch: 91 Loss: 0.14667868614196777
Epoch: 101 Loss: 0.10476432740688324
Epoch: 111 Loss: 0.1501254141330719
Epoch: 121 Loss: 0.21464891731739044
Epoch: 131 Loss: 0.13221785426139832
Epoch: 141 Loss: 0.20359483361244202
Epoch: 151 Loss: 0.07239727675914764
Epoch: 161 Loss: 0.2690611779689789
Epoch: 171 Loss: 0.07715148478746414
Epoch: 181 Loss: 0.017327217385172844
Epoch: 191 Loss: 0.15159226953983307
Epoch: 201 Loss: 0.12241852283477783
Epoch: 211 Loss: 0.10833646357059479
Epoch: 221 Loss: 0.2009856253862381
Epoch: 231 Loss: 0.1792593151330948
Epoch: 241 Loss: 0.10419623553752899
Epoch: 251 Loss: 0.04052206873893738
Epoch: 261 Loss: 0.1610371470451355
Epoch: 271 Loss: 0.18

In [52]:
# Greedy policy table: for every (mover_state, mover_action) pair, store the
# eater action with the highest network output.
policy = {}
num_states = env.grid_size ** 2  # states are numbered 1..grid_size**2
with torch.no_grad():  # pure inference; no autograd graph needed
    for mover_position in range(1, num_states + 1):
        # Only the four movement actions (0..3) are tabulated.
        for mover_action in range(4):
            state = (mover_position, mover_action)
            state_tensor = torch.tensor(state, dtype=torch.float32)
            q_values = model(state_tensor)
            best_action = torch.argmax(q_values).item()
            policy[state] = best_action
print(policy)

{(1, 0): 7, (1, 1): 7, (1, 2): 7, (1, 3): 7, (2, 0): 7, (2, 1): 7, (2, 2): 7, (2, 3): 7, (3, 0): 7, (3, 1): 7, (3, 2): 7, (3, 3): 7, (4, 0): 7, (4, 1): 7, (4, 2): 7, (4, 3): 7, (5, 0): 7, (5, 1): 7, (5, 2): 7, (5, 3): 7, (6, 0): 7, (6, 1): 7, (6, 2): 7, (6, 3): 7, (7, 0): 7, (7, 1): 7, (7, 2): 7, (7, 3): 7, (8, 0): 7, (8, 1): 7, (8, 2): 7, (8, 3): 7, (9, 0): 7, (9, 1): 7, (9, 2): 7, (9, 3): 7, (10, 0): 7, (10, 1): 7, (10, 2): 7, (10, 3): 7, (11, 0): 7, (11, 1): 7, (11, 2): 7, (11, 3): 7, (12, 0): 5, (12, 1): 5, (12, 2): 5, (12, 3): 5, (13, 0): 5, (13, 1): 5, (13, 2): 5, (13, 3): 5, (14, 0): 5, (14, 1): 5, (14, 2): 5, (14, 3): 5, (15, 0): 5, (15, 1): 5, (15, 2): 5, (15, 3): 5, (16, 0): 5, (16, 1): 5, (16, 2): 5, (16, 3): 5, (17, 0): 5, (17, 1): 5, (17, 2): 5, (17, 3): 5, (18, 0): 5, (18, 1): 5, (18, 2): 5, (18, 3): 5, (19, 0): 5, (19, 1): 5, (19, 2): 5, (19, 3): 5, (20, 0): 5, (20, 1): 5, (20, 2): 5, (20, 3): 5, (21, 0): 5, (21, 1): 5, (21, 2): 5, (21, 3): 5, (22, 0): 5, (22, 1): 5, (22

In [60]:
# Roll out the learned greedy policy on a fresh environment for at most 19
# steps, reporting the banana stocks after every non-terminal step.
env = EaterEnvironment(mover_policy)
state = env.reset()

for i in range(19):
    # A mover action of 4 in the state means the mover has stopped.
    if state[1] == 4:
        print(f" Final goal : {state[0]} Bananas at g1: {env.bananas_g1}, Bananas at g2: {env.bananas_g2}")
        break
    action_index = policy[state]  # greedy eater action for this state
    next_state, reward, done = env.step(state, action_index)
    state = next_state
    if not done:
        print(f"Step {i+1}: Next State: {next_state}, Reward: {reward}, action: {env.actions[action_index]} Bananas at g1: {env.bananas_g1}, Bananas at g2: {env.bananas_g2}")

Step 1: Next State: (8, 0), Reward: 0.9750000000000001, action: (0.7, 0.3) Bananas at g1: 99.3, Bananas at g2: 99.7
Step 2: Next State: (15, 0), Reward: 0.9714285714285713, action: (0.7, 0.3) Bananas at g1: 98.6, Bananas at g2: 99.4
Step 3: Next State: (22, 0), Reward: 0.8999999999999999, action: (0.8, 0.2) Bananas at g1: 97.8, Bananas at g2: 99.2
Step 4: Next State: (29, 0), Reward: 1.0, action: (0.8, 0.2) Bananas at g1: 97.0, Bananas at g2: 99.0
Step 5: Next State: (36, 0), Reward: 0.8500000000000001, action: (0.8, 0.2) Bananas at g1: 96.2, Bananas at g2: 98.8
Step 6: Next State: (37, 3), Reward: 1.0, action: (1, 0) Bananas at g1: 95.2, Bananas at g2: 98.8
Step 7: Next State: (38, 3), Reward: 0.9333333333333333, action: (0.8, 0.2) Bananas at g1: 94.4, Bananas at g2: 98.6
Step 8: Next State: (39, 3), Reward: 0.7333333333333332, action: (0.8, 0.2) Bananas at g1: 94.4, Bananas at g2: 97.6
Step 9: Next State: (46, 0), Reward: 0.6000000000000001, action: (0.3, 0.7) Bananas at g1: 94.4, Ba