In [41]:
import random

class EaterEnvironment:
    def __init__(self, mover_policy, grid_size=7, g1_state=37, g2_state=49, initial_bananas_g1=100, initial_bananas_g2=100):
        self.mover_policy = mover_policy
        self.grid_size = grid_size
        self.g1 = self.state_to_position(g1_state)
        self.g2 = self.state_to_position(g2_state)
        self.bananas_g1 = initial_bananas_g1
        self.bananas_g2 = initial_bananas_g2
        self.actions = [
            (1, 0), (0, 1), (0.5, 0.5), (0.2, 0.8), (0.6, 0.4),
            (0.8, 0.2), (0.4, 0.6), (0.7, 0.3), (0.3, 0.7)
        ]
        self.initial_mover_state = 1
        self.current_mover_state = self.initial_mover_state
        self.current_mover_action = 0
        self.done = False
        self.action_history = []
        self.min_distance_to_g1 = float('inf')
        self.min_distance_to_g2 = float('inf')
        self.deception_detected = False
        self.assumed_true_goal = None  # Assumed true goal after deception is detected

    def reset(self):
        self.current_mover_state = 1
        self.current_mover_action = random.choice(range(len(self.actions)))
        self.done = False
        self.action_history.clear()
        self.min_distance_to_g1 = float('inf')
        self.min_distance_to_g2 = float('inf')
        self.deception_detected = False
        self.assumed_true_goal = None
        return (self.current_mover_state, self.current_mover_action)

    # Assume other methods like state_to_position, calculate_distance, etc., are defined here.
    def update_position(self, position, action):
        x, y = position
        if action == 0 and y < self.grid_size - 1:
            y += 1
        elif action == 1 and y > 0:
            y -= 1
        elif action == 2 and x > 0:
            x -= 1
        elif action == 3 and x < self.grid_size - 1:
            x += 1
        return (x, y)

    def position_to_state(self, position):
        x, y = position
        return y * self.grid_size + x + 1

    def detect_deception(self, previous_distance, current_distance, goal_position):
        # Simple deception detection based on moving away from a goal when very close
        if previous_distance == 1 and current_distance > 1:
            return True
        return False

    def step(self, eater_state, action_index):
        self.current_mover_state, _ = eater_state
        mover_position = self.state_to_position(self.current_mover_state)
        self.current_mover_action = self.mover_policy[mover_position]
        self.action_history.append(self.current_mover_action)  # Record action
        mover_next_position = self.update_position(mover_position, self.current_mover_action)
        next_mover_state = self.position_to_state(mover_next_position)

        previous_distance_to_g1 = self.calculate_distance(mover_position, self.g1)
        previous_distance_to_g2 = self.calculate_distance(mover_position, self.g2)

        current_distance_to_g1 = self.calculate_distance(mover_next_position, self.g1)
        current_distance_to_g2 = self.calculate_distance(mover_next_position, self.g2)

        if self.detect_deception(previous_distance_to_g1, current_distance_to_g1, self.g1):
            self.deception_detected = True
            self.assumed_true_goal = self.g2
        elif self.detect_deception(previous_distance_to_g2, current_distance_to_g2, self.g2):
            self.deception_detected = True
            self.assumed_true_goal = self.g1

        reward = self.calculate_reward(current_distance_to_g1, current_distance_to_g2, action_index)

        goal1_consume, goal2_consume = self.actions[action_index]
        if self.deception_detected:
            action_index = self.maximize_consumption(self.assumed_true_goal)
            goal1_consume, goal2_consume = self.actions[action_index]
        self.bananas_g1 -= goal1_consume
        self.bananas_g2 -= goal2_consume
        next_state = (next_mover_state, self.current_mover_action)
        if self.current_mover_action == 4:
            self.done = True
        return next_state, reward, self.done

    def maximize_consumption(self, assumed_true_goal):
        # Determine which action maximizes consumption from the assumed true goal
        if assumed_true_goal == self.g1:
            return 0  # Maximum consumption from g1
        else:
            return 1  # Maximum consumption from g2

    def calculate_reward(self, distance_to_g1, distance_to_g2, action_index):
        # Extend the reward calculation with deception detection and path prediction
        ideal_g1_ratio = distance_to_g2 / (distance_to_g1 + distance_to_g2) if (distance_to_g1 + distance_to_g2) != 0 else 0
        ideal_g2_ratio = 1 - ideal_g1_ratio
        action_ratios = {
            0: (1.0, 0.0), 1: (0.0, 1.0), 2: (0.5, 0.5), 3: (0.2, 0.8), 4: (0.6, 0.4),
            5: (0.8, 0.2), 6: (0.4, 0.6), 7: (0.7, 0.3), 8: (0.3, 0.7)
        }
        actual_g1_ratio, actual_g2_ratio = action_ratios[action_index]
        diff_g1 = abs(actual_g1_ratio - ideal_g1_ratio)
        diff_g2 = abs(actual_g2_ratio - ideal_g2_ratio)
        reward = 1 - (diff_g1 + diff_g2)

        # Path prediction and deception detection
        predicted_path = self.predict_future_path()
        actual_path = self.current_mover_state
        return reward

    def state_to_position(self, state):
        x = (state - 1) % self.grid_size
        y = (state - 1) // self.grid_size
        return (x, y)

    def calculate_distance(self, pos1, pos2):
        return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

    def predict_future_path(self):
        # Simple path prediction based on the last observed action
        if not self.action_history:
            return self.current_mover_state  # No history, return current state
        last_action = self.action_history[-1]
        predicted_next_position = self.update_position(self.state_to_position(self.current_mover_state), last_action)


In [42]:
mover_policy  = {(0, 0): 0,
 (0, 1): 0,
 (0, 2): 0,
 (0, 3): 0,
 (0, 4): 0,
 (0, 5): 3,
 (0, 6): 1,
 (1, 0): 2,
 (1, 1): 1,
 (1, 2): 2,
 (1, 3): 2,
 (1, 4): 0,
 (1, 5): 3,
 (1, 6): 1,
 (2, 0): 2,
 (2, 1): 0,
 (2, 2): 0,
 (2, 3): 2,
 (2, 4): 0,
 (2, 5): 3,
 (2, 6): 2,
 (3, 0): 0,
 (3, 1): 0,
 (3, 2): 0,
 (3, 3): 0,
 (3, 4): 0,
 (3, 5): 0,
 (3, 6): 3,
 (4, 0): 2,
 (4, 1): 0,
 (4, 2): 0,
 (4, 3): 2,
 (4, 4): 2,
 (4, 5): 0,
 (4, 6): 3,
 (5, 0): 2,
 (5, 1): 1,
 (5, 2): 3,
 (5, 3): 3,
 (5, 4): 0,
 (5, 5): 0,
 (5, 6): 3,
 (6, 0): 1,
 (6, 1): 1,
 (6, 2): 3,
 (6, 3): 0,
 (6, 4): 0,
 (6, 5): 2,
 (6, 6): 4}

In [43]:
env = EaterEnvironment(mover_policy)
next_state = (1,0)
for i in range(15):
  action = random.choice(range(len(env.actions)))
  next_state, reward, done = env.step(next_state, action)
  print(next_state,action,reward)

(8, 0) 4 0.825
(15, 0) 7 0.9714285714285713
(22, 0) 5 0.8999999999999999
(29, 0) 0 0.6000000000000001
(36, 0) 2 0.25
(37, 3) 0 1.0
(38, 3) 5 0.9333333333333333
(39, 3) 4 0.8666666666666667
(46, 0) 7 0.6000000000000001
(47, 3) 4 0.46666666666666656
(48, 3) 0 -0.6666666666666667
(49, 3) 6 0.19999999999999996
(49, 4) 3 0.6000000000000001
(49, 4) 6 0.19999999999999996
(49, 4) 1 1.0


In [44]:
def initialize_q_returns_policy(env):
    Q = {}
    returns = {}
    policy = {}
    # Assuming there are 49 mover positions and 5 mover actions (0 to 4)
    possible_actions = range(len(env.actions))  # Assuming env.actions is defined with 9 possible eater actions

    # Initialize Q, returns, and policy for every possible state and action
    for mover_position in range(1, 50):  # Mover positions 1 to 49
        for mover_action in range(4):  # Mover actions 0 to 4
            state = (mover_position, mover_action)
            for eater_action in possible_actions:
                Q[(state, eater_action)] = 0  # Initial Q-value
                returns[(state, eater_action)] = []  # Initialize return list
            # Initialize policy with a random action from possible eater actions
            policy[state] = random.choice(list(possible_actions))

    return Q, returns, policy


In [45]:
import numpy as np
import random

def generate_episode(env, policy, epsilon=0.1):
    episode = []
    env = EaterEnvironment(mover_policy)
    current_state = (1, 0)
    done = False
    while not done:
        if random.random() < epsilon:
            action = random.choice(range(len(env.actions)))
        else:
            action = policy[current_state] if current_state in policy else random.choice(range(len(env.actions)))
        next_state, reward, done = env.step(current_state, action)
        episode.append((current_state, action, reward))
        current_state = next_state
    return episode

import numpy as np
import random


def first_visit_mc(env, num_episodes, epsilon=0.1):
    Q, returns, policy = initialize_q_returns_policy(env)

    for episode_number in range(num_episodes):
        episode = generate_episode(env, policy, epsilon)
        G = 0
        visited = set()
        for (state, action, reward) in reversed(episode):
            G = reward + 0.99 * G
            if (state, action) not in visited:
                visited.add((state, action))
                returns[(state, action)].append(G)
                Q[(state, action)] = np.mean(returns[(state, action)])
                best_action = max((Q[(state, a)], a) for a in range(len(env.actions)))[1]
                policy[state] = best_action

    return policy, Q


estimated_policy, Q_values = first_visit_mc(env, 10000, 0.12)


In [46]:
print(estimated_policy)

{(1, 0): 4, (1, 1): 5, (1, 2): 5, (1, 3): 2, (2, 0): 2, (2, 1): 3, (2, 2): 8, (2, 3): 7, (3, 0): 4, (3, 1): 3, (3, 2): 3, (3, 3): 4, (4, 0): 5, (4, 1): 3, (4, 2): 7, (4, 3): 5, (5, 0): 4, (5, 1): 4, (5, 2): 6, (5, 3): 7, (6, 0): 7, (6, 1): 0, (6, 2): 3, (6, 3): 4, (7, 0): 5, (7, 1): 7, (7, 2): 2, (7, 3): 1, (8, 0): 5, (8, 1): 4, (8, 2): 5, (8, 3): 8, (9, 0): 1, (9, 1): 7, (9, 2): 8, (9, 3): 6, (10, 0): 7, (10, 1): 5, (10, 2): 4, (10, 3): 7, (11, 0): 4, (11, 1): 3, (11, 2): 6, (11, 3): 1, (12, 0): 1, (12, 1): 7, (12, 2): 3, (12, 3): 4, (13, 0): 8, (13, 1): 3, (13, 2): 4, (13, 3): 1, (14, 0): 5, (14, 1): 0, (14, 2): 3, (14, 3): 5, (15, 0): 5, (15, 1): 8, (15, 2): 8, (15, 3): 5, (16, 0): 7, (16, 1): 5, (16, 2): 0, (16, 3): 6, (17, 0): 7, (17, 1): 2, (17, 2): 5, (17, 3): 7, (18, 0): 7, (18, 1): 5, (18, 2): 7, (18, 3): 8, (19, 0): 3, (19, 1): 4, (19, 2): 8, (19, 3): 6, (20, 0): 5, (20, 1): 8, (20, 2): 7, (20, 3): 3, (21, 0): 1, (21, 1): 4, (21, 2): 7, (21, 3): 1, (22, 0): 5, (22, 1): 2, (22

In [47]:
env = EaterEnvironment(mover_policy)
current_eater_state = (1, 0)  # Initial state

for i in range(19):
    action_index = estimated_policy[current_eater_state] # Cycle through actions
    next_state, reward, done = env.step(current_eater_state, action_index)
    if not done:
      print(f"Step {i+1}: Next State: {next_state}, Reward: {reward}, action: {env.actions[action_index]} Bananas at g1: {env.bananas_g1}, Bananas at g2: {env.bananas_g2}")
      current_eater_state = next_state

Step 1: Next State: (8, 0), Reward: 0.825, action: (0.6, 0.4) Bananas at g1: 99.4, Bananas at g2: 99.6
Step 2: Next State: (15, 0), Reward: 0.8285714285714285, action: (0.8, 0.2) Bananas at g1: 98.60000000000001, Bananas at g2: 99.39999999999999
Step 3: Next State: (22, 0), Reward: 0.8999999999999999, action: (0.8, 0.2) Bananas at g1: 97.80000000000001, Bananas at g2: 99.19999999999999
Step 4: Next State: (29, 0), Reward: 1.0, action: (0.8, 0.2) Bananas at g1: 97.00000000000001, Bananas at g2: 98.99999999999999
Step 5: Next State: (36, 0), Reward: 0.8500000000000001, action: (0.8, 0.2) Bananas at g1: 96.20000000000002, Bananas at g2: 98.79999999999998
Step 6: Next State: (37, 3), Reward: 1.0, action: (1, 0) Bananas at g1: 95.20000000000002, Bananas at g2: 98.79999999999998
Step 7: Next State: (38, 3), Reward: 0.9333333333333333, action: (0.8, 0.2) Bananas at g1: 94.40000000000002, Bananas at g2: 98.59999999999998
Step 8: Next State: (39, 3), Reward: 0.9333333333333333, action: (0.7, 0.