# NXFP

In [1]:
import numpy as np
import random
from scipy.optimize import minimize
from scipy.special import logsumexp
from time import time
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Define the GridWorld environment
class IcyGridWorld:
    def __init__(self, size=3):
        self.size = size  # Grid size (e.g., 3x3)
        self.features = np.zeros((size, size))
        self.weights = {}
        self.define_features()
        self.states = [(x, y) for x in range(size) for y in range(size)]

    def define_features(self):
        for x in range(self.size):
            for y in range(self.size):
                if (x + y) % 3 == 0:
                    self.features[x, y] = 1
                elif (x + y) % 3 == 1:
                    self.features[x, y] = 2
                else:
                    self.features[x, y] = 3
        # Normalize weights to sum to 1 and stay in (0,1)
        raw_weights = {1: 0.4, 2: 0.2, 3: 0.4}
        abs_values = np.abs(list(raw_weights.values()))
        total = sum(abs_values)
        self.weights = {key: val / total for key, val in raw_weights.items()}

    def get_weight(self, position):
        x, y = position
        feature = self.features[x, y]
        return self.weights[feature]

# Define the Agent class
class Agent:
    def __init__(self, gridworld, discount_factor=0.95):
        self.gridworld = gridworld
        self.discount_factor = discount_factor
        self.actions = ['up', 'down', 'left', 'right', 'stay']
        self.num_states = len(gridworld.states)
        self.state_indices = {state: idx for idx, state in enumerate(gridworld.states)}

    def move(self, position, action):
        x, y = position
        if action == 'up' and x > 0:
            x -= 1
        elif action == 'down' and x < self.gridworld.size - 1:
            x += 1
        elif action == 'left' and y > 0:
            y -= 1
        elif action == 'right' and y < self.gridworld.size - 1:
            y += 1
        return (x, y)

    def value_iteration(self, max_iter=1000, tol=1e-7):
        V = np.zeros(self.num_states)
        for iteration in range(max_iter):
            V_prev = V.copy()
            for idx, state in enumerate(self.gridworld.states):
                action_values = []
                for action in self.actions:
                    new_state = self.move(state, action)
                    reward = self.gridworld.get_weight(new_state)
                    new_state_idx = self.state_indices[new_state]
                    Q_value = reward + self.discount_factor * V_prev[new_state_idx]
                    action_values.append(Q_value)
                V[idx] = logsumexp(action_values)
            if np.max(np.abs(V - V_prev)) < tol:
                break
        return V

    def get_action_probabilities(self, state, V):
        action_values = []
        for action in self.actions:
            new_state = self.move(state, action)
            reward = self.gridworld.get_weight(new_state)
            new_state_idx = self.state_indices[new_state]
            Q_value = reward + self.discount_factor * V[new_state_idx]
            action_values.append(Q_value)
        log_probs = action_values - logsumexp(action_values)
        return np.exp(log_probs)

def generate_trajectory(agent, start_position, max_steps=10):
    trajectory = []
    position = start_position
    V = agent.value_iteration()
    for _ in range(max_steps):
        probs = agent.get_action_probabilities(position, V)
        action = np.random.choice(agent.actions, p=probs)
        new_position = agent.move(position, action)
        reward = agent.gridworld.get_weight(new_position)
        trajectory.append((position, action, reward))
        position = new_position
    return trajectory

def generate_expert_data(agent, num_trajectories, max_steps=10):
    trajectories = []
    for _ in range(num_trajectories):
        start_position = (random.randint(0, agent.gridworld.size - 1), random.randint(0, agent.gridworld.size - 1))
        trajectories.append(generate_trajectory(agent, start_position, max_steps))
    return trajectories

def nfxp_log_likelihood(params, agent, trajectories):
    agent.gridworld.weights = {1: params[0], 2: params[1], 3: params[2]}
    V = agent.value_iteration()
    log_likelihood = 0
    for trajectory in trajectories:
        for (position, action, _) in trajectory:
            probs = agent.get_action_probabilities(position, V)
            action_index = agent.actions.index(action)
            log_likelihood += np.log(probs[action_index] + 1e-12)
    return -log_likelihood

def estimate_nfxp(agent, trajectories):
    initial_params = [0.3, 0.3, 0.4]  # Initial guess (sum to 1)
    bounds = [(0.01, 0.99)] * 3  # Enforce (0,1) constraint
    result = minimize(
        nfxp_log_likelihood,
        initial_params,
        args=(agent, trajectories),
        bounds=bounds,
        method='L-BFGS-B',
        options={'maxiter': 1000, 'disp': False}
    )

    # Calculate standard errors from the Hessian
    hessian_inv = result.hess_inv.todense() if hasattr(result.hess_inv, 'todense') else result.hess_inv
    standard_errors = np.sqrt(np.diag(hessian_inv)) if hessian_inv is not None else [np.nan] * len(initial_params)

    return result.x, standard_errors

# Main execution
if __name__ == "__main__":
    gridworld = IcyGridWorld()
    agent = Agent(gridworld)
    num_trajectories = 500
    max_steps = 10

    print(f"Grid size: {gridworld.size}x{gridworld.size}")
    print(f"Number of features: {len(gridworld.weights)}")
    print(f"Number of states: {len(gridworld.states)}")
    print(f"Number of trajectories: {num_trajectories}")
    print(f"Trajectory length: {max_steps}")

    expert_data = generate_expert_data(agent, num_trajectories, max_steps)

    # True weights
    true_weights = np.array(list(gridworld.weights.values()))
    print(f"\nTrue weights: {true_weights}")

    # Estimate weights and standard errors
    estimated_weights, standard_errors = estimate_nfxp(agent, expert_data)

    # Display results
    results_df = pd.DataFrame({
        'Weight': ['Theta_1', 'Theta_2', 'Theta_3'],
        'True': true_weights,
        'Estimate': estimated_weights,
        'Std. Error': standard_errors
    })

    print("\nEstimation Results:")
    print(results_df.to_string(index=False))

Grid size: 3x3
Number of features: 3
Number of states: 9
Number of trajectories: 500
Trajectory length: 10

True weights: [0.4 0.2 0.4]

Estimation Results:
 Weight  True  Estimate  Std. Error
Theta_1   0.4  0.411456    0.578342
Theta_2   0.2  0.204024    0.577305
Theta_3   0.4  0.386063    0.577772


# MAXENT

In [None]:
import numpy as np
from scipy.special import logsumexp

# Define the GridWorld environment
class IcyGridWorld:
    def __init__(self, size=3):
        self.size = size  # Grid size (e.g., 3x3)
        self.features = np.zeros((size, size))
        self.weights = {}
        self.define_features()
        self.states = [(x, y) for x in range(size) for y in range(size)]

    def define_features(self):
        for x in range(self.size):
            for y in range(self.size):
                if (x + y) % 3 == 0:
                    self.features[x, y] = 1
                elif (x + y) % 3 == 1:
                    self.features[x, y] = 2
                else:
                    self.features[x, y] = 3
        self.weights = {1: 0.7, 2: 0.2, 3: 0.1}

    def get_weight(self, position):
        x, y = position
        feature = self.features[x, y]
        return self.weights[feature]

# Define the Agent class
class Agent:
    def __init__(self, gridworld, discount_factor=0.95):
        self.gridworld = gridworld
        self.discount_factor = discount_factor
        self.actions = ['up', 'down', 'left', 'right', 'stay']
        self.num_states = len(gridworld.states)
        self.state_indices = {state: idx for idx, state in enumerate(gridworld.states)}

    def move(self, position, action):
        x, y = position
        if action == 'up' and x > 0:
            x -= 1
        elif action == 'down' and x < self.gridworld.size - 1:
            x += 1
        elif action == 'left' and y > 0:
            y -= 1
        elif action == 'right' and y < self.gridworld.size - 1:
            y += 1
        return (x, y)

    def value_iteration(self, max_iter=1000, tol=1e-7):
        V = np.zeros(self.num_states)
        for iteration in range(max_iter):
            V_prev = V.copy()
            for idx, state in enumerate(self.gridworld.states):
                action_values = []
                for action in self.actions:
                    new_state = self.move(state, action)
                    reward = self.gridworld.get_weight(new_state)
                    new_state_idx = self.state_indices[new_state]
                    Q_value = reward + self.discount_factor * V_prev[new_state_idx]
                    action_values.append(Q_value)
                V[idx] = logsumexp(action_values)
            if np.max(np.abs(V - V_prev)) < tol:
                break
        return V

    def get_action_probabilities(self, state, V):
        action_values = []
        for action in self.actions:
            new_state = self.move(state, action)
            reward = self.gridworld.get_weight(new_state)
            new_state_idx = self.state_indices[new_state]
            Q_value = reward + self.discount_factor * V[new_state_idx]
            action_values.append(Q_value)
        log_probs = action_values - logsumexp(action_values)
        return np.exp(log_probs)

# Generate trajectories
def generate_trajectory(agent, start_position, max_steps=10):
    trajectory = []
    position = start_position
    V = agent.value_iteration()
    for _ in range(max_steps):
        probs = agent.get_action_probabilities(position, V)
        action = np.random.choice(agent.actions, p=probs)
        new_position = agent.move(position, action)
        trajectory.append((position, action))
        position = new_position
    return trajectory

def generate_expert_data(agent, num_trajectories, max_steps=10):
    trajectories = []
    for _ in range(num_trajectories):
        start_position = (np.random.randint(0, agent.gridworld.size), np.random.randint(0, agent.gridworld.size))
        trajectories.append(generate_trajectory(agent, start_position, max_steps))
    return trajectories

# MaxEnt IRL
def maxent_irl(features, trajectories, num_states, discount, learning_rate=0.1, num_iterations=1000):
    num_features = features.shape[1]
    theta = np.random.uniform(size=num_features)
    mu_D = np.zeros(num_features)
    
    # Compute feature expectations from trajectories
    for trajectory in trajectories:
        for state, _ in trajectory:
            mu_D += features[state]
    mu_D /= len(trajectories)

    for iteration in range(num_iterations):
        # Compute rewards and policy
        rewards = features @ theta
        V = np.zeros(num_states)
        for _ in range(1000):  # Value iteration
            V_prev = V.copy()
            for s in range(num_states):
                V[s] = logsumexp([rewards[s] + discount * V_prev[s]])
            if np.max(np.abs(V - V_prev)) < 1e-7:
                break

        # Policy from value function
        policy = np.zeros((num_states, num_states))
        for s in range(num_states):
            policy[s] = np.exp(rewards[s] + discount * V[s] - logsumexp(rewards + discount * V))
        
        # State visitation frequencies
        mu_theta = np.zeros(num_features)
        for s in range(num_states):
            mu_theta += features[s] * policy[s].sum()

        # Update parameters
        grad = mu_D - mu_theta
        theta += learning_rate * grad

    return theta / theta.sum()

# Main integration
if __name__ == "__main__":
    gridworld = IcyGridWorld(size=3)
    agent = Agent(gridworld)
    trajectories = generate_expert_data(agent, num_trajectories=500, max_steps=20)

    num_states = len(gridworld.states)
    features = np.zeros((num_states, 3))
    for x in range(gridworld.size):
        for y in range(gridworld.size):
            idx = agent.state_indices[(x, y)]
            features[idx, int(gridworld.features[x, y]) - 1] = 1

    weights = maxent_irl(features, trajectories, num_states, discount=0.95)
    print(f"Recovered Weights: {weights}")


# Maxmargin

In [None]:
import numpy as np
from scipy.optimize import linprog
from scipy.special import logsumexp

# Define the GridWorld environment
class IcyGridWorld:
    def __init__(self, size=3):
        self.size = size  # Grid size (e.g., 3x3)
        self.features = np.zeros((size, size))
        self.weights = {}
        self.define_features()
        self.states = [(x, y) for x in range(size) for y in range(size)]

    def define_features(self):
        for x in range(self.size):
            for y in range(self.size):
                if (x + y) % 3 == 0:
                    self.features[x, y] = 1
                elif (x + y) % 3 == 1:
                    self.features[x, y] = 2
                else:
                    self.features[x, y] = 3
        self.weights = {1: 0.4, 2: 0.2, 3: 0.4}

    def get_weight(self, position):
        x, y = position
        feature = self.features[x, y]
        return self.weights[feature]

class Agent:
    def __init__(self, gridworld, discount_factor=0.95):
        self.gridworld = gridworld
        self.discount_factor = discount_factor
        self.actions = ['up', 'down', 'left', 'right', 'stay']
        self.num_states = len(gridworld.states)
        self.state_indices = {state: idx for idx, state in enumerate(gridworld.states)}

    def move(self, position, action):
        x, y = position
        if action == 'up' and x > 0:
            x -= 1
        elif action == 'down' and x < self.gridworld.size - 1:
            x += 1
        elif action == 'left' and y > 0:
            y -= 1
        elif action == 'right' and y < self.gridworld.size - 1:
            y += 1
        return (x, y)

    def generate_trajectory(self, start_position, max_steps=10):
        trajectory = []
        position = start_position
        for _ in range(max_steps):
            action = np.random.choice(self.actions)
            new_position = self.move(position, action)
            trajectory.append((self.gridworld.states.index(position), action))
            position = new_position
        return trajectory

    def generate_expert_data(self, num_trajectories, max_steps=10):
        trajectories = []
        for _ in range(num_trajectories):
            start_position = (np.random.randint(0, self.gridworld.size), np.random.randint(0, self.gridworld.size))
            trajectories.append(self.generate_trajectory(start_position, max_steps))
        return trajectories

# Max-Margin IRL

def state_features(gridworld):
    num_states = len(gridworld.states)
    num_features = 3  # Assuming 3 features as defined in IcyGridWorld
    features = np.zeros((num_states, num_features))
    for x in range(gridworld.size):
        for y in range(gridworld.size):
            state_index = gridworld.states.index((x, y))
            feature_type = int(gridworld.features[x, y]) - 1  # Map features {1, 2, 3} to indices {0, 1, 2}
            features[state_index, feature_type] = 1
    return features
    return np.eye(num_states)

def feature_expectations(features, trajectories, discount):
    expectations = np.zeros(features.shape[1])
    for trajectory in trajectories:
        for t, (state, _) in enumerate(trajectory):
            expectations += (discount ** t) * features[state]
    expectations /= len(trajectories)
    return expectations

def maxmargin_irl(features, trajectories, num_states, discount, num_comparator_policies=5):
    mu_D = feature_expectations(features, trajectories, discount)
    comparator_expectations = []

    for _ in range(num_comparator_policies):
        random_policy = np.random.dirichlet(np.ones(len(features[0])), size=num_states)
        random_policy /= random_policy.sum(axis=1, keepdims=True)
        mu_j = feature_expectations(features, agent.generate_expert_data(num_trajectories=10, max_steps=10), discount)
        comparator_expectations.append(mu_j)

    comparator_expectations = np.array(comparator_expectations)
    num_features = features.shape[1]

    c = np.zeros(num_features + 1)
    c[-1] = -1.0

    A_ub = []
    b_ub = []
    for mu_j in comparator_expectations:
        diff = mu_D - mu_j
        row = np.zeros(num_features + 1)
        row[:num_features] = -diff
        row[-1] = 1.0
        A_ub.append(row)
        b_ub.append(0.0)

    bounds = [(0, None)] * num_features + [(0, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds, method='highs')

    if not res.success:
        raise ValueError(f"Linear Program did not find a feasible solution. Debug info:
A_ub: {A_ub}
b_ub: {b_ub}
mu_D: {mu_D}
mu_j: {comparator_expectations}")

    x = res.x
    alpha = x[:-1]
    margin = x[-1]

    return alpha, margin

if __name__ == "__main__":
    gridworld = IcyGridWorld(size=3)
    agent = Agent(gridworld)
    trajectories = agent.generate_expert_data(num_trajectories=50, max_steps=10)

    features = state_features(gridworld)
    alpha, margin = maxmargin_irl(features, trajectories, len(gridworld.states), discount=0.95)

    print(f"Recovered Weights: {alpha}")
    print(f"Margin: {margin}")


AttributeError: 'IcyGridWorld' object has no attribute 'state_indices'