In [35]:
import numpy as np

np.set_printoptions(precision=4, suppress=True)

# ---------------------------
# 1) Triangular MDP Setup
# ---------------------------
# States: 0,1,2 (think of them arranged in a triangle)
# Actions: 0=left, 1=right, 2=stay
# We'll define transitions with a small noise probability eps.
eps = 0.05
n_states = 3
n_actions = 3
gamma = 0.9

def build_transition_matrix(eps=0.05):
    """
    Triangular 3-state MDP with 3 actions.
    P[(s, a)] = probability distribution over next states [p0, p1, p2].
    
    Action 0 = 'left' 
    Action 1 = 'right'
    Action 2 = 'stay'
    """
    # Initialize dictionary
    P = {}
    # Helper to set distribution with main target and small noise to others
    def dist(target):
        # With prob (1 - eps) go to the target, with prob eps the rest is evenly distributed
        d = np.ones(n_states) * (eps / (n_states))
        d[target] += (1 - eps)
        return d
    
    # For each state s, define transitions for each action
    for s in range(n_states):
        # Action 'left'
        if s == 0:
            P[(0, 0)] = dist(target=2)  # from 0, left goes to 2 ideally
        elif s == 1:
            P[(1, 0)] = dist(target=0)  # from 1, left goes to 0
        else: # s==2
            P[(2, 0)] = dist(target=1)  # from 2, left goes to 1
        
        # Action 'right'
        if s == 0:
            P[(0, 1)] = dist(target=1)  # from 0, right -> 1
        elif s == 1:
            P[(1, 1)] = dist(target=2)  # from 1, right -> 2
        else: # s==2
            P[(2, 1)] = dist(target=0)  # from 2, right -> 0
        
        # Action 'stay'
        P[(s, 2)] = dist(target=s)     # from s, stay in s
    return P

# Construct the transitions
P = build_transition_matrix(eps)

# ---------------------------
# 2) Features & True Rewards
# ---------------------------
# We'll use one-hot features for each state: phi(s) = e_s
features = np.eye(n_states)

# Set a "true" reward for demonstration. 
# The user can choose any arbitrary vector.
true_rewards = np.array([0.5, 0.25, 0.10])

# ---------------------------
# 3) Soft Value Iteration
# ---------------------------
def soft_value_iteration(reward, P, tol=1e-6, max_iter=200):
    """
    V(s) <- log sum_a exp( R(s) + gamma * sum_{s'} P(s'|s,a)*V(s') )
    """
    V = np.zeros(n_states)
    for _ in range(max_iter):
        V_prev = V.copy()
        for s in range(n_states):
            Q_sa = []
            for a in range(n_actions):
                Q_sa.append(reward[s] + gamma * np.dot(P[(s,a)], V_prev))
            # log-sum-exp
            V[s] = np.log(np.sum(np.exp(Q_sa)))
        if np.max(np.abs(V - V_prev)) < tol:
            break
    return V

# ---------------------------
# 4) Compute Policy
# ---------------------------
def compute_policy(V, reward, P):
    """
    pi(a|s) = exp(Q(s,a)) / sum_{a'} exp(Q(s,a'))
    Q(s,a) = R(s) + gamma * sum_{s'} P(s'|s,a]*V(s')
    """
    policy = np.zeros((n_states, n_actions))
    Q_values = np.zeros((n_states, n_actions))  # for debug printing
    for s in range(n_states):
        Q_sa = []
        for a in range(n_actions):
            q = reward[s] + gamma * np.dot(P[(s,a)], V)
            Q_sa.append(q)
        Q_sa = np.array(Q_sa)
        # For debugging
        Q_values[s] = Q_sa
        # Stable softmax
        shift = Q_sa - np.max(Q_sa)
        policy[s] = np.exp(shift) / np.sum(np.exp(shift))
    return policy, Q_values

# ---------------------------
# 5) Compute State Visitation Frequencies
# ---------------------------
def compute_svf(policy, P, start_state=0, trajectory_length=5):
    """
    Accumulate visitation frequencies over 'trajectory_length' steps,
    starting with a single initial state (prob=1).
    """
    d_t = np.zeros(n_states)
    d_t[start_state] = 1.0

    svf = np.zeros(n_states)
    for _ in range(trajectory_length):
        svf += d_t
        next_d_t = np.zeros(n_states)
        for s in range(n_states):
            for a in range(n_actions):
                next_d_t += d_t[s] * policy[s, a] * P[(s,a)]
        d_t = next_d_t
    return svf

# ---------------------------
# 6) Generate "Expert" Trajectories
# ---------------------------
def generate_soft_optimal_trajectories(policy, P, n_trajectories=100, trajectory_length=5):
    """
    Sample states by following 'policy'. 
    Each trajectory has length 'trajectory_length'.
    """
    trajectories = []
    for _ in range(n_trajectories):
        traj = []
        # Start from a random state
        state = np.random.choice(n_states)
        for _ in range(trajectory_length):
            action = np.random.choice(n_actions, p=policy[state])
            next_state = np.random.choice(n_states, p=P[(state, action)])
            traj.append(state)
            state = next_state
        trajectories.append(traj)
    return trajectories

# ---------------------------
# 7) MaxEnt IRL
# ---------------------------
def maxent_irl(features, expert_trajectories,
               P,  # transition model
               true_rewards=None, true_policy=None, true_value=None,
               gamma=0.9, lr=0.01, n_iters=10000, print_every=1000):
    """
    Gradient-based MaxEnt IRL: R(s) = theta^T phi(s).
    Additional debugging prints added.
    """
    n_states, d_features = features.shape
    # Initialize random weights
    reward_weights = np.random.uniform(size=d_features)

    # Compute expert state visitation frequency (normalized)
    expert_svf = np.zeros(n_states)
    total_steps = 0
    for traj in expert_trajectories:
        for s in traj:
            expert_svf[s] += 1
        total_steps += len(traj)
    expert_svf /= total_steps

    for it in range(n_iters):
        # 1) Current reward
        reward_est = features @ reward_weights
        
        # 2) Soft Value Iteration
        V_est = soft_value_iteration(reward_est, P)
        
        # 3) Compute policy & Q-values
        policy_est, Q_values = compute_policy(V_est, reward_est, P)
        
        # 4) Predicted SVF
        svf_est = compute_svf(policy_est, P, start_state=0, trajectory_length=5)
        svf_est /= np.sum(svf_est)

        # 5) Gradient step
        grad = expert_svf - svf_est
        reward_weights += lr * features.T @ grad

        # 6) Debug prints
        if (it+1) % print_every == 0:
            loss_svf = np.linalg.norm(expert_svf - svf_est)  # L2
            grad_norm = np.linalg.norm(grad)
            msg = f"Iter {it+1:05d} | Loss(SVF): {loss_svf:.4f} | GradNorm: {grad_norm:.4f}"
            
            if true_policy is not None:
                # L1 difference across all states, all actions
                pol_diff = np.sum(np.abs(policy_est - true_policy))
                msg += f" | PolDiff: {pol_diff:.4f}"
            if true_value is not None:
                # L1 difference in value
                val_diff = np.sum(np.abs(V_est - true_value))
                msg += f" | ValDiff: {val_diff:.4f}"
            if true_rewards is not None:
                # L1 difference in reward vector
                rew_diff = np.sum(np.abs(reward_est - true_rewards))
                msg += f" | RewDiff: {rew_diff:.4f}"

            print(msg)
            # Print SVF side by side
            print(f"  Expert SVF: {expert_svf}")
            print(f"  Pred   SVF: {svf_est}")
            # Print Q-values
            print("  Q-values (s x a):")
            for s in range(n_states):
                print(f"    s={s}, Q={Q_values[s]}")
            # Print policy
            print("  Policy (s x a):")
            for s in range(n_states):
                print(f"    s={s}, π={policy_est[s]}")
            # Print reward
            print(f"  Reward: {reward_est}")
            print("")

    return features @ reward_weights

# ---------------------------
# 8) Main Demo
# ---------------------------
if __name__ == "__main__":
    np.random.seed(0)  # for reproducibility, if desired

    # 1) Compute the "true" V, policy from the known reward
    V_true = soft_value_iteration(true_rewards, P)
    policy_true, _ = compute_policy(V_true, true_rewards, P)

    # 2) Generate "expert" data from the "true" policy
    n_sample_trajectories = 500
    expert_trajectories = generate_soft_optimal_trajectories(
        policy_true, P, n_trajectories=n_sample_trajectories, trajectory_length=100
    )

    # 3) Run MaxEnt IRL with extra prints
    estimated_rewards = maxent_irl(
        features,
        expert_trajectories,
        P,
        true_rewards=true_rewards,
        true_policy=policy_true,
        true_value=V_true,
        lr=0.01,
        n_iters=10000,
        print_every=1000
    )

    # Final results
    print("\nFinal Results:")
    print("True Rewards:      ", true_rewards)
    print("Estimated Rewards: ", estimated_rewards)

Iter 01000 | Loss(SVF): 0.0076 | GradNorm: 0.0076 | PolDiff: 0.9582 | ValDiff: 11.8977 | RewDiff: 1.5524
  Expert SVF: [0.3905 0.3233 0.2862]
  Pred   SVF: [0.3938 0.3171 0.2891]
  Q-values (s x a):
    s=0, Q=[16.5041 16.6004 16.0799]
    s=1, Q=[16.6887 17.1129 17.2093]
    s=2, Q=[17.0966 16.576  17.0002]
  Policy (s x a):
    s=0, π=[0.3629 0.3996 0.2375]
    s=1, π=[0.2375 0.3629 0.3996]
    s=2, π=[0.3996 0.2375 0.3629]
  Reward: [0.2974 0.9063 0.7935]

Iter 02000 | Loss(SVF): 0.0008 | GradNorm: 0.0008 | PolDiff: 0.9798 | ValDiff: 11.9721 | RewDiff: 1.5823
  Expert SVF: [0.3905 0.3233 0.2862]
  Pred   SVF: [0.3911 0.3227 0.2863]
  Q-values (s x a):
    s=0, Q=[16.5035 16.6286 16.0745]
    s=1, Q=[16.7225 17.1515 17.2766]
    s=2, Q=[17.1303 16.5762 17.0052]
  Policy (s x a):
    s=0, π=[0.3591 0.407  0.2339]
    s=1, π=[0.2339 0.3591 0.407 ]
    s=2, π=[0.407  0.2339 0.3591]
  Reward: [0.2825 0.9305 0.7842]

Iter 03000 | Loss(SVF): 0.0001 | GradNorm: 0.0001 | PolDiff: 0.9832 | Va

In [2]:
import numpy as np

np.set_printoptions(precision=4, suppress=True)

# -------------------------------------------------
# 1) Triangular MDP Setup
# -------------------------------------------------
eps = 0.05
n_states = 3
n_actions = 3
gamma = 0.9

def build_transition_matrix(eps=0.05):
    """
    Triangular 3-state MDP with 3 actions.
    P[(s, a)] = probability distribution over next states [p0, p1, p2].
    
    Action 0=left, 1=right, 2=stay
    """
    P = {}
    def dist(target):
        d = np.ones(n_states) * (eps / n_states)
        d[target] += (1 - eps)
        return d
    
    for s in range(n_states):
        if s == 0:
            P[(0,0)] = dist(target=2)  # left
            P[(0,1)] = dist(target=1)  # right
            P[(0,2)] = dist(target=0)  # stay
        elif s == 1:
            P[(1,0)] = dist(target=0)
            P[(1,1)] = dist(target=2)
            P[(1,2)] = dist(target=1)
        else:  # s=2
            P[(2,0)] = dist(target=1)
            P[(2,1)] = dist(target=0)
            P[(2,2)] = dist(target=2)
    return P

P = build_transition_matrix(eps)

# -------------------------------------------------
# 2) Features & True Rewards
# -------------------------------------------------
features = np.eye(n_states)
true_rewards = np.array([0.0, 0.25, 0.9])

# -------------------------------------------------
# 3) Soft Value Iteration
# -------------------------------------------------
def soft_value_iteration(reward, P, tol=1e-6, max_iter=200):
    V = np.zeros(n_states)
    for _ in range(max_iter):
        V_prev = V.copy()
        for s in range(n_states):
            Q_sa = [reward[s] + gamma * np.dot(P[(s,a)], V_prev)
                    for a in range(n_actions)]
            # log-sum-exp
            V[s] = np.log(np.sum(np.exp(Q_sa)))
        if np.max(np.abs(V - V_prev)) < tol:
            break
    return V

# -------------------------------------------------
# 4) Compute Policy
# -------------------------------------------------
def compute_policy(V, reward, P):
    policy = np.zeros((n_states, n_actions))
    Q_values = np.zeros((n_states, n_actions))
    for s in range(n_states):
        Q_sa = np.array([reward[s] + gamma * np.dot(P[(s,a)], V) for a in range(n_actions)])
        Q_values[s] = Q_sa
        shift = Q_sa - np.max(Q_sa)  # stable softmax
        policy[s] = np.exp(shift) / np.sum(np.exp(shift))
    return policy, Q_values

# -------------------------------------------------
# 5) Compute State Visitation Frequencies
# -------------------------------------------------
def compute_svf(policy, P, start_state=0, trajectory_length=5):
    d_t = np.zeros(n_states)
    d_t[start_state] = 1.0
    svf = np.zeros(n_states)
    for _ in range(trajectory_length):
        svf += d_t
        next_d_t = np.zeros(n_states)
        for s in range(n_states):
            for a in range(n_actions):
                next_d_t += d_t[s] * policy[s,a] * P[(s,a)]
        d_t = next_d_t
    return svf

# -------------------------------------------------
# 6) Generate "Expert" Trajectories
# -------------------------------------------------
def generate_soft_optimal_trajectories(policy, P, n_trajectories=100, trajectory_length=5):
    trajectories = []
    for _ in range(n_trajectories):
        traj = []
        state = np.random.choice(n_states)
        for __ in range(trajectory_length):
            action = np.random.choice(n_actions, p=policy[state])
            next_state = np.random.choice(n_states, p=P[(state, action)])
            traj.append(state)
            state = next_state
        trajectories.append(traj)
    return trajectories

# -------------------------------------------------
# 7) MaxEnt IRL with Anchors + L2 + Verbosity
# -------------------------------------------------
def maxent_irl(features,
               expert_trajectories,
               P,
               anchor_mode=0,           # 0=fix none, 1=fix first, 2=fix first&second
               anchor_values=None,      # e.g. [0.0], or [0.0, 0.25], or None
               reg_lambda=0.0,          # L2 penalty
               gamma=0.9,
               lr=0.01,
               n_iters=10000,
               print_every=1000,
               verbose=True,            # verbosity flag
               true_rewards=None,
               true_policy=None,
               true_value=None):
    """
    anchor_mode:
      0 -> no anchoring
      1 -> fix R(0)=anchor_values[0]
      2 -> fix R(0)=anchor_values[0], R(1)=anchor_values[1]

    reg_lambda: L2 penalty coefficient
    verbose: if False, no intermediate prints
    """
    n_states, d_features = features.shape
    reward_weights = np.random.uniform(low=-1e-3, high=1e-3, size=d_features)

    # Initialize anchored states if needed
    if anchor_mode >= 1:
        reward_weights[0] = anchor_values[0]
    if anchor_mode == 2:
        reward_weights[1] = anchor_values[1]

    # Compute expert SVF
    expert_svf = np.zeros(n_states)
    total_steps = 0
    for traj in expert_trajectories:
        for s in traj:
            expert_svf[s] += 1
        total_steps += len(traj)
    expert_svf /= total_steps

    for it in range(n_iters):
        reward_est = features @ reward_weights
        V_est = soft_value_iteration(reward_est, P)
        policy_est, Q_values = compute_policy(V_est, reward_est, P)

        svf_est = compute_svf(policy_est, P, start_state=0, trajectory_length=5)
        svf_est /= np.sum(svf_est)

        # IRL gradient
        grad_main = expert_svf - svf_est
        grad_for_weights = features.T @ grad_main

        # L2 penalty gradient
        if reg_lambda > 0:
            grad_for_weights -= reg_lambda * reward_weights

        # Apply gradient
        reward_weights += lr * grad_for_weights

        # Re-pin anchored states
        if anchor_mode >= 1:
            reward_weights[0] = anchor_values[0]
        if anchor_mode == 2:
            reward_weights[1] = anchor_values[1]

        # Possibly print
        if verbose and (it+1) % print_every == 0:
            loss_svf = np.linalg.norm(grad_main)
            msg = f"Iter {it+1:05d} | Loss(SVF): {loss_svf:.4f}"
            if true_rewards is not None:
                rew_diff = np.sum(np.abs(reward_est - true_rewards))
                msg += f" | RewDiff: {rew_diff:.4f}"
            print(msg)

    return features @ reward_weights

# -------------------------------------------------
# 8) Main Demo
# -------------------------------------------------
if __name__ == "__main__":
    np.random.seed(0)

    # ------------------------------
    # Print Base Experiment Info
    # ------------------------------
    print("=== Base Experiment Info ===")
    print(f"States:        {n_states}")
    print(f"Actions:       {n_actions}")
    print(f"eps (noise):   {eps}")
    print(f"gamma:         {gamma}")
    print(f"True Rewards:  {true_rewards}")
    n_sample_trajectories = 500
    trajectory_length = 50
    print(f"NumTraj:       {n_sample_trajectories}")
    print(f"TrajLen:       {trajectory_length}")
    print("============================\n")

    # Build transitions, compute "true" policy
    V_true = soft_value_iteration(true_rewards, P)
    policy_true, _ = compute_policy(V_true, true_rewards, P)

    # Generate expert data
    expert_trajectories = generate_soft_optimal_trajectories(
        policy_true, P,
        n_trajectories=n_sample_trajectories,
        trajectory_length=trajectory_length
    )

    # We'll test anchor_mode in [0,1,2] and reg_lambda in [0,0.05,0.1,0.2].
    # Store final estimates in a dict: results[(anchor_mode, reg_lambda)].
    anchor_modes = [0, 1, 2]
    reg_lambdas = [0.0, 0.05, 0.1, 0.2, 0.7]
    results = {}

    for am in anchor_modes:
        # If am=1 => fix R(0)=0.0
        # If am=2 => fix R(0)=0.0, R(1)=0.25
        # If am=0 => no anchor => anchor_values=None
        if am == 0:
            anc_vals = None
        elif am == 1:
            anc_vals = [0.0]
        else:  # am=2
            anc_vals = [0.0, 0.25]

        for rl in reg_lambdas:
            est_rew = maxent_irl(
                features,
                expert_trajectories,
                P,
                anchor_mode=am,
                anchor_values=anc_vals,
                reg_lambda=rl,
                gamma=gamma,
                lr=0.01,
                n_iters=3000,      # short run for demo
                print_every=1000,  # could set verbose=False to hide logs
                verbose=False,
                true_rewards=true_rewards,
                true_policy=policy_true,
                true_value=V_true
            )
            results[(am, rl)] = est_rew

    # Now print a final table of results.
    # We'll have one row for each (anchor_mode, reg_lambda).
    print("\n=== Final Results Table (multiple reg_lambdas & anchor_modes) ===")
    print("AnchorMode | RegLambda  | R(0)    R(1)    R(2)    | RewDiff")
    print("--------------------------------------------------------------")

    for am in anchor_modes:
        for rl in reg_lambdas:
            est_rew = results[(am, rl)]
            rew_diff = np.sum(np.abs(est_rew - true_rewards))
            # format anchor mode
            if am == 0:
                am_str = "FixNone"
            elif am == 1:
                am_str = "FixFirst"
            else:
                am_str = "FixFirstSecond"
            print(f"{am_str:<12} | {rl:<9.2f} | "
                  f"{est_rew[0]:.4f}  {est_rew[1]:.4f}  {est_rew[2]:.4f} | "
                  f"{rew_diff:.4f}")
    print("==============================================================\n")


=== Base Experiment Info ===
States:        3
Actions:       3
eps (noise):   0.05
gamma:         0.9
True Rewards:  [0.   0.25 0.9 ]
NumTraj:       500
TrajLen:       50


=== Final Results Table (multiple reg_lambdas & anchor_modes) ===
AnchorMode | RegLambda  | R(0)    R(1)    R(2)    | RewDiff
--------------------------------------------------------------
FixNone      | 0.00      | -1.7464  0.5624  1.1849 | 2.3437
FixNone      | 0.05      | -1.1259  0.2844  0.8415 | 1.2188
FixNone      | 0.10      | -0.8586  0.1801  0.6785 | 1.1500
FixNone      | 0.20      | -0.6029  0.0991  0.5038 | 1.1500
FixNone      | 0.70      | -0.2574  0.0291  0.2284 | 1.1500
FixFirst     | 0.00      | 0.0000  1.3468  2.0132 | 2.2099
FixFirst     | 0.05      | 0.0000  0.6975  1.2958 | 0.8433
FixFirst     | 0.10      | 0.0000  0.4287  0.9609 | 0.2396
FixFirst     | 0.20      | 0.0000  0.2189  0.6442 | 0.2870
FixFirst     | 0.70      | 0.0000  0.0498  0.2519 | 0.8482
FixFirstSecond | 0.00      | 0.0000  0.2500