In [1]:
import numpy as np 

# Problem size and discount factor for the randomly generated MDP.
n_S = 3      # number of states
n_A = 3      # number of actions
gamma = 0.9  # discount factor

# Every cell below samples from NumPy's global RNG (random MDP, random policy,
# initial states, trajectories). Seed it once here so that
# "Restart Kernel -> Run All" reproduces the same numbers on every run.
SEED = 0
np.random.seed(SEED)

def get_mdp(n_S, n_A, gamma):
    """Generate a random MDP with state-based rewards.

    Parameters
    ----------
    n_S : int
        Number of states.
    n_A : int
        Number of actions.
    gamma : float
        Discount factor; returned unchanged as the last tuple element.

    Returns
    -------
    tuple
        ``(S, A, P, R, gamma)`` where ``S`` and ``A`` are the index arrays
        ``arange(n_S)`` / ``arange(n_A)``, ``P`` has shape ``(n_S, n_S, n_A)``
        with ``P[s', s, a] = Prob(s' | s, a)`` (each ``P[:, s, a]`` sums to 1),
        and ``R`` has shape ``(n_S,)`` holding uniform draws from [0, 1)
        rounded to one decimal.
    """
    states = np.arange(n_S)
    actions = np.arange(n_A)
    transitions = np.zeros((n_S, n_S, n_A))
    # ndindex iterates (a, s) with the action index outermost, i.e. the same
    # order as a nested "for a: for s:" loop.
    for a, s in np.ndindex(n_A, n_S):
        weights = np.random.rand(n_S)
        # Normalise so the distribution over next states s' sums to 1.
        transitions[:, s, a] = weights / weights.sum()
    rewards = np.random.uniform(0, 1, n_S).round(1)
    return (states, actions, transitions, rewards, gamma)

# Sample one random MDP and unpack it into module-level names that the later
# cells reuse (M, S, P, ...). get_mdp returns gamma unchanged, so rebinding
# `gamma` here keeps the value set above.
M = get_mdp(n_S, n_A, gamma)
(S, A, P, R, gamma) = M
# Prints the whole tuple: state ids, action ids, the P[s', s, a] array, R, gamma.
print(M)

(array([0, 1, 2]), array([0, 1, 2]), array([[[0.29984948, 0.42367271, 0.44573531],
        [0.3378003 , 0.60626228, 0.31581472],
        [0.36242946, 0.34974274, 0.48133725]],

       [[0.01891492, 0.42487334, 0.23432187],
        [0.14931944, 0.02378561, 0.6725836 ],
        [0.40162751, 0.34936108, 0.26057285]],

       [[0.68123561, 0.15145395, 0.31994282],
        [0.51288026, 0.36995211, 0.01160168],
        [0.23594303, 0.30089618, 0.2580899 ]]]), array([0.7, 0.6, 0.3]), 0.9)


In [2]:
def get_random_policy(n_S, n_A):
    """Draw a random stochastic policy.

    Parameters
    ----------
    n_S : int
        Number of states (rows).
    n_A : int
        Number of actions (columns).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_S, n_A)`` with ``pi[s, a] = Prob(a | s)``; every
        row is a normalised vector of uniform random weights and sums to 1.
    """
    policy = np.zeros((n_S, n_A))
    # Iterating a 2-D array yields row views, so writing into `row` fills `policy`.
    for row in policy:
        weights = np.random.rand(n_A)
        row[:] = weights / weights.sum()
    return policy

# Example usage: draw one random policy for the MDP sizes defined above.
# Each printed row is the action distribution Prob(a|s) for one state.
pi_rand = get_random_policy(n_S, n_A)
print(pi_rand)

[[0.49339288 0.07828154 0.42832558]
 [0.17141226 0.76102181 0.06756593]
 [0.64160828 0.00632262 0.3520691 ]]


In [3]:
def get_optimal_policy(M, T):
    """Solve a finite-horizon MDP by backward induction.

    The Bellman backup used is

        Q_t(s, a) = sum_{s'} P[s', s, a] * (R[s'] + gamma * V_{t+1}(s'))
        V_t(s)    = max_a Q_t(s, a),        V_T(s) = 0,

    i.e. the reward of a state is collected on arrival in that state.

    Parameters
    ----------
    M : tuple
        ``(S, A, P, R, gamma)`` with ``P[s', s, a] = Prob(s' | s, a)`` and
        ``R[s]`` the reward of state ``s``.
    T : int
        Horizon (number of decision steps). Must be non-negative.

    Returns
    -------
    pi_opt : numpy.ndarray, shape (n_S, n_A)
        One-hot rows selecting the greedy action at the first decision step
        (t = 0). Ties are broken towards the lowest action index. For
        ``T == 0`` there is nothing to decide, every Q-value is 0 and action 0
        is selected in every state.
    V0 : numpy.ndarray, shape (n_S,)
        Optimal value ``V_0`` of each state (all zeros for ``T == 0``).

    Raises
    ------
    ValueError
        If ``T`` is negative.
    """
    S, A, P, R, gamma = M
    n_S = len(S)
    n_A = len(A)
    if T < 0:
        raise ValueError(f"horizon T must be non-negative, got {T}")

    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)

    V = np.zeros((T + 1, n_S))  # V[t][s]: value at time t for state s; V[T] = 0
    # Q-values of the stage processed last. Stays all-zero when T == 0, which
    # avoids indexing the non-existent V[1] in that case.
    Q = np.zeros((n_S, n_A))

    # Backward induction: contract the next-state axis s' of P with the
    # one-step target R[s'] + gamma * V[t+1][s'].
    for t in range(T - 1, -1, -1):
        Q = np.einsum("psa,p->sa", P, R + gamma * V[t + 1])
        V[t] = Q.max(axis=1)

    # After the loop Q holds the t = 0 stage, so the greedy policy can be read
    # off directly instead of recomputing the same backup.
    pi_opt = np.zeros((n_S, n_A))
    pi_opt[np.arange(n_S), Q.argmax(axis=1)] = 1

    return pi_opt, V[0]


# Example usage: solve the sampled MDP for a 10-step horizon.
T = 10
pi_opt, V_opt = get_optimal_policy(M, T)
# pi_opt: one-hot row per state marking the greedy action at t = 0.
print(pi_opt)
# V_opt: the value vector at t = 0 (the function returns V[0]).
print(V_opt)


[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
[3.96302434 4.00882195 3.9279263 ]


In [4]:
# Initial state distribution D and sampling of the start state s0

def get_init_dist(n_S):
    """Return a random probability vector of length ``n_S``.

    The vector is built from uniform random weights normalised to sum to 1,
    and is meant to be used as a distribution over initial states.
    """
    weights = np.random.rand(n_S)
    return weights / weights.sum()

def get_init_state(S, D):
    """Sample one initial state from ``S`` according to the probabilities ``D``.

    ``D[i]`` is the probability of drawing ``S[i]``; the draw uses NumPy's
    global RNG via ``np.random.choice``.
    """
    return np.random.choice(S, p=D)

# Draw an initial-state distribution and one start state from it.
# NOTE(review): both D and s0 are drawn again further down in this cell before
# they are used, so these two values are never consumed (they only advance the
# global RNG) — confirm whether this first draw is still needed.
D = get_init_dist(n_S)
s0 = get_init_state(S, D)


def get_trajectory(T, M, pi, s0):
    """Simulate one T-step rollout of policy ``pi`` starting from ``s0``.

    Parameters
    ----------
    T : int
        Number of steps (actions) to simulate.
    M : tuple
        ``(S, A, P, R, gamma)`` with ``P[s', s, a] = Prob(s' | s, a)`` and
        ``R[s]`` the reward of state ``s``.
    pi : array-like, shape (n_S, n_A)
        ``pi[s]`` is the action distribution in state ``s``.
    s0 : int
        Initial state.

    Returns
    -------
    traj_s : list
        Visited states ``s_0, ..., s_T`` (``T + 1`` entries).
    traj_a : list
        Actions ``a_0, ..., a_{T-1}`` (``T`` entries).
    traj_r : list
        Rewards ``R[s_0], ..., R[s_{T-1}]`` (``T`` entries): the reward of the
        state each action is taken from; the final state ``s_T`` earns nothing.
    G : float
        Discounted return ``sum_t gamma**t * traj_r[t]``.

    Notes
    -----
    Rewards are recorded at the start of each step, so a rollout always
    carries exactly ``T`` rewards. In particular ``T == 0`` yields no rewards
    and ``G == 0.0`` (previously it returned ``R[s0]``, the same as ``T == 1``).
    Results for ``T >= 1`` are unchanged.
    """
    S, A, P, R, gamma = M
    # Built once instead of on every step; row i is the one-hot vector of action i.
    one_hot = np.eye(len(A))

    traj_s = [s0]
    traj_a = []
    traj_r = []
    for t in range(T):
        s = traj_s[t]
        traj_r.append(R[s])  # reward of the state we are acting from

        action_probs = pi[s]
        greedy = np.argmax(action_probs)
        # A deterministic (exactly one-hot) row is followed without consuming
        # a random draw; anything else is sampled.
        if np.sum(action_probs) == 1 and (action_probs == one_hot[greedy]).all():
            a = greedy
        else:
            a = np.random.choice(A, p=action_probs)

        sp = np.random.choice(S, p=P[:, s, a])
        traj_s.append(sp)
        traj_a.append(a)

    G = sum((gamma**t * r for t, r in enumerate(traj_r)), 0.0)
    return traj_s, traj_a, traj_r, G


# Example usage: sample a start state and roll out the optimal policy once.
D = get_init_dist(n_S)
s0 = get_init_state(S, D)
T = 10
traj_s, traj_a, traj_r, G = get_trajectory(T, M, pi_opt, s0)
# traj_s has T + 1 entries (it includes the state reached after the last action).
print("Trajectory States:", traj_s)
print("Trajectory Actions:", traj_a)
print("Trajectory Rewards:", traj_r)
print("Discounted Reward G:", G)


Trajectory States: [2, 1, 1, 1, 0, 0, 2, 1, 0, 0, 1]
Trajectory Actions: [0, 2, 2, 2, 1, 1, 0, 2, 1, 1]
Trajectory Rewards: [0.3, 0.6, 0.6, 0.6, 0.7, 0.7, 0.3, 0.6, 0.7, 0.7]
Discounted Reward G: 3.6549448293000006


In [114]:
def get_unconditional_G(num_trajectories, T, M, pi, D):
    """Monte-Carlo estimate of the expected return with ``s0 ~ D``.

    Runs ``num_trajectories`` independent rollouts of length ``T`` under
    policy ``pi``, drawing a fresh initial state from ``D`` for each one, and
    returns the arithmetic mean of their discounted returns.
    """
    states = M[0]
    returns = []
    for _ in range(num_trajectories):
        start = get_init_state(states, D)  # new start state for every rollout
        returns.append(get_trajectory(T, M, pi, start)[3])
    return sum(returns) / num_trajectories

def get_conditional_G(num_trajectories, T, M, pi, s0):
    """Monte-Carlo estimate of the expected return given a fixed start state.

    Runs ``num_trajectories`` independent rollouts of length ``T`` under
    policy ``pi``, all starting from ``s0``, and returns the arithmetic mean
    of their discounted returns.
    """
    returns = (get_trajectory(T, M, pi, s0)[3] for _ in range(num_trajectories))
    return sum(returns) / num_trajectories

# Monte-Carlo evaluation of the optimal policy.
# NOTE: T, M and pi_opt are not defined in this cell; they come from earlier cells.
D = get_init_dist(n_S)  # fresh random initial-state distribution
num_trajectories = 1000
# Average return when the start state is drawn from D for every rollout.
avg_G_unconditional = get_unconditional_G(num_trajectories, T, M, pi_opt, D)
print(f"Unconditional Average G over D: {avg_G_unconditional}")

s0 = 0  # Specific starting state
# Average return when every rollout starts from the fixed state s0.
avg_G_conditional = get_conditional_G(num_trajectories, T, M, pi_opt, s0)
print(f"Conditional Average G for s0={s0}: {avg_G_conditional}")


Unconditional Average G over D: 4.056193366448799
Conditional Average G for s0=0: 3.783977090678108
