# Dynamic Programming
- Can be used to "plan" i.e. $\pi^*$ or $V^*$ find when MDP $(S, P, d, R, A)$ is fully known
    - 1) Policy Evaluation / Prediction - for $\pi$, find $V^\pi$
    - 2) Policy Iteration / Control - Find $\pi^*$
    - 3) Value Iteration / Control -  Find $V^*$

In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (30, 10)
plt.rcParams['font.size']=20

# 2) Policy Iteration


- Given a MDP $(S, P, d, R, A)$ we want to find $\pi^*$
- Begin with an arbitrary policy $\pi_{0}$ and then fully evaluate: 
- Find $V^\pi_{0}(s)$, $Q^\pi_{k}(s,a)$ and update policy by $\pi_{k+1}(s) = argmax_{a} Q^{\pi_{k}}(s,a)$


In [39]:
# Simple MDP
S = np.array([0, 2]) 
A = np.array([100, 200])
P = np.array([[[0.5, 0.5],   # P given a=0
              [0.8, 0.2]], 
     
             [[0.2, 0.8],   # P given a=0
              [0.4, 0.6]]] 
 
             )
R = np.array([[10, 90], 
               [5, 30]])
d = 0.95
s0 = 0
T = 100

def discountedSum(R, d):
    sum = 0
    for i, r in enumerate(R):
        sum += R[i] * (d ** i)
    return sum

pi_old = np.array([[0.5, 0.5], 
               [0.5, 0.5]])
pi_new = np.empty((2,2))
Q = np.empty((A.shape[0], S.shape[0]))
V_new = np.zeros(S.shape[0])
V_old = np.ones(S.shape[0])
HS = np.zeros(T+1).astype(int)
HR = np.zeros(T).astype(int)
HA = np.zeros(T).astype(int)
HG = np.zeros(T).astype(int)
HV = np.zeros(T).astype(int)
HS[0] = s0

# Find V of pi
N = 1000 # iterations
for i_n in range(N):
    for i_s, s in enumerate(S):
        V_new[i_s] = np.dot(pi_old[i_s], R[:, i_s] + d * np.dot(P[:, i_s, :], V_old))
    V_old = V_new.copy()

# Find Q of pi
for i_s, s in enumerate(S):
    for i_a, a in enumerate(A):
        Q[i_a, i_s] = R[i_a, i_s] + d * np.sum(np.dot(P[i_a, i_s, :], V_new[i_s]))
        
# Update Poicy
for i_s, s in enumerate(S):
    tot = sum(Q[:, i_s] == np.max(Q[:, i_s]))
    pi_new[i_s] = np.where(Q[:, i_s] == np.max(Q[:, i_s]), 1,0)/tot
print(V_new)
print(Q)
print(pi_new)

[673.93939394 716.36363636]
[[650.24242424 770.54545455]
 [645.24242424 710.54545455]]
[[1. 0.]
 [1. 0.]]


In [40]:
S = np.array([0, 1, 2]) # three states
A = np.array([100, 200, 300]) # three decisions
P = np.array([[[0.5, 0.4, 0.1],   # P given a=0
              [0.8, 0.1, 0.1],
              [0.2, 0.1, 0.7]], 
     
             [[0.2, 0.2, 0.6],   # P given a=0
              [0.4, 0.3, 0.3],
              [0.5, 0.4, 0.1]], 
 
             [[0.1, 0.7, 0.2],   # P given a=0
              [0.05, 0.9, 0.05],
              [0.3, 0.2, 0.5]],
             ])
R = np.array([[10, 20, 90], 
               [5, 30, 80],
              [4, 4, 110]])
d = 0.95
s0 = 0
pi = np.array([[0.3, 0.5, 0.2], 
               [0.7, 0.2, 0.1], 
              [0.3, 0.4, 0.3]])

pi_old = np.array([[0.4, 0.3, 0.3], 
                  [0.4, 0.3, 0.3],
                  [0.4, 0.3, 0.3]])
pi_new = np.empty((A.shape[0], S.shape[0]))
Q = np.empty((A.shape[0], S.shape[0]))
V_new = np.zeros(S.shape[0])
V_old = np.ones(S.shape[0])
HS = np.zeros(T+1).astype(int)
HR = np.zeros(T).astype(int)
HA = np.zeros(T).astype(int)
HG = np.zeros(T).astype(int)
HV = np.zeros(T).astype(int)
HS[0] = s0

# Find V of pi
N = 1000 # iterations
for i_n in range(N):
    for i_s, s in enumerate(S):
        V_new[i_s] = np.dot(pi_old[i_s], R[:, i_s] + d * np.dot(P[:, i_s, :], V_old))
    V_old = V_new.copy()

# Find Q of pi
for i_s, s in enumerate(S):
    for i_a, a in enumerate(A):
        Q[i_a, i_s] = R[i_a, i_s] + d * np.sum(np.dot(P[i_a, i_s, :], V_new[i_s]))
        
# Update Poicy
for i_s, s in enumerate(S):
    tot = sum(Q[:, i_s] == np.max(Q[:, i_s]))
    pi_new[i_s] = np.where(Q[:, i_s] == np.max(Q[:, i_s]), 1,0)/tot
print(V_new)
print(Q)
print(pi_new)

[675.03598025 673.1805998  779.58380702]
[[651.28418123 659.52156981 830.60461667]
 [646.28418123 669.52156981 820.60461667]
 [645.28418123 643.52156981 850.60461667]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
