In [1]:
# Recall our Markov Decision Process (MDP) from last week:
# Flip a fair coin repeatedly: +3 points for Heads, +1 point for Tails.
# Stop at any time and collect your current point total as your payoff.
# But if you reach 6 or more points, your payoff is nothing.



# States (S): the running point totals 0..5, plus the terminal state 'DONE'.
# Actions (A): listed STOP first; the Bellman update below only replaces the
# best action on a strictly larger Q-value, so the earlier action wins ties.
# Discount factor (Gamma): 1 means future payoffs are not discounted.
S = list(range(6)) + ['DONE']
A = ['STOP', 'PLAY']
Gamma = 1


# Define the Transition function with three parameters:
# current state (s), action (a), new state (n)
def T(s,a,n):
    """Return the probability of landing in state n after taking action a in state s.

    s and n are point totals or the string 'DONE'; a is 'STOP' or 'PLAY'.
    The result is 1, 0.5 or 0. Any combination not described below has
    probability 0.
    """
    ends_game = (n == 'DONE')

    # Stopping moves to DONE with certainty, whatever the current state.
    if a == 'STOP':
        return 1 if ends_game else 0

    # Only PLAY remains to be handled; anything else has no transitions.
    if a != 'PLAY':
        return 0

    # DONE is absorbing: playing from DONE stays in DONE.
    if s == 'DONE':
        return 1 if ends_game else 0

    if ends_game:
        # Going bust (6 or more points): from 5 both flips bust,
        # from 3 or 4 only Heads (+3) does.
        if s == 5:
            return 1
        return 0.5 if s in (3, 4) else 0

    # Still in play: Tails adds 1 and Heads adds 3, each with probability 0.5.
    return 0.5 if (n - s) in (1, 3) else 0


# Define the Reward function with three parameters:
# current state (s), action (a), new state (n)
def R(s,a,n):
    """Return the reward for moving from state s to state n via action a.

    The only rewarded transition is stopping (a == 'STOP', n == 'DONE')
    from one of the point totals 0..5, which pays out s itself.
    Everything else, including going bust while playing, pays 0.
    """
    cashes_out = a == 'STOP' and n == 'DONE'
    if cashes_out and s in (0, 1, 2, 3, 4, 5):
        return s
    return 0


# Define the value table V[k][i] and the policy table P[k][i], where k is
# the iteration number and i is the position of the state in S. P records
# the best action found at each step of value iteration (this is not the
# separate "Policy Iteration" algorithm). Both tables have 101 rows; every
# entry starts at 0 (values) or '' (policies), and each row is its own list.
V = [[0] * len(S) for _ in range(101)]
P = [[''] * len(S) for _ in range(101)]


# Use the Bellman Equation to determine Row k+1 of
# tables V and P, using the information in Row k.
#
# One sweep is run for every row after Row 0, so the number of sweeps
# always matches the size of the tables.

for k in range(len(V) - 1):
    # enumerate gives each state's column directly; this matches
    # S.index(s) because the states in S are distinct.
    for i, s in enumerate(S):

        # Initialize bestValue and bestPolicy. Starting from -infinity
        # (rather than a fixed number such as -1) guarantees that the
        # first action examined is accepted whatever the sign of the
        # rewards, so 'UNKNOWN' only survives if A is empty.
        bestValue = float('-inf')
        bestPolicy = 'UNKNOWN'

        # For each State-Action pair (s,a), determine Q(s,a) using the
        # Bellman Equation by considering all new states n from state s.

        for a in A:
            qValue = 0
            for j, n in enumerate(S):
                qValue += T(s,a,n)*(R(s,a,n) + Gamma*V[k][j])

            # Strict comparison: on a tie the action listed first in A wins.
            if qValue > bestValue:
                bestValue = qValue
                bestPolicy = a

        V[k+1][i] = bestValue
        P[k+1][i] = bestPolicy
        
        
# Print the results of our Value iteration
print("Results of Value Iteration")

# Print successive rows until a row is identical to the one before it.
# The scan is bounded by the table length so that a run which never
# produces two exactly equal rows cannot index past the last row.
x = 1
while x < len(V) and V[x] != V[x-1]:
    print(V[x])
    x += 1

# If no repeated row was found, fall back to the last computed row.
x = min(x, len(V) - 1)

# Print the final (optimal) Policy
print("")
print(P[x])

# Output the results of our final iteration (the last row of each table,
# i.e. row 100 of the 101-row tables). zip pairs every state with its
# column in that row, which matches S.index(s) because states are distinct.
print("")
for s, action, value in zip(S, P[-1], V[-1]):
    print("In state", s, "the optimal action is", action, "with value", value)

Results of Value Iteration
[0, 1, 2, 3, 4, 5, 0]
[2.0, 3.0, 4.0, 3, 4, 5, 0]
[3.0, 4.0, 4.0, 3.0, 4.0, 5.0, 0.0]
[3.5, 4.0, 4.0, 3.0, 4.0, 5.0, 0.0]

['PLAY', 'PLAY', 'PLAY', 'STOP', 'STOP', 'STOP', 'STOP']

In state 0 the optimal action is PLAY with value 3.5
In state 1 the optimal action is PLAY with value 4.0
In state 2 the optimal action is PLAY with value 4.0
In state 3 the optimal action is STOP with value 3.0
In state 4 the optimal action is STOP with value 4.0
In state 5 the optimal action is STOP with value 5.0
In state DONE the optimal action is STOP with value 0.0
