In [1]:
# Recall our Markov Decision Process (MDP) from last week:
# +3 points for Heads, +1 point for Tails
# Stop at any time and collect that many points as your payoff.
# But if you get 6 or more points, your payoff is nothing.


# Building blocks of the MDP.
# S: every score the player can hold (0 through 5) followed by the terminal
#    marker 'DONE'.
# A: the two moves available on each turn.
# Gamma: discount factor applied to the value of the next state.
S = [*range(6), 'DONE']
A = ['STOP', 'PLAY']
Gamma = 1


# Define the Transition function with three parameters:
# current state (s), action (a), new state (n)
def T(s,a,n):
    """Return the probability of landing in state n after taking action a in state s.

    Stopping leads to 'DONE' with certainty.  Playing from 'DONE' stays in
    'DONE'.  Playing from a numeric score adds 1 or 3 with probability 0.5
    each; a result that is not one of the numeric states is represented as
    'DONE' (certain from 5, a 0.5 chance from 3 or 4).  Every other
    combination has probability 0.
    """
    ends_game = (n == 'DONE')

    # Cashing out always terminates the game.
    if a == 'STOP' and ends_game:
        return 1

    # Anything that is not a PLAY (including STOP to a non-terminal state)
    # is an impossible transition.
    if a != 'PLAY':
        return 0

    # 'DONE' is absorbing.
    if s == 'DONE':
        return 1 if ends_game else 0

    if ends_game:
        # Chance that the flip pushes the score past the last numeric state.
        if s == 5:
            return 1
        if s in (3, 4):
            return 0.5
        return 0

    # Ordinary move between two numeric scores: +1 or +3, equally likely.
    return 0.5 if (n - s) in (1, 3) else 0


# Define the Reward function with three parameters:
# current state (s), action (a), new state (n)
def R(s,a,n):
    """Return the reward for moving from state s to state n via action a.

    The only paying transition is stopping (a == 'STOP', n == 'DONE') while
    holding a numeric score, which pays out that score.  Everything else,
    including every PLAY move, pays 0.
    """
    # Not a cash-out: nothing is paid.
    if a != 'STOP' or n != 'DONE':
        return 0
    # Cash-out: the payoff is the score itself ('DONE' holds no score).
    return s if s in (0, 1, 2, 3, 4, 5) else 0


# Define the Value Iteration table V[k][s] and the Policy table P[k][s]
# (the greedy action chosen for each state at sweep k), with 101 rows.
# Row 0 is the all-zero starting estimate; each later row is computed from
# the row before it.
V = [ [0 for s in S] for k in range(101)]
P = [ ['' for s in S] for k in range(101)]


# Use the Bellman Equation to determine Row k+1 of
# tables V and P, using the information in Row k.

for k in range(len(V) - 1):
    for i, s in enumerate(S):

        # Initialize bestValue and bestPolicy.  Starting from -infinity
        # (rather than a fixed number such as -1) guarantees that the first
        # action evaluated always becomes the incumbent, whatever the sign
        # or size of the rewards.
        bestValue = float('-inf')
        bestPolicy = 'UNKNOWN'

        # For each State-Action pair (s,a), determine Q(s,a) using the
        # Bellman Equation by considering all new states n from state s.

        for a in A:
            qValue = 0
            for j, n in enumerate(S):
                qValue += T(s,a,n)*(R(s,a,n) + Gamma*V[k][j])
            # Strict comparison: on a tie the action listed first in A wins.
            if qValue > bestValue:
                bestValue = qValue
                bestPolicy = a

        V[k+1][i] = bestValue
        P[k+1][i] = bestPolicy


# Print the results of our Value iteration: one row per sweep until two
# consecutive rows agree.  The bound on x keeps the scan inside the table
# even if the values never settle.
print("Results of Value Iteration")
x=1
while x < len(V) and V[x] != V[x-1]:
    print(V[x])
    x+=1
x = min(x, len(V) - 1)

# Print the final (optimal) Policy
print("")
print(P[x])

# Output the results of our last (100th) iteration.
print("")
last = len(V) - 1
for i, s in enumerate(S):
    print("In state", s, "the optimal action is", P[last][i], "with value", V[last][i])

Results of Value Iteration
[0, 1, 2, 3, 4, 5, 0]
[2.0, 3.0, 4.0, 3, 4, 5, 0]
[3.0, 4.0, 4.0, 3.0, 4.0, 5.0, 0.0]
[3.5, 4.0, 4.0, 3.0, 4.0, 5.0, 0.0]

['PLAY', 'PLAY', 'PLAY', 'STOP', 'STOP', 'STOP', 'STOP']

In state 0 the optimal action is PLAY with value 3.5
In state 1 the optimal action is PLAY with value 4.0
In state 2 the optimal action is PLAY with value 4.0
In state 3 the optimal action is STOP with value 3.0
In state 4 the optimal action is STOP with value 4.0
In state 5 the optimal action is STOP with value 5.0
In state DONE the optimal action is STOP with value 0.0


In [2]:
# Temporal Difference Learning
#
# Estimate the value of each state under a fixed policy by sampling one
# transition per state per sweep and blending the sampled return into the
# running estimate.

import random


# Let P be the optimal policy, found from the above MDP.
S = [0, 1, 2, 3, 4, 5, 'DONE']
P = ['PLAY', 'PLAY', 'PLAY', 'STOP', 'STOP', 'STOP', 'STOP']


# Initialize the Values Array V and set a Learning Rate of Alpha=0.5
V = [ [0 for s in S] for k in range(1000)]
Alpha = 0.5


# Run TD Learning.  The coin flips are not seeded, so the printed numbers
# differ from one run to the next.

for k in range(4):
    print("")
    print(k, V[k])

    # For each state (s), determine the action (a) based on the optimal policy
    # Run an experiment: find the new state (n), calculate the reward, and
    # generate the Sample Value.  Use this sample to update V(s)

    for i, s in enumerate(S):

        a = P[i]
        coin = 'NONE'

        # Stopping, or being in the terminal state already, leads to 'DONE'.
        if a == 'STOP' or s == 'DONE':
            n = 'DONE'
        else:
            if random.random() <0.5:
                coin = 'TAILS'
                n = s+1
            else:
                coin = 'HEADS'
                n = s+3
            # A score that is not one of the numeric states is a bust, which
            # the model represents as 'DONE'.  Without this, a policy that
            # plays from 3 or above would fail the S.index lookup below.
            if n not in S: n = 'DONE'

        j = S.index(n)
        sample = R(s,a,n) + Gamma*V[k][j]


        # The new estimate for V(i) is equal to (1-Alpha) times the old estimate
        # for V(i) plus (Alpha) times the result of our sample.  If Alpha = 0.5,
        # then these two results have the same weight of 50%.
        V[k+1][i]=(1-Alpha)*V[k][i] + Alpha*sample

        print("State", s, "Action", a, "Flip", coin, "Updated Value", V[k+1][i])


0 [0, 0, 0, 0, 0, 0, 0]
State 0 Action PLAY Flip TAILS Updated Value 0.0
State 1 Action PLAY Flip TAILS Updated Value 0.0
State 2 Action PLAY Flip TAILS Updated Value 0.0
State 3 Action STOP Flip NONE Updated Value 1.5
State 4 Action STOP Flip NONE Updated Value 2.0
State 5 Action STOP Flip NONE Updated Value 2.5
State DONE Action STOP Flip NONE Updated Value 0.0

1 [0.0, 0.0, 0.0, 1.5, 2.0, 2.5, 0.0]
State 0 Action PLAY Flip TAILS Updated Value 0.0
State 1 Action PLAY Flip HEADS Updated Value 1.0
State 2 Action PLAY Flip TAILS Updated Value 0.75
State 3 Action STOP Flip NONE Updated Value 2.25
State 4 Action STOP Flip NONE Updated Value 3.0
State 5 Action STOP Flip NONE Updated Value 3.75
State DONE Action STOP Flip NONE Updated Value 0.0

2 [0.0, 1.0, 0.75, 2.25, 3.0, 3.75, 0.0]
State 0 Action PLAY Flip TAILS Updated Value 0.5
State 1 Action PLAY Flip TAILS Updated Value 0.875
State 2 Action PLAY Flip TAILS Updated Value 1.5
State 3 Action STOP Flip NONE Updated Value 2.625
State 4 

In [3]:
# Q Learning
#
# Learn Q[k][state][action] directly from sampled transitions: every sweep
# tries each action once in each state and blends the sampled return into
# the previous estimate.

# Imported here as well so this cell does not depend on an earlier cell
# having brought `random` into scope.
import random

Q = [ [[0 for a in A] for s in S] for k in range(1000)]

Alpha = 0.5

print(A)
print(S)

for k in range(4):
    print("")
    print(k, Q[k])

    # i and b are the positions of s in S and of a in A,
    # i.e. 'STOP' has b=0 and 'PLAY' has b=1.
    for i, s in enumerate(S):
        for b, a in enumerate(A):
            coin = 'NONE'

            if a == 'STOP' or s == 'DONE':
                n = 'DONE'
            else:
                if random.random() <0.5:
                    coin = 'TAILS'
                    n = s+1
                else:
                    coin = 'HEADS'
                    n = s+3
                # A score that is not one of the numeric states is a bust,
                # represented as 'DONE'.
                if n not in S: n = 'DONE'

            j = S.index(n)

            # Calculate the sample and update the (k+1)th row of the Q table.
            # The sample is the reward plus the discounted value of the best
            # action in the new state; max() runs over the whole action row,
            # so it stays correct for any number of actions in A.

            sample = R(s,a,n) + Gamma*max(Q[k][j])

            Q[k+1][i][b] = (1-Alpha)*Q[k][i][b] + Alpha*sample

            print("State", s, "Action", a, "Flip", coin, "Updated Value", Q[k+1][i][b])

    

['STOP', 'PLAY']
[0, 1, 2, 3, 4, 5, 'DONE']

0 [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
State 0 Action STOP Flip NONE Updated Value 0.0
State 0 Action PLAY Flip HEADS Updated Value 0.0
State 1 Action STOP Flip NONE Updated Value 0.5
State 1 Action PLAY Flip HEADS Updated Value 0.0
State 2 Action STOP Flip NONE Updated Value 1.0
State 2 Action PLAY Flip TAILS Updated Value 0.0
State 3 Action STOP Flip NONE Updated Value 1.5
State 3 Action PLAY Flip HEADS Updated Value 0.0
State 4 Action STOP Flip NONE Updated Value 2.0
State 4 Action PLAY Flip HEADS Updated Value 0.0
State 5 Action STOP Flip NONE Updated Value 2.5
State 5 Action PLAY Flip HEADS Updated Value 0.0
State DONE Action STOP Flip NONE Updated Value 0.0
State DONE Action PLAY Flip NONE Updated Value 0.0

1 [[0.0, 0.0], [0.5, 0.0], [1.0, 0.0], [1.5, 0.0], [2.0, 0.0], [2.5, 0.0], [0.0, 0.0]]
State 0 Action STOP Flip NONE Updated Value 0.0
State 0 Action PLAY Flip HEADS Updated Value 0.75
State 1 Action STOP Flip N