Immediate Reward Prediction (Stateless environment, only one single state)

In [393]:
import random

A = ['L','R']
Q = {'L':0, 'R':0}           # Q : A -> Value 
alfa = 0.1
R = {'L':1, 'R':0}    # The reward function belongs to the environment and is invisible to the agent 
print("Q:", Q)

Q: {'L': 0, 'R': 0}


In [189]:
random.choice(A)

'L'

In [205]:
for _ in range(100):
    a = random.choice(A)
    Q[a] = Q[a] + alfa*(R[a] - Q[a])
print(Q)

{'L': 0.992144832788721, 'R': 0.0}


Lets change the reward to the other arm of the T-Maze

In [206]:
R = {'L':0, 'R':1}

In [208]:
for _ in range(1):
    a = random.choice(A)
    Q[a] = Q[a] + alfa*(R[a] - Q[a])
    print("Done", a, "  --> TD error: ", R[a] - Q[a])
print(Q)

Done L   --> TD error:  -0.8929303495098488
{'L': 0.8929303495098488, 'R': 0.1}


Environment with states and future discounted reward prediction

In [431]:
S = ['A','B','C']
R = {}
Q = {}
for s in S:
    for a in A:
        Q[(s,a)] = 0
        R[(s,a)] = 0

R[('B','L')] = 1
R[('C','R')] = -10
gamma = 0.9

print("Q:",Q)
print("R:",R)

Q: {('A', 'L'): 0, ('A', 'R'): 0, ('B', 'L'): 0, ('B', 'R'): 0, ('C', 'L'): 0, ('C', 'R'): 0}
R: {('A', 'L'): 0, ('A', 'R'): 0, ('B', 'L'): 1, ('B', 'R'): 0, ('C', 'L'): 0, ('C', 'R'): -10}


In [558]:
print("Q Before:", Q)

for _ in range(1):    # trials loop    
    # one single trial
    
    # decision on first junction
    S_current = 'A'
    a = random.choice(A)
    print("Was in", S_current, "did action", a)
    S_next = 'B' if a == 'L' else 'C'  # transition function from environment
    
    q_max_S_next = max([Q[(S_next, a_)] for a_ in A])
    TD_error = R[(S_current,a)] + gamma*q_max_S_next - Q[(S_current,a)]
    Q[(S_current,a)] = Q[(S_current,a)] + alfa*TD_error

    S_current = S_next
    # decision on second junction
    a = random.choice(A)
    print("Was in", S_current, "did action", a)
    Q[(S_current,a)] = Q[(S_current,a)] + alfa*(R[(S_current,a)] - Q[(S_current,a)])

print("Q After:", Q)

Q Before: {('A', 'L'): 0.4240860395391629, ('A', 'R'): -4.225637961664771, ('B', 'L'): 0.9721871610556306, ('B', 'R'): 0.0, ('C', 'L'): 0.0, ('C', 'R'): -9.65663161797075}
Was in A did action L
Was in B did action R
Q After: {('A', 'L'): 0.42542585783275, ('A', 'R'): -4.225637961664771, ('B', 'L'): 0.9721871610556306, ('B', 'R'): 0.0, ('C', 'L'): 0.0, ('C', 'R'): -9.65663161797075}
