### Notebook to complete in class

In [1]:
import random
import numpy as np
import gym
import matplotlib
import matplotlib.pyplot as plt

from Plotting import plot

#### Grace Lindsay Example

<img src="./imgs/Lindsay.png" width="350px" align="left">

In [2]:
# Tabular description of the example environment, indexed by integer state id.

# S: state id -> display label.
S = {
    0: "GRE",
    1: "COU",
    2: "QUE",
    3: "TIM",
    4: "34t",
    5: "14t",
    6: "MET",
    7: "UNI",
    8: "BLE",
}

# A: state id -> list of action indices available in that state
# (an empty list marks a state with no outgoing action).
A = {
    0: [0, 1],
    1: [0, 1],
    2: [0],
    3: [0, 1],
    4: [0],
    5: [0],
    6: [0],
    7: [0],
    8: [],
}

# T: state id -> successor state ids, aligned with A (T[s][a] is the state
# reached by taking action a in state s).
T = {
    0: [1, 6],
    1: [2, 3],
    2: [4],
    3: [4, 5],
    4: [8],
    5: [7],
    6: [7],
    7: [8],
    8: [],
}

# R: state id -> reward associated with that state.
R = {
    0: 0,
    1: 5,
    2: 5,
    3: 40,
    4: 20,
    5: 8,
    6: 10,
    7: 12,
    8: 0,
}

# Q-table: one entry per (state, action) pair, all starting at zero.
Q = {(state, action): 0 for state in S for action in A[state]}

def reset():
    """Return the id of the state every episode starts from (always 0)."""
    start_state = 0
    return start_state
def step(s,a):
    """Apply action `a` in state `s` using the module-level tables T, R and A.

    Returns a tuple (s_next, r, done) where s_next is T[s][a], r is R[s_next]
    (the reward of the state that is entered) and done is True when s_next
    has no available actions.
    """
    s_next = T[s][a]
    reward = R[s_next]
    done = A[s_next] == []
    return s_next, reward, done
    
def printQ(Q):
    """Print one value per state that has at least one action.

    The value shown for state s is R[s] + gamma * max_a Q[(s, a)], rounded to
    two decimals. Reads the module-level S, A, R and gamma, so gamma must be
    defined before this function is called.
    """
    print("\nQ value table:\n---------------")
    for s, label in S.items():
        actions = A[s]
        if len(actions) == 0:
            continue                      # nothing to maximise over
        best_q = max(Q[(s, a)] for a in actions)
        print(label, ":", round(R[s] + gamma*best_q, 2))

# List, for every state, the labels of the states reachable from it.
print("Here the transition edges:\n--------------------------")
for i, s in S.items():
    reachable = [S[T[i][a]] for a in A[i]]
    print(s, ":", reachable)

Here the transition edges:
--------------------------
GRE : ['COU', 'MET']
COU : ['QUE', 'TIM']
QUE : ['34t']
TIM : ['34t', '14t']
34t : ['BLE']
14t : ['UNI']
MET : ['UNI']
UNI : ['BLE']
BLE : []


In [3]:
alpha = 0.2   # step size applied to the TD error in episode()
gamma = 0.8   # discount factor used in episode() and printQ()

def episode(bPrint=False):
    """Run one episode with uniformly random actions and update Q in place.

    Starts from reset() and repeatedly picks a random action among A[s],
    calls step(), and applies the Q-learning update

        Q[(s, a)] += alpha * (r + gamma * max_a' Q[(s_next, a')] - Q[(s, a)])

    until step() reports that the episode is done.

    The update is applied on every transition, including the one that ends
    the episode. For that last transition the bootstrap term is 0, so the
    target is just the reward r. (Previously the terminal transition was
    skipped, so a non-zero reward on the final step could never be learned.)

    Parameters
    ----------
    bPrint : bool
        When True, print a one-line trace of every transition.
    """
    s = reset()
    done = False
    while not done:    # trials loop
        a = random.choice(A[s])
        s_next, r, done = step(s,a)      # transition function from environment

        # A terminal state has no actions to maximise over: bootstrap from 0.
        q_max_S_next = 0 if done else max(Q[(s_next, a_)] for a_ in A[s_next])

        TD_error = r + gamma*q_max_S_next - Q[(s,a)]
        Q[(s,a)] = Q[(s,a)] + alpha*TD_error

        if bPrint: print("Was in", s, "did action", a, "got in", s_next, "and made %d$"%r, "and I'm done" if done else "")
        s = s_next

# Learn from 1000 random-action episodes, then show the resulting values.
n_episodes = 1000
for _ in range(n_episodes):
    episode()

printQ(Q)


Q value table:
---------------
GRE : 39.84
COU : 49.8
QUE : 21.0
TIM : 56.0
34t : 20.0
14t : 17.6
MET : 19.6
UNI : 12.0


### Plotting Exercise
Make a plot showing how the TD error evolves during learning (choose whichever view you find most informative). You will need to modify the previous function `episode()` so that it records the TD errors. You can use the function `plot` from `Plotting.py` (imported above, and already used in the AddictionRedish notebook): <BR>

`def plot(td_list, ax=None, color=None, label=None, title=None, xlabel='Updates', ylabel='Error', ylim=None, marker=None)` <br>

and can be called with a list of values: <br>
    
`plot(td_errors)`

### Introducing a Q function class

We defined the Q function as a dictionary. We are now going to define the Q function as a class with two main functions "predict" and "update".

In [4]:
class Q_function():
    """Tabular Q function storing one list of action values per state.

    States are created lazily: the first time a state is read or written it
    gets a list of `nactions` zeros. States can be any hashable object.
    """

    def __init__(self, env):
        """Create an empty table; `env` must expose an integer `nactions`."""
        self.nactions = env.nactions
        self.f = {}                    # state -> list of nactions values

    def _values(self, s):
        """Return the value list for state s, creating it (all zeros) if new."""
        if s not in self.f:
            self.f[s] = [0]*self.nactions
        return self.f[s]

    def predict(self, s, a=None):  # get value of a state (s) and action (a)
        """Return Q(s, a), or the whole list of action values when a is None."""
        values = self._values(s)
        return values if a is None else values[a]

    def update(self, s, a, y):     # update pair state, action (s, a) with y
        """Overwrite Q(s, a) with y.

        Works for states that were never queried before (previously this
        raised KeyError unless predict(s) had been called first).
        """
        self._values(s)[a] = y

### Introducing an Object Oriented class: Q_function Exercises

In [5]:
# Define a Q_function and use it to store the following values:
#   States are ['BCN', 'MAD', 'BIL', 'VAL'],
#   all states have initial value 0 except BCN, which has value 10.
#   Then query the Q_function for some states, for example BCN and MAD.


In [6]:
# Redefine the previous function episode() with the new class Q_function


### Introducing a policy class

We define a Policy class so that we can implement epsilon-greedy action selection.

In [7]:
class Policy():
    """Epsilon-greedy policy over a Q function.

    Parameters
    ----------
    Q : dict or object with predict(state)
        Either a dictionary keyed by (state, action) or an object whose
        predict(state) returns a sequence of values indexed by action.
    actions : dict, optional
        Mapping state -> list of valid action indices. Defaults to the
        module-level table A, so Policy(Q) keeps working as before.
    """

    def __init__(self, Q, actions=None):
        self.Q = Q
        self.actions = A if actions is None else actions
        # Largest number of actions available in any single state.
        self.nactions = max((len(acts) for acts in self.actions.values()), default=0)

    def act(self, state, epsilon):
        """Return an action for `state`.

        With probability `epsilon` a valid action is drawn uniformly at
        random (explore); otherwise the valid action with the highest Q value
        is returned (exploit). Only actions listed for `state` are ever
        returned.

        Raises ValueError if `state` has no available actions.
        """
        valid = self.actions[state]
        if len(valid) == 0:
            raise ValueError("state %r has no available actions" % (state,))

        # Use the Q function this policy was built with, not a global one.
        if isinstance(self.Q, dict):
            q_values = [self.Q[(state, a)] for a in valid]
        else:
            all_values = self.Q.predict(state)
            q_values = [all_values[a] for a in valid]

        if np.random.uniform() < epsilon:
            return valid[np.random.randint(len(valid))]   # explore
        else:
            return valid[int(np.argmax(q_values))]        # exploit

In [8]:
# Define a Policy and test it. For example, find out what to do in state 0 (GRE).


In [9]:
# Redefine the previous function episode() with the new class Policy


In [10]:
# Evaluate what the improvement in learning speed is when using a greedy policy
