In [1]:
import numpy as np
import matplotlib as plt

# Rescorla-Wagner Model of Classical Conditioning

### Step 1. Initializing association weights $w \in \mathbb{R}^n$

In [2]:
# Scratch check: draw two samples with np.random.uniform() using its default bounds.
# No seed is set above, so the displayed numbers change from run to run.
[np.random.uniform() for _ in range(2)]

[0.7099717143963378, 0.8007649148606881]

In [3]:
# Initialise the association weights: one weight per conditioned stimulus (CS).
n_feat = 2  # number of CSs, e.g. bell and light
# Near-zero random start (uniform draw capped at 1e-4). The size follows n_feat
# so the weight vector stays in sync if the number of stimuli is changed.
w = np.random.uniform(high=0.0001, size=n_feat)
print("w:", np.round(w, 2))

w: [0. 0.]


### Step 2. Defining value of a Stimulus vector $x$
Given a weight vector $w$ and a stimulus vector $x$, the value of the stimulus is the weighted sum $V(x) = w^\top x = \sum_i w_i x_i$.

In [4]:
# Stimulus vector with only the bell present. It is built as an ndarray straight
# away (a plain Python list would not support element-wise NumPy arithmetic).
x = np.array([1, 0])
# Per-stimulus contribution w_i * x_i, followed by their sum (the value of x).
l = [w[k] * x_k for k, x_k in enumerate(x)]
print(np.round(l, 2))
print(np.round(np.sum(l), 2))

[0. 0.]
0.0


In [5]:
# define a function, call it value (receives as input w and x)
def value(w, x):
    """Return the summed prediction for stimulus ``x`` under weights ``w``.

    Computes sum_i w[i] * x[i] by walking over the entries of ``w`` and
    indexing ``x`` at the same position.

    Args:
        w: iterable of association weights.
        x: indexable stimulus vector; it is indexed at every position of ``w``.

    Returns:
        The weighted sum (0 when ``w`` is empty).
    """
    return sum(weight * x[idx] for idx, weight in enumerate(w))

# Sanity check of value(): weights [1, 0.5] with both stimuli present (x = [1, 1])
# should give 1*1 + 0.5*1 = 1.5.
print("Value: ",value(w=[1,0.5],x=[1,1]))

Value:  1.5


### Step 3. Defining associative weight update $\Delta w$
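Rescorla-Wagner rule: $\Delta w = \alpha \, (r - w^\top x) \, x$, where $x$ is the stimulus vector, $r$ the reward and $\alpha$ the learning rate; `n_times` is the number of updates (trials).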

In [6]:
# define the weight update 
def update_rw(w, x, r=1, alpha=0.5, n_times=1):
    """Apply the Rescorla-Wagner update ``n_times`` in a row and return the weights.

    Each trial computes the prediction error ``r - value(w, x)`` and shifts the
    weights along ``x`` by ``alpha`` times that error.

    Args:
        w: current association weights.
        x: stimulus vector (list or array); converted to an ndarray internally.
        r: reward delivered on every trial.
        alpha: learning rate.
        n_times: number of trials to run.

    Returns:
        The weights after the last trial. ``w`` is rebound to a new array on
        each trial rather than modified in place.
    """
    stimulus = np.array(x)
    for _trial in range(n_times):
        prediction_error = r - value(w, stimulus)
        step = alpha * prediction_error
        # Plain addition (not +=) so the object passed in by the caller is untouched.
        w = w + step * stimulus
    return w

### Step 4. Define Classical Conditioning paradigms

In [7]:
# Forward blocking, step by step:
#   1) start from zero weights
#   2) 10 trials of A -> +
#   3) 10 trials of the compound AB -> +
#   4) probe the value of B on its own
w = np.zeros(2, dtype=int)

# Phase 1: A -> + (only the first stimulus is present)
x = [1, 0]
w = update_rw(w, x, r=1, n_times=10)
print("> w after A->+:", np.round(w, 4))

# Phase 2: AB -> + (both stimuli are present)
x = [1, 1]
w = update_rw(w, x, r=1, n_times=10)
print("> w after AB->+", np.round(w, 3))

# Probe: B alone. A already predicts the reward after phase 1, so little
# prediction error is left over for B to absorb during phase 2.
print("Value of B->?", np.round(value(w, [0, 1]), 2))

> w after A->+: [0.999 0.   ]
> w after AB->+ [1. 0.]
Value of B->? 0.0


In [8]:
def overexpectation():
    """Simulate overexpectation with the Rescorla-Wagner update, printing each phase.

    A and B are first paired with the reward separately, then presented as the
    compound AB -> +. The value of B is probed before and after the compound
    phase. Relies on the notebook-level ``n_feat``, ``update_rw`` and ``value``.
    """
    print("overexpectation -------")
    weights = np.random.uniform(high=0.001, size=n_feat)
    print("> w init:", np.round(weights, 2))

    b_alone = [0, 1]
    # Single-stimulus phases: A -> + first, then B -> +, 10 trials each.
    single_cs_phases = (
        ("> w after A->+:", [1, 0]),
        ("> w after B->+:", b_alone),
    )
    for report, stimulus in single_cs_phases:
        weights = update_rw(weights, stimulus, n_times=10)
        print(report, np.round(weights, 2))
    print("Value of B->?", np.round(value(weights, b_alone), 2))

    # Compound phase: A and B are presented together with the same reward.
    weights = update_rw(weights, [1, 1], n_times=10)
    print("> w after AB->+", np.round(weights, 2))
    print("Value of B->?", np.round(value(weights, b_alone), 2))


overexpectation()

overexpectation -------
> w init: [0. 0.]
> w after A->+: [1. 0.]
> w after B->+: [1. 1.]
Value of B->? 1.0
> w after AB->+ [0.5 0.5]
Value of B->? 0.5


In [9]:
def overshadowing():
    """Simulate overshadowing with the Rescorla-Wagner update, printing each step.

    Both stimuli are trained only as the compound AB -> +, after which B is
    probed alone. Relies on the notebook-level ``n_feat``, ``update_rw`` and
    ``value``.
    """
    print("overshadowing -------")
    weights = np.random.uniform(high=0.001, size=n_feat)
    print("> w init:", np.round(weights, 2))

    # Compound training: A and B always appear together, 10 trials.
    compound = [1, 1]
    weights = update_rw(weights, compound, n_times=10)
    print("> w after AB->+", np.round(weights, 2))

    # Probe B on its own.
    b_alone = [0, 1]
    print("Value of B->?", np.round(value(weights, b_alone), 2))


overshadowing()

overshadowing -------
> w init: [0. 0.]
> w after AB->+ [0.5 0.5]
Value of B->? 0.5


# Temporal Difference Learning

### Solution 1. Future discounted reward prediction within R&W 
Function approximation: $V_t = w^\top \phi(s_t)$, with indicator features $x_t = \phi(s_t) \in \mathbb{R}^n$ that set a 1 at the index of the current state (one-hot encoding): $x_t(t) = 1$.

In [10]:
# define the weight update 
def update_td(w, x, x_next, r=1, alpha=0.5, gamma=0.8, n_times=1):
    """Apply the temporal-difference weight update ``n_times`` and return the weights.

    The target of each trial is ``r + gamma * value(w, x_next)``; the weights
    are shifted along ``x`` by ``alpha`` times the difference between that
    target and ``value(w, x)``.

    Args:
        w: current association weights.
        x: feature vector of the current state; converted to an ndarray internally.
        x_next: feature vector of the next state (passed to ``value`` as given).
        r: reward received on the transition.
        alpha: learning rate.
        gamma: factor applied to the value of the next state.
        n_times: number of trials to run.

    Returns:
        The weights after the last trial. ``w`` is rebound to a new array on
        each trial rather than modified in place.
    """
    current = np.array(x)
    for _trial in range(n_times):
        target = r + gamma * value(w, x_next)
        td_error = target - value(w, current)
        w = w + alpha * td_error * current
    return w

Second Order Conditioning is now captured:

In [11]:
# Second-order conditioning: train B->+, then A->B, then probe A.
w = [0, 0]

# One-hot feature vectors for the two stimuli.
A = [1, 0]
B = [0, 1]

# Phase 1 (B->+): B is followed by the reward; the next state is a dummy
# all-zero vector, so it contributes no value to the target.
w = update_td(w, x=B, x_next=[0, 0], r=1, n_times=10)
print("w after B->+", w)

# Phase 2 (A->B): A is followed by B without reward, so A can only gain value
# through the discounted value of B.
w = update_td(w, x=A, x_next=B, r=0, gamma=.8, n_times=100)
print("w after A->B", w)

# Probe A on its own after training.
print("Value of A->?", np.round(value(w, A), 2))   # Now second order conditioning works!



w after B->+ [0.         0.99902344]
w after A->B [0.79921875 0.99902344]
Value of A->? 0.8


### Solution 2. Global states, no feature approximation

In [12]:
# Set-up for the tabular TD example
alpha = 0.5            # learning rate
gamma = 0.8            # factor applied to the value of the next state
t = 0                  # time-step index, takes the values 0, 1, 2
R = [0, 1, 0]          # reward at time t, in the order A, B, dummy next state
S = [0, 1, 2]          # integer-coded states: A=0, B=1, 2 = dummy next state
A, B = S[0], S[1]
V = np.zeros(len(S))   # one stored value per state instead of an association weight vector

In [13]:
# Run the two training phases in order: B->+ first (t = B), then A->B (t = A).
# Each phase applies 10 tabular TD updates to V[t] and then reports the value.
for t, report in (
    (B, "Value of B after B->+ (V[B]):"),
    (A, "Value of A after A->B (V[A]):"),
):
    for _ in range(10):
        # Target is the reward at t plus the gamma-scaled value of the following state.
        td_error = R[t] + gamma*V[t+1] - V[t]
        V[t] = V[t] + alpha*td_error

    print(report, V[t])



Value of B after B->+ (V[B]): 0.9990234375
Value of A after A->B (V[A]): 0.7984382629394532
