In [12]:
import numpy as np
import gym
env = gym.make('NChain-v0')

# Temporal-Difference-Learning-Action-State-Space

### NChain- environment

<img src="images/NChain-illustration.png">

# Dynamic Programming

# Temporal-Difference-Learning

### SARSA

“SARSA” refers to the procedure of updaing Q-value by following a sequence of $…,S_t,A_t,R_{t+1},S_{t+1},A_{t+1},…$. The idea follows the same route of GPI:

1. At time step $t$, we start from state $S_t$ and pick action according to $Q$ values, $A_t=\arg\max_{a\in A}Q(S_t,a)$; $\epsilon$-greedy is commonly applied.
2. With action $A_t$, we observe reward $R_{t+1}$ and get into the next state $S_{t+1}$.
3. Then pick the next action in the same way as in step 1.: $A_{t+1}=\arg\max_{a\in A}Q(S_{t+1},a)$.
4. Update the action-value function: $Q(S_t,A_t)=Q(S_t,A_t)+ \alpha(R_{t+1}+\gamma Q(S_{t+1},A_{t+1})−Q(S_t,A_t))$.   $t = t+1$ and repeat from step 1.

$Q(S_t, A_t) = Q(S_t, A_t) + \alpha (R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t))$

Q-Learning technique is an Off Policy technique and uses the greedy approach to learn the Q-value. SARSA technique, on the other hand, is an On Policy and uses the action performed by the current policy to learn the Q-value.

In [7]:
#Defining the different parameters 
epsilon = 0.9
total_episodes = 10000
max_steps = 100
alpha = 0.85
gamma = 0.95
  
#Initializing the Q-matrix 
Q = np.zeros((env.observation_space.n, env.action_space.n)) 

In [8]:
#Function to choose the next action 
def choose_action(state): 
    action=0
    if np.random.uniform(0, 1) < epsilon: 
        action = env.action_space.sample() 
    else: 
        action = np.argmax(Q[state, :]) 
    return action 
  
#Function to learn the Q-value 
def update(state, state2, reward, action, action2): 
    predict = Q[state, action] 
    target = reward + gamma * Q[state2, action2] 
    Q[state, action] = Q[state, action] + alpha * (target - predict) 

In [10]:
#Initializing the reward 
reward=0
  
# Starting the SARSA learning 
for episode in range(total_episodes): 
    t = 0
    state1 = env.reset() 
    action1 = choose_action(state1) 
  
    while t < max_steps: 
        #Getting the next state 
        state2, reward, done, info = env.step(action1) 
  
        #Choosing the next action 
        action2 = choose_action(state2) 
          
        #Learning the Q-value 
        update(state1, state2, reward, action1, action2) 
  
        state1 = state2 
        action1 = action2 
          
        #Updating the respective vaLues 
        t += 1
        reward += 1
          
        #If at the end of learning process 
        if done: 
            break

In [11]:
#Evaluating the performance 
print ("Performace : ", reward/total_episodes) 
  
#Visualizing the Q-matrix 
print(Q) 

Performace :  0.0001
[[19.45661715 16.9136222 ]
 [20.19172285 19.27388429]
 [32.76469682 21.216865  ]
 [18.48082876 21.25478373]
 [32.68564449 19.34836536]]


### Q function update

$Q(s, a) = Q(s, a) + \alpha (r + \gamma \max\limits_{a’} Q(s’, a’) – Q(s, a))$

In [13]:
def q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros((5, 2))
    y = 0.95
    lr = 0.8
    for i in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(q_table[s,:]) == 0:
                # make a random selection of actions
                a = np.random.randint(0, 2)
            else:
                # select the action with largest q value in state s
                a = np.argmax(q_table[s, :])
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + lr*(y*np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
    return q_table

In [14]:
def eps_greedy_q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros((5, 2))
    y = 0.95
    eps = 0.5
    lr = 0.8
    decay_factor = 0.999
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        done = False
        while not done:
            # select the action with highest cummulative reward
            if np.random.random() < eps or np.sum(q_table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(q_table[s, :])
            # pdb.set_trace()
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + lr * (y * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
    return q_table

In [15]:
def run_game(table, env):
    s = env.reset()
    tot_reward = 0
    done = False
    while not done:
        a = np.argmax(table[s, :])
        s, r, done, _ = env.step(a)
        tot_reward += r
    return tot_reward

In [16]:
def test_methods(env, num_iterations=100):
    winner = np.zeros((2,))
    for g in range(num_iterations):
        m1_table = q_learning_with_table(env, 500)
        m2_table = eps_greedy_q_learning_with_table(env, 500)
        m1 = run_game(m1_table, env)
        m2 = run_game(m2_table, env)
        w = np.argmax(np.array([m1, m2]))
        winner[w] += 1
        print("Game {} of {}".format(g + 1, num_iterations))
    return winner

In [17]:
env = gym.make('NChain-v0')
test_methods(env, num_iterations=100)

Game 1 of 100
Game 2 of 100
Game 3 of 100
Game 4 of 100
Game 5 of 100
Game 6 of 100
Game 7 of 100
Game 8 of 100
Game 9 of 100
Game 10 of 100
Game 11 of 100
Game 12 of 100
Game 13 of 100
Game 14 of 100
Game 15 of 100
Game 16 of 100
Game 17 of 100
Game 18 of 100
Game 19 of 100
Game 20 of 100
Game 21 of 100
Game 22 of 100
Game 23 of 100
Game 24 of 100
Game 25 of 100
Game 26 of 100
Game 27 of 100
Game 28 of 100
Game 29 of 100
Game 30 of 100
Game 31 of 100
Game 32 of 100
Game 33 of 100
Game 34 of 100
Game 35 of 100
Game 36 of 100
Game 37 of 100
Game 38 of 100
Game 39 of 100
Game 40 of 100
Game 41 of 100
Game 42 of 100
Game 43 of 100
Game 44 of 100
Game 45 of 100
Game 46 of 100
Game 47 of 100
Game 48 of 100
Game 49 of 100
Game 50 of 100
Game 51 of 100
Game 52 of 100
Game 53 of 100
Game 54 of 100
Game 55 of 100
Game 56 of 100
Game 57 of 100
Game 58 of 100
Game 59 of 100
Game 60 of 100
Game 61 of 100
Game 62 of 100
Game 63 of 100
Game 64 of 100
Game 65 of 100
Game 66 of 100
Game 67 of 100
Game

array([20., 80.,  0.])