# TX00DQ05-3001 Exercises 3

In [1]:
import numpy as np
import numpy.linalg as LA
import random

## Exercise 1: Sample behaviour of an MDP

Let's take (again) a look at Sutton & Barto example 4.1 gridworld. On each iteration start at every (non-terminating) state and sample actions in succeeding states by selecting them from uniform distribution (each action - up, down, left, right - is equally probable). Run the episode until terminal state is encountered. Collect statistics to calculate average number of steps needed before completion for each start state. Should this number match with something you have seen earlier in the exercises?

In [2]:
# Shared by all exercises: the 4x4 gridworld layout and its moves.

# Every cell as a (row, column) pair, enumerated row by row.
STATES = [divmod(cell, 4) for cell in range(4 * 4)]
N_STATES = len(STATES)
# Reaching either of these two corner cells ends an episode.
STATES_TERMINAL = [(0, 0), (3, 3)]
STATES_NON_TERMINAL = [
    cell for cell in STATES if cell not in STATES_TERMINAL
]

# Moves are (row, column) offsets that get added to a state.
UP = np.array([-1, 0])
DOWN = np.array([1, 0])
LEFT = np.array([0, -1])
RIGHT = np.array([0, 1])

# ACTIONS and ACTION_LABELS are parallel: one index selects move and label.
ACTIONS = (UP, DOWN, LEFT, RIGHT)
ACTION_LABELS = ('↑', '↓', '←', '→')
N_ACTIONS = len(ACTIONS)

def is_terminal(state):
    """Return True when `state` equals one of the entries of STATES_TERMINAL."""
    for terminal in STATES_TERMINAL:
        if terminal == state:
            return True
    return False

In [3]:
# YOUR CODE

# A rather general (and slow) approach.
# I thought it was neat - until I re-read the task,
# which asks for the number of steps, not the -1 rewards
# used in last week's exercises.
# So I just flip the sign of the reward (+1 per step)
# and keep the rest this general...

def take_action(state, action):
    """Apply `action` to `state` and return `(new_state, reward)`.

    `state` must not be terminal (checked with an assertion).  Coordinates
    that would fall outside 0..3 are clipped back into that range, so a move
    off the grid leaves the affected coordinate unchanged.  The reward is a
    constant +1, which makes an undiscounted return equal a step count.
    """
    assert not is_terminal(state)
    step_reward = 1
    moved = state + action
    # Same treatment of off-grid moves as before: clamp onto the grid.
    clipped = np.clip(moved, 0, 3)
    return tuple(clipped), step_reward

def MC_pathlengths(maxiters):
    """Estimate the mean first-visit return of every non-terminal state.

    Each of the `maxiters` iterations starts one episode from every
    non-terminal state.  Actions are drawn uniformly from ACTIONS and the
    episode runs until a terminal state is reached.  For every state, the
    undiscounted return following its first visit in an episode is recorded.

    Returns a dict mapping state -> average of the recorded returns.  With
    a reward of +1 per step this is the average number of steps to finish.
    """
    totals = dict()  # state -> sum of recorded first-visit returns
    counts = dict()  # state -> number of recorded first-visit returns

    for _ in range(maxiters):
        # for each non-terminating state as starting state
        for start in STATES_NON_TERMINAL:

            # generate an episode
            episode_states = []
            episode_rewards = []
            state = start
            while not is_terminal(state):
                episode_states.append(state)
                # choose a random action uniformly
                action = random.choice(ACTIONS)
                state, reward = take_action(state, action)
                episode_rewards.append(reward)

            # Index of the first occurrence of each state, built in one pass
            # so the first-visit test below is a dict lookup instead of a
            # scan over a copy of the episode prefix.
            first_visit = dict()
            for t, visited in enumerate(episode_states):
                first_visit.setdefault(visited, t)

            # Walk the episode backwards, accumulating the return.
            G = 0
            for t in reversed(range(len(episode_states))):
                visited = episode_states[t]

                # non-discounted returns
                G += episode_rewards[t]

                if first_visit[visited] == t:
                    # Running sum and count avoid re-averaging a growing list.
                    totals[visited] = totals.get(visited, 0) + G
                    counts[visited] = counts.get(visited, 0) + 1

    return {state: totals[state] / counts[state] for state in totals}


In [4]:
# Keep the iteration count small: every iteration runs one full episode
# from each non-terminal start state.
lengths = MC_pathlengths(123)

# Show the averages as a 4x4 grid; cells without an estimate are dashes.
for row in range(4):
    for col in range(4):
        avg_steps = lengths.get((row, col))
        # Test for None explicitly so a genuine 0.0 would still be printed.
        cell = '-----' if avg_steps is None else f'{avg_steps:5.2f}'
        print(cell, end='\t')
    print()

-----	14.18	21.37	23.27	
14.28	19.32	21.20	20.73	
21.03	20.84	19.05	14.88	
22.96	21.43	15.15	-----	


Apart from the opposite sign, these averages are close to the state values obtained from the random-policy evaluation in the 2nd session's exercises. Both approximate the expected number of steps from a state to termination.

## Exercise 2: Monte Carlo state value function estimation. 

Calculate state-value function V for the gridworld of Sutton & Barto example 4.1 using first-visit or every-visit Monte Carlo policy evaluation (see for example page 92 of Sutton & Barto). Policy to be evaluated is the same as before; each action (up, down, left, right) is equally probable.  Action that would result in leaving the grid (for example moving up in top row) will leave state unchanged (but action has been taken). Gamma (discount factor) is assumed to be = 1, ie. no discounting.

Try out both exploring starts (see Sutton & Barto, p. 96) and fixed start points. Any difference?

Take a look at the value function you get when you run the algorithm multiple times (with fixed # of iterations). Any observations?

---

I'll try to answer the questions quantitatively instead of just talking...

In [5]:
# slightly modified previous exercise

def take_action(state, action):
    """Move from `state` by `action`; return `(new_state, reward)`.

    Asserts that `state` is not terminal.  The moved coordinates are clipped
    to 0..3, so an action that would leave the grid keeps the state on it.
    Every step is rewarded with -1.
    """
    assert not is_terminal(state)
    target = np.clip(state + action, 0, 3)
    new_state = tuple(target)
    return new_state, -1

def MC(
    maxiters: 'number of iterations, no early stopping',
    every_visit: 'if true evaluate every visit, otherwise only first visit',
    get_init_state: 'Callable() -> state',
) -> 'state:value dictionary':
    """Monte Carlo estimate of state values under the uniform random policy.

    Runs `maxiters` episodes.  Each starts at `get_init_state()`, draws
    actions uniformly from ACTIONS and ends at a terminal state.  The
    undiscounted return following a visit updates that state's running mean:
    after every visit if `every_visit` is true, otherwise only after the
    first visit of the state within the episode.

    Returns a dict mapping each visited state to its mean return; states
    that were never visited are absent.
    """
    V = dict()
    visits = dict()  # state -> number of returns averaged into V[state]

    for _ in range(maxiters):

        # generate an episode
        episode_states = []
        episode_rewards = []
        state = get_init_state()
        while not is_terminal(state):
            episode_states.append(state)
            # choose a random action uniformly
            action = random.choice(ACTIONS)
            state, reward = take_action(state, action)
            episode_rewards.append(reward)

        # First occurrence of each state, computed once per episode so the
        # first-visit test is a lookup rather than a scan of the prefix.
        first_visit = dict()
        if not every_visit:
            for t, visited in enumerate(episode_states):
                first_visit.setdefault(visited, t)

        # evaluate and improve expectations, walking the episode backwards
        G = 0
        for t in reversed(range(len(episode_states))):
            state = episode_states[t]

            # non-discounted returns
            G += episode_rewards[t]

            if every_visit or first_visit[state] == t:
                n = visits.get(state, 0) + 1
                visits[state] = n
                # incremental mean: new = old + (sample - old) / n
                mean = V.get(state, 0)
                V[state] = mean + (G - mean) / n

    return V

In [6]:
def stats(repeat, mc_args):
    """Rerun MC `repeat` times and print per-state mean and std. deviation.

    `mc_args` is the argument tuple passed to `MC` on every run.  Returns
    the raw results as an array of shape (repeat, 4, 4).

    States that a run never visited keep the initial value 0 and are
    included in the statistics like any other value.
    """
    results = np.zeros((repeat, 4, 4))
    for i in range(repeat):
        v = MC(*mc_args)
        for state, value in v.items():
            results[(i, *state)] = value

    mean_grid = results.mean(axis=0)
    std_grid = results.std(axis=0)

    # Summarise only the non-zero deviations.  The selection can be empty
    # (e.g. with a single repeat), and min()/max() raise on an empty array.
    nonzero_std = std_grid[std_grid != 0]
    if nonzero_std.size:
        print(f'Standard deviation: min={nonzero_std.min():.2f}  max={nonzero_std.max():.2f} avg={nonzero_std.mean():.2f}')
    else:
        print('Standard deviation: min=n/a  max=n/a avg=n/a')

    # a quick hack to print mean and std. dev. side-by-side
    with np.printoptions(formatter={'float': "{:6.2f}".format}):
        mean_lines = str(mean_grid).splitlines()
        std_lines = str(std_grid).splitlines()

    print(f'{"Mean (over repeats)":^27}  ¦  {"Standard deviation":^27}')
    print('\n'.join(
        f'{m.strip(" []"):>27}  ¦  {s.strip(" []"):>27}'
        for m, s in zip(mean_lines, std_lines)
    ))

    return results

In [7]:
# Episode counts to compare ...
ITERS = (4, 16, 64, 256, 1024, 4096)
# ... and the number of independent MC runs behind each statistic.
REPEAT = 100


def get_init_state_explore():
    """Exploring starts: pick a random non-terminal state."""
    return random.choice(STATES_NON_TERMINAL)


def get_init_state_fixed():
    """Fixed start: always begin in state (1, 0)."""
    return (1, 0)


# For every episode count run all four combinations of start mode and
# visit mode, in the order explore/first, explore/every, fixed/first,
# fixed/every.
for iters in ITERS:
    for start_label, init_state_fn in (
        ("Exploring states", get_init_state_explore),
        ("Fixed state", get_init_state_fixed),
    ):
        for visit_label, use_every_visit in (
            ("first-visit", False),
            ("every-visit", True),
        ):
            print(f"\n{start_label}, {visit_label}, {iters} iterations (repeated {REPEAT} times)")
            stats(REPEAT, (iters, use_every_visit, init_state_fn))

    print('\n', '='*60, sep='')


Exploring states, first-visit, 4 iterations (repeated 100 times)
Standard deviation: min=11.36  max=16.36 avg=13.21
    Mean (over repeats)      ¦      Standard deviation     
  0.00 -13.32 -18.22 -17.44  ¦    0.00  12.54  14.04  14.97
-12.09 -17.48 -19.73 -18.03  ¦   13.48  11.49  12.30  14.71
-17.81 -18.81 -16.51 -13.29  ¦   11.95  11.57  13.08  16.36
-18.64 -17.08 -10.94   0.00  ¦   14.23  12.91  11.36   0.00

Exploring states, every-visit, 4 iterations (repeated 100 times)
Standard deviation: min=8.50  max=12.72 avg=10.73
    Mean (over repeats)      ¦      Standard deviation     
  0.00 -10.13 -13.23 -14.71  ¦    0.00  10.87  10.15  12.72
-10.17 -13.76 -15.45 -14.40  ¦   12.45   9.75  10.50  12.66
-13.82 -14.79 -13.31  -9.93  ¦   11.02   9.28   8.87  10.76
-13.75 -13.06  -8.30   0.00  ¦   11.49  11.19   8.50   0.00

Fixed state, first-visit, 4 iterations (repeated 100 times)
Standard deviation: min=7.85  max=20.28 avg=16.82
    Mean (over repeats)      ¦      Standard deviation  

Standard deviation: min=0.34  max=0.50 avg=0.39
    Mean (over repeats)      ¦      Standard deviation     
  0.00 -13.93 -19.94 -21.99  ¦    0.00   0.35   0.41   0.50
-13.98 -17.94 -19.97 -20.00  ¦    0.38   0.34   0.36   0.44
-19.99 -19.99 -18.01 -14.02  ¦    0.39   0.38   0.36   0.38
-21.96 -19.97 -14.03   0.00  ¦    0.41   0.41   0.38   0.00

Exploring states, every-visit, 4096 iterations (repeated 100 times)
Standard deviation: min=0.46  max=0.64 avg=0.54
    Mean (over repeats)      ¦      Standard deviation     
  0.00 -14.11 -20.08 -22.09  ¦    0.00   0.61   0.61   0.64
-14.04 -18.06 -20.07 -20.10  ¦    0.54   0.47   0.49   0.60
-19.99 -20.03 -18.04 -14.10  ¦    0.54   0.48   0.46   0.52
-22.03 -20.00 -14.04   0.00  ¦    0.62   0.52   0.49   0.00

Fixed state, first-visit, 4096 iterations (repeated 100 times)
Standard deviation: min=0.27  max=0.76 avg=0.53
    Mean (over repeats)      ¦      Standard deviation     
  0.00 -14.03 -19.96 -22.01  ¦    0.00   0.50   0.62   0.76
-13

## Exercise 3*: Monte Carlo action value function estimation

Use the same idea as in exercise 2 to estimate q function.

*) - not mandatory

In [8]:
def MCq(
    maxiters: 'number of iterations, no early stopping',
    every_visit: 'if true evaluate every visit, otherwise only first visit',
    get_init_state: 'Callable() -> state',
) -> 'action-value table - 2D array: state_indices×action_indices':
    """Monte Carlo estimate of action values under the uniform random policy.

    Runs `maxiters` episodes.  Each starts at `get_init_state()`, picks
    action indices uniformly at random and ends at a terminal state.  The
    undiscounted return following a (state, action) pair updates the running
    mean in the table: for every occurrence if `every_visit` is true,
    otherwise only for the pair's first occurrence in the episode.

    Returns an array of shape (N_STATES, N_ACTIONS); rows follow the order
    of STATES and entries that were never updated stay 0.
    """
    Q = np.zeros((N_STATES, N_ACTIONS))
    mean_count = np.ones_like(Q)
    # One dict lookup per step instead of a linear STATES.index() scan.
    state_to_index = {s: k for k, s in enumerate(STATES)}

    for _ in range(maxiters):

        # generate an episode of (state_index, action_index, reward) steps
        episode = []
        state = get_init_state()
        while not is_terminal(state):
            action = np.random.randint(N_ACTIONS)
            new_state, reward = take_action(state, ACTIONS[action])
            episode.append((state_to_index[state], action, reward))
            state = new_state

        # First occurrence of every (state, action) pair, found in one pass;
        # this avoids copying and scanning the episode prefix at each step.
        first_visit = dict()
        if not every_visit:
            for t, (s, a, _) in enumerate(episode):
                first_visit.setdefault((s, a), t)

        # evaluate and improve expectations, walking the episode backwards
        G = 0
        for t in reversed(range(len(episode))):
            state_index, action, reward = episode[t]

            # non-discounted returns
            G += reward

            if every_visit or first_visit[(state_index, action)] == t:
                # incremental mean: new = old + (sample - old) / n
                mean = Q[state_index, action]
                Q[state_index, action] = mean + (G - mean) / mean_count[state_index, action]
                mean_count[state_index, action] += 1

    return Q

In [9]:
def print_action_value(Q):
    """Print `Q` as a table: one row per state, one column per action.

    Row `i` of `Q` is printed next to STATES[i]; the column headers are
    taken from ACTION_LABELS.
    """
    header_cells = '  {:>4}  ' * 4
    row_template = '{}  {:6.2f}  {:6.2f}  {:6.2f}  {:6.2f}'

    print('Action-value table:')
    print('State ' + header_cells.format(*ACTION_LABELS))
    for index, state in enumerate(STATES):
        print(row_template.format(state, *Q[index]))
    print()

In [10]:
# Action-value estimation with exploring starts and first-visit updates.
q = MCq(
    maxiters=12345,
    every_visit=False,
    get_init_state=get_init_state_explore,
)
print_action_value(q)

Action-value table:
State      ↑       ↓       ←       →  
(0, 0)    0.00    0.00    0.00    0.00
(0, 1)  -14.90  -18.82   -1.00  -20.99
(0, 2)  -20.99  -21.15  -14.81  -23.13
(0, 3)  -22.74  -21.19  -21.02  -22.85
(1, 0)   -1.00  -20.93  -14.85  -19.68
(1, 1)  -15.45  -21.26  -15.16  -20.80
(1, 2)  -21.14  -19.06  -18.85  -20.76
(1, 3)  -23.03  -14.82  -21.25  -21.43
(2, 0)  -15.27  -22.65  -21.06  -20.81
(2, 1)  -19.00  -20.90  -21.04  -19.37
(2, 2)  -20.77  -14.91  -21.16  -14.97
(2, 3)  -21.06   -1.00  -18.88  -14.12
(3, 0)  -21.01  -22.92  -22.81  -20.94
(3, 1)  -21.47  -20.76  -23.17  -14.93
(3, 2)  -18.78  -15.07  -21.18   -1.00
(3, 3)    0.00    0.00    0.00    0.00



## Exercise 4*: Monte Carlo control

Compute the optimal policy for the 4x4 gridworld example. Start with a random policy. Consider the epsilon adjustment schedule - can it in practice be 1/k, or is something more conservative better? Can you think of any other tricks to manage the noisiness of MC?

*) - not mandatory

In [11]:
def MCC(
    maxiters: 'number of iterations, no early stopping',
    every_visit: 'if true evaluate every visit, otherwise only first visit',
    get_init_state: 'Callable() -> state',
    policy: 'Callable(q, state_index, eps) -> action_index',
    eps_schedule: 'Callable(iteration, previous_eps?) -> eps',
) -> 'action-value table - 2D array: state_indices×action_indices':
    """Monte Carlo control: learn action values while following `policy`.

    Runs `maxiters` episodes, numbered from 1.  Before each episode
    `eps_schedule(iteration, previous_eps)` yields the eps for that episode
    (`previous_eps` is None for the first one).  Within an episode every
    action index comes from `policy(Q, state_index, eps)`, which sees the
    current table.  Afterwards the undiscounted return following each
    (state, action) pair updates the running mean in the table: for every
    occurrence if `every_visit` is true, otherwise only the first one.

    Returns an array of shape (N_STATES, N_ACTIONS); rows follow the order
    of STATES.

    NOTE(review): episode length is unbounded.  If `policy` keeps choosing
    an action that does not change the state, the episode never ends.
    """
    Q = np.zeros((N_STATES, N_ACTIONS))
    mean_count = np.ones_like(Q)
    # One dict lookup per step instead of a linear STATES.index() scan.
    state_to_index = {s: k for k, s in enumerate(STATES)}
    eps = None

    for i in range(1, maxiters + 1):

        # generate an episode of (state_index, action_index, reward) steps
        episode = []
        state = get_init_state()
        eps = eps_schedule(i, eps)
        while not is_terminal(state):
            state_index = state_to_index[state]

            action = policy(Q, state_index, eps)
            state, reward = take_action(state, ACTIONS[action])

            episode.append((state_index, action, reward))

        # First occurrence of every (state, action) pair, found in one pass;
        # this avoids copying and scanning the episode prefix at each step.
        first_visit = dict()
        if not every_visit:
            for t, (s, a, _) in enumerate(episode):
                first_visit.setdefault((s, a), t)

        # evaluate and improve expectations, walking the episode backwards
        G = 0
        for t in reversed(range(len(episode))):
            state_index, action, reward = episode[t]

            # non-discounted returns
            G += reward

            if every_visit or first_visit[(state_index, action)] == t:
                # incremental mean: new = old + (sample - old) / n
                mean = Q[state_index, action]
                Q[state_index, action] = mean + (G - mean) / mean_count[state_index, action]
                mean_count[state_index, action] += 1

    return Q

In [12]:
def eps_greedy_policy(q, state_index, eps):
    """Choose an action index epsilon-greedily from row `state_index` of `q`.

    With probability `eps` a uniformly random action index is returned,
    otherwise the index of the largest value in the row (the first one on
    ties).  So eps=0 is purely greedy and eps=1 purely random.
    """
    explore = np.random.random() < eps
    if not explore:
        return np.argmax(q[state_index])
    return np.random.randint(N_ACTIONS)

In [13]:
def print_resulting_greedy_policy(Q):
    """Print the greedy action of every state as a grid, four per line.

    Each non-terminal state shows the label of its highest-valued action
    in `Q`; terminal states show '×'.
    """
    print('Resulting greedy policy:')
    for state_index in range(N_STATES):
        if is_terminal(STATES[state_index]):
            symbol = '×'
        else:
            symbol = ACTION_LABELS[np.argmax(Q[state_index])]

        # Break the line after every fourth cell.
        row_finished = state_index % 4 == 3
        print(symbol, end=' \n' if row_finished else ' ')
    print()

In [14]:
# eps - 1/i, explore starts, first visit
def eps_schedule(iteration, prev_eps):
    """Harmonic decay: eps is the reciprocal of `iteration`.

    `prev_eps` is accepted for interface compatibility and ignored.
    """
    reciprocal = 1 / iteration
    return reciprocal

# Control run: exploring starts, first-visit updates, eps-greedy policy.
q = MCC(
    maxiters=5000,
    every_visit=False,
    get_init_state=get_init_state_explore,
    policy=eps_greedy_policy,
    eps_schedule=eps_schedule,
)
print_action_value(q)
print_resulting_greedy_policy(q)

Action-value table:
State      ↑       ↓       ←       →  
(0, 0)    0.00    0.00    0.00    0.00
(0, 1)   -7.67  -17.00   -1.00  -46.00
(0, 2)  -57.00  -36.00   -2.06  -53.00
(0, 3)  -25.67  -32.00   -3.11  -20.50
(1, 0)   -1.00  -74.00  -26.00  -53.00
(1, 1)  -16.00   -4.04  -75.00  -39.00
(1, 2)  -58.00   -3.00   -5.00  -42.33
(1, 3)  -22.67  -28.00   -4.18  -72.00
(2, 0)   -2.00  -18.33  -41.00  -72.00
(2, 1)  -69.00  -71.00   -3.01  -23.00
(2, 2)  -61.00  -22.00   -4.00   -2.00
(2, 3)  -27.00   -1.00   -4.00  -11.50
(3, 0)   -3.01  -29.50  -22.00  -38.00
(3, 1)  -70.00  -10.50   -4.00   -2.00
(3, 2)  -21.00   -2.50  -19.00   -1.00
(3, 3)    0.00    0.00    0.00    0.00

Resulting greedy policy:
× ← ← ← 
↑ ↓ ↓ ← 
↑ ← → ↓ 
↑ → → × 



The schedule is way too aggressive - eps drops so fast that there is very little exploration, and convergence is therefore very slow. For this problem, the schedule 100/i was already able to find the optimal policy often.

In [15]:
# eps - logarithmic decay, explore starts, first visit
def eps_schedule(iteration, prev_eps):
    """Logarithmic decay: eps = 1 / (1 + ln(iteration)).

    `prev_eps` is accepted for interface compatibility and ignored.
    """
    denominator = 1 + np.log(iteration)
    return 1 / denominator

# Control run: exploring starts, first-visit updates, eps-greedy policy.
q = MCC(
    maxiters=5000,
    every_visit=False,
    get_init_state=get_init_state_explore,
    policy=eps_greedy_policy,
    eps_schedule=eps_schedule,
)
print_action_value(q)
print_resulting_greedy_policy(q)

Action-value table:
State      ↑       ↓       ←       →  
(0, 0)    0.00    0.00    0.00    0.00
(0, 1)   -5.72   -5.10   -1.00   -7.93
(0, 2)   -9.35   -4.44   -5.73   -8.96
(0, 3)  -12.92  -10.71   -5.55  -11.36
(1, 0)   -1.00   -5.32   -2.26   -3.42
(1, 1)   -2.43   -8.23   -6.88  -10.06
(1, 2)   -6.58   -3.33   -5.39   -5.35
(1, 3)  -13.71   -2.41   -8.43   -9.38
(2, 0)   -2.25   -7.20   -6.00   -7.30
(2, 1)   -8.86   -7.79   -3.44   -8.11
(2, 2)   -4.72   -2.18   -4.34   -3.87
(2, 3)   -8.70   -1.00   -3.39   -6.13
(3, 0)   -6.64   -4.82   -7.94   -3.27
(3, 1)   -6.07   -5.71   -6.95   -2.22
(3, 2)   -3.26   -2.19   -3.95   -1.00
(3, 3)    0.00    0.00    0.00    0.00

Resulting greedy policy:
× ← ↓ ← 
↑ ↑ ↓ ↓ 
↑ ← ↓ ↓ 
→ → → × 



The decay is much slower in the long run, although eps drops quite quickly in the beginning.

In [16]:
# eps - exponential decay, explore starts, first visit
def eps_schedule(iteration, prev_eps):
    """Exponential decay: each call returns 0.99 times the previous eps.

    `prev_eps` is None on the first call, in which case the decay starts
    from 1.  `iteration` is accepted for interface compatibility and
    ignored.
    """
    # Test for None explicitly: `prev_eps or 1` would also replace an eps
    # that has decayed all the way to 0.0, resetting exploration to 1.
    if prev_eps is None:
        prev_eps = 1
    return 0.99 * prev_eps

# Control run: exploring starts, first-visit updates, eps-greedy policy.
q = MCC(
    maxiters=5000,
    every_visit=False,
    get_init_state=get_init_state_explore,
    policy=eps_greedy_policy,
    eps_schedule=eps_schedule,
)
print_action_value(q)
print_resulting_greedy_policy(q)

Action-value table:
State      ↑       ↓       ←       →  
(0, 0)    0.00    0.00    0.00    0.00
(0, 1)   -4.00  -12.77   -1.00   -8.23
(0, 2)  -12.50  -14.08   -2.12  -12.64
(0, 3)  -11.12  -10.29   -3.35  -10.40
(1, 0)   -1.00  -11.64   -3.08   -9.50
(1, 1)   -9.33  -10.83   -2.10  -13.33
(1, 2)  -13.40  -13.88   -3.44  -14.11
(1, 3)  -14.63   -2.12  -12.22  -11.20
(2, 0)   -2.13  -18.00  -11.40  -15.15
(2, 1)   -3.38  -17.27  -17.36  -16.40
(2, 2)  -11.27   -2.34  -15.80  -13.17
(2, 3)   -8.82   -1.00  -14.50   -5.60
(3, 0)   -3.19  -15.70  -13.17  -13.06
(3, 1)  -16.18  -18.45   -4.44  -15.27
(3, 2)  -11.67   -9.82  -18.80   -1.00
(3, 3)    0.00    0.00    0.00    0.00

Resulting greedy policy:
× ← ← ← 
↑ ← ← ↓ 
↑ ↑ ↓ ↓ 
↑ ← → × 



Although exponential decay approaches zero much faster in the limit, it leaves plenty of time to explore during the early iterations.

In [17]:
# eps - constant, explore starts, first visit
def eps_schedule(iteration, prev_eps):
    """Constant schedule: eps is 0.3 regardless of both arguments."""
    constant_eps = 0.3
    return constant_eps

# Control run: exploring starts, first-visit updates, eps-greedy policy.
q = MCC(
    maxiters=5000,
    every_visit=False,
    get_init_state=get_init_state_explore,
    policy=eps_greedy_policy,
    eps_schedule=eps_schedule,
)
print_action_value(q)
print_resulting_greedy_policy(q)

Action-value table:
State      ↑       ↓       ←       →  
(0, 0)    0.00    0.00    0.00    0.00
(0, 1)   -3.22   -3.93   -1.00   -5.03
(0, 2)   -6.25   -5.71   -2.73   -7.07
(0, 3)   -7.19   -4.50   -6.47   -7.34
(1, 0)   -1.00   -4.43   -2.73   -3.98
(1, 1)   -2.68   -5.41   -2.91   -5.40
(1, 2)   -6.05   -4.00   -4.39   -4.98
(1, 3)   -6.46   -2.65   -5.28   -4.68
(2, 0)   -2.62   -5.71   -5.24   -5.29
(2, 1)   -4.48   -5.23   -4.68   -3.95
(2, 2)   -4.90   -3.08   -5.32   -2.63
(2, 3)   -4.58   -1.00   -4.11   -2.81
(3, 0)   -5.33   -5.32   -5.66   -4.04
(3, 1)   -5.28   -4.61   -5.69   -2.57
(3, 2)   -4.48   -2.68   -4.15   -1.00
(3, 3)    0.00    0.00    0.00    0.00

Resulting greedy policy:
× ← ← ↓ 
↑ ↑ ↓ ↓ 
↑ → → ↓ 
→ → → × 



Not GLIE but still works here.

---

The MC noisiness could be mitigated, for example, by updating the action values more conservatively (putting less weight on any single observed return).