# TX00DQ05-3001 Exercises 3

In [1]:
import numpy as np
import numpy.linalg as LA
from numpy import random

import copy

## Exercise 1: Sample behaviour of an MDP

Let's take (again) a look at Sutton & Barto example 4.1 gridworld. On each iteration start at every (non-terminating) state and sample actions in succeeding states by selecting them from uniform distribution (each action - up, down, left, right - is equally probable). Run the episode until terminal state is encountered. Collect statistics to calculate average number of steps needed before completion for each start state. Should this number match with something you have seen earlier in the exercises?

In [2]:
# YOUR CODE
ROWS_COUNT = 4
COLUMNS_COUNT = 4
TERMINATING = [(0,0), (ROWS_COUNT-1, COLUMNS_COUNT-1)]
ACTIONS = ['←','↑','↓','→']
rewards = [-1, -1, -1, -1]

prob = [0.25, 0.25, 0.25, 0.25]

def state_after_action(action, row, column):
    if action == ACTIONS[0]:
        column -= 1
    elif action == ACTIONS[1]:
        row -= 1
    elif action == ACTIONS[2]:
        row += 1
    elif action == ACTIONS[3]:
        column += 1
        
    if row >= ROWS_COUNT:
        row = ROWS_COUNT - 1
    elif row <= 0:
        row = 0
    if column >= COLUMNS_COUNT:
        column = COLUMNS_COUNT - 1
    elif column <= 0:
        column = 0
    
    return {'row':row, 'column':column}

def random_episode_from(row, column, policy=None):
    global prob
    current_state = {'row': row, 'column': column}
    episode = []
    step_prob = copy.deepcopy(prob)
        
    while(True):
        if (current_state['row'],current_state['column']) in TERMINATING:
            break
            
        if policy:
            for a in ACTIONS:
                step_prob[ACTIONS.index(a)] = policy[current_state['row']][current_state['column']][a]
        
        selected_action = random.choice(ACTIONS, p=step_prob)
        episode.append({'state' : current_state, 'action': selected_action, 'reward': reward_for(selected_action)})
        current_state = state_after_action(selected_action, current_state['row'], current_state['column'])
    return episode

def reward_for(action):
    for act in ACTIONS:
        if act == action:
            return rewards[ACTIONS.index(action)]

def calculate_mean(mean, new_number, n):
    mean += (new_number - mean)/(n+1)
    return mean

def init_returns(returns_for='state'):
    returns = []
    for r in range(ROWS_COUNT):
        returns.append([])
        for c in range(COLUMNS_COUNT):
            if returns_for == 'state':
                returns[r].append({'mean' : 0, 'members' : 0})
            elif returns_for == 'action':
                temp_dict = {}
                for a in ACTIONS:
                    temp_dict.update({a : {'mean' : 0, 'members' : 0}})
                returns[r].append(temp_dict)
    return returns

def print_returns(returns, value_type='state'):
    for r in range(ROWS_COUNT):
        print('[',end='')
        for c in range(COLUMNS_COUNT):
            if value_type == 'state':
                print(returns[r][c]['mean']," ", end="")
            elif value_type == 'action':
                for a in ACTIONS:
                    print('(',r,',',c,',',a,')',returns[r][c][a]['mean'],end='')
                    if a != ACTIONS[3]:
                        print(' , ',end='')
                    print()
                print()
        print(']')
    return

In [3]:
#The initial values of lenghts is the same as returns so the same function is used
lengths = init_returns()

for i in range(100):
    for r in range(ROWS_COUNT):
        for c in range(COLUMNS_COUNT):
            ep = random_episode_from(r,c)
            if i == 0:
                lengths[r][c] = {'mean':len(ep), 'members':1}
            else:
                lengths[r][c]['mean'] = calculate_mean(lengths[r][c]['mean'], len(ep), lengths[r][c]['members'])
            lengths[r][c]['members'] += 1

print_returns(lengths)

[0.0  12.247524752475252  18.831683168316825  20.21782178217823  ]
[16.900990099009913  18.425742574257423  19.524752475247517  20.069306930693067  ]
[21.306930693069297  18.584158415841582  18.68316831683168  12.17821782178218  ]
[22.0891089108911  19.999999999999993  10.485148514851483  0.0  ]


## Exercise 2: Monte Carlo state value function estimation. 

Calculate state-value function V for the gridworld of Sutton & Barto example 4.1 using first-visit or every-visit Monte Carlo policy evaluation (see for example page 92 of Sutton & Barto). Policy to be evaluated is the same as before; each action (up, down, left, right) is equally probable.  Action that would result in leaving the grid (for example moving up in top row) will leave state unchanged (but action has been taken). Gamma (discount factor) is assumed to be = 1, ie. no discounting.

Try out both exploring starts (see Sutton & Barto, p. 96) and fixed start points. Any difference?

Take a look at the value function you get when you run the algorithm multiple times (with fixed # of iterations). Any observations?

In [4]:
def extract_states(episode, need_action=False):
    states = []
    for step in episode:
        if need_action:
            states.append((step['state']['row'], step['state']['column'], step['action']))
        else:
            states.append((step['state']['row'], step['state']['column']))
        
    return states

def state_value_function(returns, visit_type, start_type):
    global fixed_state
    for i in range(100):
        for r in range(ROWS_COUNT):
            for c in range(COLUMNS_COUNT):
                if start_type == 'es':
                    row = r
                    col = c
                elif start_type == 'fixed':
                    row = fixed_state['row']
                    col = fixed_state['column']
                    
                if not (row,col) in TERMINATING:
                    episode = random_episode_from(row,col)
                    episode.reverse()
                    G = 0
                    states = extract_states(episode)
                    
                    for step in episode:
                        G += step['reward']
                        state_row = step['state']['row']
                        state_col = step['state']['column']
                        members = returns[state_row][state_col]['members']
                        mean = returns[state_row][state_col]['mean']
                        
                        if visit_type == 'first':
                            states = states[1:]
                            should_cal_mean = False
                            if not (state_row, state_col) in states:
                                should_cal_mean = True
                        elif visit_type == 'every':
                            should_cal_mean = True
                            
                        if should_cal_mean:
                            if mean == 0:
                                mean = G
                            else:
                                mean = calculate_mean(mean, G, members)
                            members += 1
                            returns[step['state']['row']][step['state']['column']] = {'mean':mean, 'members': members}
    return returns

In [5]:
returns = init_returns()
returns = state_value_function(returns,'first','es')
print_returns(returns)

[0  -14.412621359223307  -20.18367346938777  -22.5863539445629  ]
[-14.925686591276257  -18.387012987012987  -20.094540612516646  -20.284757118927967  ]
[-20.260797342192706  -19.816406250000032  -17.93483709273183  -14.563779527559047  ]
[-21.27111984282908  -19.586097946287506  -13.42146189735614  0  ]


In [6]:
returns = init_returns()
returns = state_value_function(returns,'every','es')
print_returns(returns)

[0  -14.58790436005625  -19.727134146341474  -22.224549439844147  ]
[-14.386331938633186  -18.301087578706323  -19.854404145077705  -19.945349448239593  ]
[-19.827127659574483  -19.90688049663733  -17.113132461625916  -12.99644886363636  ]
[-21.52111959287532  -19.8241935483871  -13.600864553314109  0  ]


In [7]:
fixed_state = {'row': 2,'column': 1}

In [8]:
returns = init_returns()
returns = state_value_function(returns,'first','fixed')
print_returns(returns)

[0  -13.879870129870126  -20.660678642714586  -22.143266475644698  ]
[-13.88711656441717  -17.683852140077814  -20.059945504087217  -19.54601226993866  ]
[-20.254107338444705  -19.815  -18.01292246520877  -13.305172413793095  ]
[-22.451367781155017  -20.297727272727233  -14.454081632653065  0  ]


In [9]:
returns = init_returns()
returns = state_value_function(returns,'every','fixed')
print_returns(returns)

[0  -14.729784366576817  -20.921892189218912  -24.093750000000004  ]
[-15.30425299890949  -18.98454221165284  -21.014760147601475  -21.384907709011948  ]
[-20.929870129870107  -21.376916140667255  -19.198143664245357  -15.049604601006466  ]
[-23.10305218247448  -21.223021582733818  -14.387820512820529  0  ]


Both exploring starts and fixed starts give the same results.
However, in both case, the results will differ each time the code is run. This is the cause of the random function that gives different actions in each random.

## Exercise 3*: Monte Carlo action value function estimation

Use the same idea as in exercise 2 to estimate q function.

*) - not mandatory

In [10]:
def action_value_function(returns, visit_type, start_type, policy=None):
    global fixed_state
    for i in range(100):
        epsilon = 1/(i+1)
        for r in range(ROWS_COUNT):
            for c in range(COLUMNS_COUNT):
                if start_type == 'es':
                    row = r
                    col = c
                elif start_type == 'fixed':
                    row = fixed_state['row']
                    col = fixed_state['column']
                
                if not (row,col) in TERMINATING:
                    episode = random_episode_from(row, col, policy)
                    episode.reverse()
                    G = 0
                    states = extract_states(episode, True)
                    
                    for step in episode:
                        G += step['reward']
                        state_row = step['state']['row']
                        state_col = step['state']['column']
                        step_action = step['action']
                        
                        members = returns[state_row][state_col][step_action]['members']
                        mean = returns[state_row][state_col][step_action]['mean']
                        
                        should_cal_mean = False
                        if visit_type == 'first':
                            states = states[1:]
                            if not (state_row, state_col, step_action) in states:
                                should_cal_mean = True
                        elif visit_type == 'every':
                            should_cal_mean = True
                            
                        if should_cal_mean:
                            if mean == 0:
                                mean = G
                            else:
                                mean = calculate_mean(mean, G, members)
                            members += 1
                            returns[state_row][state_col][step_action] = {'mean': mean, 'members': members}
                            
                        if policy:
                            val_list = []
                            action_count = len(ACTIONS)
                            
                            for a in ACTIONS:
                                val_list.append(returns[state_row][state_col][a]['mean'])
                                if policy[state_row][state_col][a] == 0:
                                    action_count -= 1
                                    
                            best_action_index = val_list.index(max(val_list))
                            for a in ACTIONS:
                                if ACTIONS.index(a) == best_action_index:
                                    policy[state_row][state_col][a] = 1 - epsilon + (epsilon/action_count)
                                else:
                                    policy[state_row][state_col][a] = epsilon/action_count
                                    
    if not policy:
        return returns
    else:
        return returns, policy

In [11]:
fixed_state = {'row': 2,'column': 1}

In [12]:
returns = init_returns('action')
returns = action_value_function(returns,'first','es')
print_returns(returns,'action')

[( 0 , 0 , ← ) 0 , 
( 0 , 0 , ↑ ) 0 , 
( 0 , 0 , ↓ ) 0 , 
( 0 , 0 , → ) 0

( 0 , 1 , ← ) -1.0 , 
( 0 , 1 , ↑ ) -16.13364055299539 , 
( 0 , 1 , ↓ ) -20.374045801526727 , 
( 0 , 1 , → ) -21.511811023622062

( 0 , 2 , ← ) -14.774725274725283 , 
( 0 , 2 , ↑ ) -21.39338235294118 , 
( 0 , 2 , ↓ ) -20.965616045845255 , 
( 0 , 2 , → ) -23.458745874587468

( 0 , 3 , ← ) -21.943925233644855 , 
( 0 , 3 , ↑ ) -22.632000000000005 , 
( 0 , 3 , ↓ ) -19.742774566473987 , 
( 0 , 3 , → ) -21.730468749999993

]
[( 1 , 0 , ← ) -14.863849765258212 , 
( 1 , 0 , ↑ ) -1.0 , 
( 1 , 0 , ↓ ) -19.98393574297187 , 
( 1 , 0 , → ) -18.017921146953412

( 1 , 1 , ← ) -14.972067039106156 , 
( 1 , 1 , ↑ ) -16.28783382789319 , 
( 1 , 1 , ↓ ) -21.619631901840496 , 
( 1 , 1 , → ) -21.492401215805465

( 1 , 2 , ← ) -20.111731843575416 , 
( 1 , 2 , ↑ ) -19.749279538904908 , 
( 1 , 2 , ↓ ) -19.45205479452053 , 
( 1 , 2 , → ) -20.91596638655462

( 1 , 3 , ← ) -20.754335260115603 , 
( 1 , 3 , ↑ ) -23.32885906040267 , 
( 1 , 3 ,

In [13]:
returns = init_returns('action')
returns = action_value_function(returns,'every','es')
print_returns(returns,'action')

[( 0 , 0 , ← ) 0 , 
( 0 , 0 , ↑ ) 0 , 
( 0 , 0 , ↓ ) 0 , 
( 0 , 0 , → ) 0

( 0 , 1 , ← ) -1.0 , 
( 0 , 1 , ↑ ) -15.113268608414232 , 
( 0 , 1 , ↓ ) -18.00571428571429 , 
( 0 , 1 , → ) -20.964809384164226

( 0 , 2 , ← ) -15.734913793103456 , 
( 0 , 2 , ↑ ) -22.62776659959758 , 
( 0 , 2 , ↓ ) -22.059880239520975 , 
( 0 , 2 , → ) -24.590654205607493

( 0 , 3 , ← ) -22.345070422535226 , 
( 0 , 3 , ↑ ) -24.40562913907283 , 
( 0 , 3 , ↓ ) -21.586956521739133 , 
( 0 , 3 , → ) -25.604569420035137

]
[( 1 , 0 , ← ) -14.256024096385538 , 
( 1 , 0 , ↑ ) -1.0 , 
( 1 , 0 , ↓ ) -19.987080103359165 , 
( 1 , 0 , → ) -17.57837837837837

( 1 , 1 , ← ) -15.09473684210526 , 
( 1 , 1 , ↑ ) -14.237362637362633 , 
( 1 , 1 , ↓ ) -21.484716157205217 , 
( 1 , 1 , → ) -20.679069767441852

( 1 , 2 , ← ) -19.43574297188754 , 
( 1 , 2 , ↑ ) -22.527494908350302 , 
( 1 , 2 , ↓ ) -18.306451612903235 , 
( 1 , 2 , → ) -22.259793814433007

( 1 , 3 , ← ) -22.476284584980238 , 
( 1 , 3 , ↑ ) -23.512241054613924 , 
( 1 , 3 

In [14]:
returns = init_returns('action')
returns = action_value_function(returns,'first','fixed')
print_returns(returns,'action')

[( 0 , 0 , ← ) 0 , 
( 0 , 0 , ↑ ) 0 , 
( 0 , 0 , ↓ ) 0 , 
( 0 , 0 , → ) 0

( 0 , 1 , ← ) -1.0 , 
( 0 , 1 , ↑ ) -16.607438016528924 , 
( 0 , 1 , ↓ ) -20.532319391635 , 
( 0 , 1 , → ) -22.344569288389508

( 0 , 2 , ← ) -16.753943217665604 , 
( 0 , 2 , ↑ ) -22.491379310344836 , 
( 0 , 2 , ↓ ) -21.564102564102573 , 
( 0 , 2 , → ) -23.738970588235297

( 0 , 3 , ← ) -20.87985865724382 , 
( 0 , 3 , ↑ ) -21.556561085972852 , 
( 0 , 3 , ↓ ) -19.68041237113403 , 
( 0 , 3 , → ) -21.749999999999993

]
[( 1 , 0 , ← ) -16.703832752613245 , 
( 1 , 0 , ↑ ) -1.0 , 
( 1 , 0 , ↓ ) -24.56249999999999 , 
( 1 , 0 , → ) -20.163101604278086

( 1 , 1 , ← ) -16.52350427350426 , 
( 1 , 1 , ↑ ) -15.70393374741201 , 
( 1 , 1 , ↓ ) -22.49885057471263 , 
( 1 , 1 , → ) -21.691796008869186

( 1 , 2 , ← ) -19.60049019607844 , 
( 1 , 2 , ↑ ) -22.662952646239557 , 
( 1 , 2 , ↓ ) -19.500000000000014 , 
( 1 , 2 , → ) -22.28291316526611

( 1 , 3 , ← ) -22.671826625387 , 
( 1 , 3 , ↑ ) -22.594982078853064 , 
( 1 , 3 , ↓ ) -1

In [15]:
returns = init_returns('action')
returns = action_value_function(returns,'every','fixed')
print_returns(returns,'action')

[( 0 , 0 , ← ) 0 , 
( 0 , 0 , ↑ ) 0 , 
( 0 , 0 , ↓ ) 0 , 
( 0 , 0 , → ) 0

( 0 , 1 , ← ) -1.0 , 
( 0 , 1 , ↑ ) -15.374233128834355 , 
( 0 , 1 , ↓ ) -18.00302114803624 , 
( 0 , 1 , → ) -20.611445783132538

( 0 , 2 , ← ) -13.653944020356242 , 
( 0 , 2 , ↑ ) -22.172749391727496 , 
( 0 , 2 , ↓ ) -21.11333333333334 , 
( 0 , 2 , → ) -22.97982062780269

( 0 , 3 , ← ) -19.772093023255817 , 
( 0 , 3 , ↑ ) -23.19791666666668 , 
( 0 , 3 , ↓ ) -21.733480176211454 , 
( 0 , 3 , → ) -24.565410199556556

]
[( 1 , 0 , ← ) -15.948497854077251 , 
( 1 , 0 , ↑ ) -1.0 , 
( 1 , 0 , ↓ ) -20.2008547008547 , 
( 1 , 0 , → ) -19.090109890109886

( 1 , 1 , ← ) -15.37833037300178 , 
( 1 , 1 , ↑ ) -15.185618729096987 , 
( 1 , 1 , ↓ ) -20.27441077441077 , 
( 1 , 1 , → ) -20.883838383838384

( 1 , 2 , ← ) -18.869070208728655 , 
( 1 , 2 , ↑ ) -21.76660341555975 , 
( 1 , 2 , ↓ ) -20.22283609576428 , 
( 1 , 2 , → ) -21.41715399610136

( 1 , 3 , ← ) -21.67324561403508 , 
( 1 , 3 , ↑ ) -22.46347031963472 , 
( 1 , 3 , ↓ ) -

## Exercise 4*: Monte Carlo control

Compute the optimal policy for the 4x4 gridworld example. Start with random policy. Consider the epsilon adjustment schedule - can it in practise be 1/k, or is something more conservative better? Can you think of any other tricks to manage the noisiness of MC?

*) - not mandatory

In [16]:
def init_policy():
    policy = []
    for r in range(ROWS_COUNT):
        policy.append([])
        for c in range(COLUMNS_COUNT):
            temp_dict = {}
            for a in range(len(ACTIONS)):
                if (r,c) in TERMINATING:
                    p = 0
                else:
                    p = prob[a]
                temp_dict.update({ACTIONS[a] : p})
            policy[r].append(temp_dict)
    return policy

def print_policy(policy):
    print_p = []
    for i in range(ROWS_COUNT):
        print_p.append([])

    for r in range(ROWS_COUNT):
        for c in range(COLUMNS_COUNT):
            actions_prob = []
            if (r,c) in TERMINATING:
                print_p[r].append('T')
            else:
                for a in ACTIONS:
                    actions_prob.append(policy[r][c][a])
                print_p[r].append(ACTIONS[actions_prob.index(max(actions_prob))])
    return print_p

In [17]:
policy = init_policy()

returns = init_returns('action')
returns, policy = action_value_function(returns, 'first', 'es', policy)
print_returns(returns, 'action')

[( 0 , 0 , ← ) 0 , 
( 0 , 0 , ↑ ) 0 , 
( 0 , 0 , ↓ ) 0 , 
( 0 , 0 , → ) 0

( 0 , 1 , ← ) -1.0 , 
( 0 , 1 , ↑ ) -9.625 , 
( 0 , 1 , ↓ ) -9.666666666666666 , 
( 0 , 1 , → ) -10.0

( 0 , 2 , ← ) -2.2251655629139075 , 
( 0 , 2 , ↑ ) -14.285714285714283 , 
( 0 , 2 , ↓ ) -10.0 , 
( 0 , 2 , → ) -9.88888888888889

( 0 , 3 , ← ) -3.588785046728973 , 
( 0 , 3 , ↑ ) -11.6 , 
( 0 , 3 , ↓ ) -12.166666666666666 , 
( 0 , 3 , → ) -8.25

]
[( 1 , 0 , ← ) -12.666666666666668 , 
( 1 , 0 , ↑ ) -1.0 , 
( 1 , 0 , ↓ ) -11.571428571428571 , 
( 1 , 0 , → ) -8.75

( 1 , 1 , ← ) -2.0757575757575766 , 
( 1 , 1 , ↑ ) -11.6 , 
( 1 , 1 , ↓ ) -8.444444444444445 , 
( 1 , 1 , → ) -14.571428571428571

( 1 , 2 , ← ) -17.6 , 
( 1 , 2 , ↑ ) -3.247619047619047 , 
( 1 , 2 , ↓ ) -15.666666666666666 , 
( 1 , 2 , → ) -11.6

( 1 , 3 , ← ) -13.857142857142856 , 
( 1 , 3 , ↑ ) -16.57142857142857 , 
( 1 , 3 , ↓ ) -2.135922330097087 , 
( 1 , 3 , → ) -16.333333333333332

]
[( 2 , 0 , ← ) -40.5 , 
( 2 , 0 , ↑ ) -2.897196261682242 , 
(

In [18]:
print_policy(policy)

[['T', '←', '←', '←'],
 ['↑', '←', '↑', '↓'],
 ['↑', '↑', '↓', '↓'],
 ['→', '→', '→', 'T']]

In [19]:
policy

[[{'←': 0, '↑': 0, '↓': 0, '→': 0},
  {'←': 0.9924999999999999, '↑': 0.0025, '↓': 0.0025, '→': 0.0025},
  {'←': 0.9924999999999999, '↑': 0.0025, '↓': 0.0025, '→': 0.0025},
  {'←': 0.9924999999999999, '↑': 0.0025, '↓': 0.0025, '→': 0.0025}],
 [{'←': 0.0025, '↑': 0.9924999999999999, '↓': 0.0025, '→': 0.0025},
  {'←': 0.9924999999999999, '↑': 0.0025, '↓': 0.0025, '→': 0.0025},
  {'←': 0.0025, '↑': 0.9924999999999999, '↓': 0.0025, '→': 0.0025},
  {'←': 0.0025, '↑': 0.0025, '↓': 0.9924999999999999, '→': 0.0025}],
 [{'←': 0.0025, '↑': 0.9924999999999999, '↓': 0.0025, '→': 0.0025},
  {'←': 0.0025, '↑': 0.9924999999999999, '↓': 0.0025, '→': 0.0025},
  {'←': 0.0025, '↑': 0.0025, '↓': 0.9924999999999999, '→': 0.0025},
  {'←': 0.0025, '↑': 0.0025, '↓': 0.9924999999999999, '→': 0.0025}],
 [{'←': 0.0025, '↑': 0.0025, '↓': 0.0025, '→': 0.9924999999999999},
  {'←': 0.0025, '↑': 0.0025, '↓': 0.0025, '→': 0.9924999999999999},
  {'←': 0.0025, '↑': 0.0025, '↓': 0.0025, '→': 0.9924999999999999},
  {'←': 0