In [26]:
import json
import mazes
import sys, os
import argparse
import numpy as np
from numpy.linalg import inv
from math import sqrt
import matplotlib.pyplot as plt
from collections import OrderedDict
from sr_main import run_learning_sr

In [27]:
def init_default_config():
    """Build the default experiment configuration.

    A new dictionary (including new nested dictionaries) is created on
    every call, so callers may modify the result freely.

    Returns:
        dict with the keys
            'terminal_step'       -- 50000
            'episode_length'      -- 20
            'maze_type'           -- 'SIMPLE_MAZE'
            'maze_params'         -- 2x2 grid ('row', 'col') with
                                     'start_row' = 'start_col' = 1
            'learning_alg_params' -- 'alpha' = 0.01, 'gamma' = 0.95
    """
    maze_params = dict(row=2, col=2, start_row=1, start_col=1)
    # NOTE(review): alpha is presumably the learning step size; gamma is
    # used as the discount factor when the exact SR is computed later.
    learning_alg_params = dict(alpha=0.01, gamma=0.95)

    return {
        'terminal_step': 50000,
        'episode_length': 20,
        'maze_type': 'SIMPLE_MAZE',
        'maze_params': maze_params,
        'learning_alg_params': learning_alg_params,
    }

## Question 1: Learning SR using TD(0)

In [28]:
# Learn the successor representation with the default configuration.
config = init_default_config()
# NOTE(review): no random seed is set anywhere in this notebook; if
# run_learning_sr is stochastic the numbers printed below will change
# from run to run -- confirm and seed if reproducibility matters.
result = run_learning_sr(config)
# Read terminal_step only after the run (order kept deliberately, since
# run_learning_sr receives the same config object).
terminal_step = config['terminal_step']
# result is indexed by an integer step (presumably one entry per learning
# step -- verify in sr_main); keep the Phi_pi estimate stored at the last
# index, terminal_step-1.
approx_Phi_pi = result[terminal_step-1]['Phi_pi']

In [29]:
# Calculate the exact Phi_pi for the 2x2 gridworld in closed form:
#     Phi_pi = (I - gamma * P_pi)^(-1)
state_len = config['maze_params']['row']*config['maze_params']['col']
gamma = config['learning_alg_params']['gamma']

# Hand-written state-to-state transition matrix: row i holds the
# probabilities of moving from state i to every state, so each row sums
# to 1. The last state only transitions to itself (absorbing).
P_pi = np.array([[0.5, 0.25, 0.25, 0.],
                 [0.25, 0.5, 0., 0.25],
                 [0.25, 0., 0.5, 0.25],
                 [0., 0., 0., 1.]])

# P_pi is hard-coded for a 2x2 maze; fail loudly here if the configured
# maze size no longer matches instead of producing a mis-sized result.
assert P_pi.shape == (state_len, state_len), (
    "P_pi is hard-coded for a 2x2 maze but the config describes "
    "%d states" % state_len)

# Size the identity from the config rather than a literal 4 so it always
# agrees with state_len.
exact_Phi_pi = inv(np.identity(state_len)-gamma*P_pi)

In [30]:
# Closed-form SR first, then the estimate taken from the learning run.
print(exact_Phi_pi)
print(approx_Phi_pi)
# Entry-wise L1 distance between the two matrices (sum of |difference|).
print(np.abs(exact_Phi_pi - approx_Phi_pi).sum())

[[ 3.22456814  1.45873321  1.45873321 13.85796545]
 [ 1.45873321  2.56466502  0.65990312 15.31669866]
 [ 1.45873321  0.65990312  2.56466502 15.31669866]
 [ 0.          0.          0.         20.        ]]
[[ 3.09827982  1.55418676  1.3783947  13.96850073]
 [ 1.29646147  2.59044603  0.59418355 15.5184105 ]
 [ 1.60969178  0.72458808  2.56601032 15.09912126]
 [ 0.          0.          0.         19.99999582]]
1.302670224418


In [31]:
# Dump the full record stored at index 66 of the run. The printed entry
# contains 'Phi_pi', 'experience' (S, A, R, S_prime) and 'V_pi'.
# NOTE(review): 66 looks like an arbitrary early step picked for
# inspection -- confirm, or name it as a constant.
print(result[66])

{'Phi_pi': array([[6.85779469e-02, 0.00000000e+00, 5.63275420e-04, 5.53612555e-05],
       [0.00000000e+00, 2.97950500e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 4.91952380e-02, 8.79149624e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.13424913e-01]]), 'experience': {'S': 3, 'A': 0, 'R': 0.0, 'S_prime': 3}, 'V_pi': array([[-0.26004077],
       [-0.07671333],
       [-0.07224012],
       [ 0.        ]])}


## Question 2: Value Prediction with SR


In [32]:
# Per-state reward vector written directly as a (4, 1) column so it can
# be right-multiplied by the 4x4 SR matrix.
# NOTE(review): values are presumably the expected immediate reward of
# each state under the policy -- confirm against the maze definition.
r_pi = np.array([[-1.0],
                 [-0.75],
                 [-0.75],
                 [0.0]])
print(r_pi)


[[-1.  ]
 [-0.75]
 [-0.75]
 [ 0.  ]]


In [33]:
# Value prediction from the successor representation: V_pi = Phi_pi . r_pi,
# once with the closed-form SR and once with the learned estimate.
exact_V_pi = np.dot(exact_Phi_pi, r_pi)
approx_V_pi = np.dot(approx_Phi_pi, r_pi)
print(exact_V_pi)
print(approx_V_pi)

[[-5.41266795]
 [-3.87715931]
 [-3.87715931]
 [ 0.        ]]
[[-5.29771592]
 [-3.68493365]
 [-4.07764058]
 [ 0.        ]]


In [34]:
# Show the 'V_pi' entry stored in the last record of the run (index
# terminal_step-1), i.e. the value estimate kept by run_learning_sr itself.
print(result[terminal_step-1]['V_pi'])

[[-5.3062474 ]
 [-3.40206555]
 [-4.23330829]
 [ 0.        ]]
