In [1]:
import json
import mazes
import sys, os
import argparse
import numpy as np
from numpy.linalg import inv
from math import sqrt
import matplotlib.pyplot as plt
from collections import OrderedDict
from sr_main import run_learning_sr

In [2]:
def init_default_config():
    """Return a fresh dictionary holding the default experiment settings.

    The dictionary has four top-level entries:

    * ``'terminal_step'`` and ``'episode_length'`` -- integer run-length settings.
    * ``'maze_type'`` -- name of the maze, ``'SIMPLE_MAZE'``.
    * ``'maze_params'`` -- a 2x2 grid (``'row'``, ``'col'``) plus the start
      cell (``'start_row'``, ``'start_col'``).
    * ``'learning_alg_params'`` -- ``'alpha'`` and ``'gamma'`` for the learner.

    A new dictionary (with new nested dictionaries) is built on every call,
    so callers may modify the result without affecting later calls.
    """
    maze_settings = {
        'row': 2,
        'col': 2,
        'start_row': 1,
        'start_col': 1,
    }
    learner_settings = {
        'alpha': 0.01,
        'gamma': 0.95,
    }
    return {
        'terminal_step': 50000,
        'episode_length': 10,
        'maze_type': 'SIMPLE_MAZE',
        'maze_params': maze_settings,
        'learning_alg_params': learner_settings,
    }

## Question 1: Learning SR using TD(0)

In [3]:
# Run the learner with the default settings and keep the SR matrix stored
# under the last step index (terminal_step - 1).
# NOTE(review): no random seed is set in this notebook; if run_learning_sr is
# stochastic, the numbers below will differ between runs -- confirm.
config = init_default_config()
result = run_learning_sr(config)
terminal_step = config['terminal_step']
final_snapshot = result[terminal_step - 1]
approx_Phi_pi = final_snapshot['Phi_pi']

In [4]:
# Calculate exact Phi_pi for the 2x2 gridworld in closed form:
#     Phi_pi = (I - gamma * P_pi)^-1
state_len = config['maze_params']['row'] * config['maze_params']['col']
gamma = config['learning_alg_params']['gamma']

# Hand-written state-to-state transition matrix: row i holds the probabilities
# of moving from state i to each state. The last row maps state 3 to itself.
P_pi = np.array([[0.5, 0.25, 0.25, 0.],
                 [0.25, 0.5, 0., 0.25],
                 [0.25, 0., 0.5, 0.25],
                 [0., 0., 0., 1.]])

# P_pi is hard-coded for four states, so fail loudly if the configured maze
# has a different size instead of silently computing the wrong thing.
assert P_pi.shape == (state_len, state_len), "P_pi does not match the configured maze size"
assert np.allclose(P_pi.sum(axis=1), 1.0), "each row of P_pi must sum to 1"

exact_Phi_pi = inv(np.identity(state_len) - gamma * P_pi)

In [5]:
# Print the closed-form SR first, then the learned estimate, for comparison.
for sr_matrix in (exact_Phi_pi, approx_Phi_pi):
    print(sr_matrix)
# Signed sum of the element-wise differences. Positive and negative errors can
# cancel here, so a small value does not by itself mean a close match.
print((exact_Phi_pi - approx_Phi_pi).sum())

[[ 3.22456814  1.45873321  1.45873321 13.85796545]
 [ 1.45873321  2.56466502  0.65990312 15.31669866]
 [ 1.45873321  0.65990312  2.56466502 15.31669866]
 [ 0.          0.          0.         20.        ]]
[[ 3.10632463  1.44504651  1.39367095 13.61328782]
 [ 1.08343326  2.41446027  0.49810384 15.52523091]
 [ 1.56773226  0.72503507  2.49549648 14.74254121]
 [ 0.          0.          0.         19.48479095]]
1.9048458385950555


In [6]:
# Inspect one early entry of the run (index 66). The recorded output shows it
# is a dict with 'Phi_pi', 'experience' and 'V_pi' entries.
early_snapshot = result[66]
print(early_snapshot)

{'Phi_pi': array([[2.32167885e-01, 5.62850188e-03, 3.64757846e-03, 4.02062344e-05],
       [1.88439362e-03, 1.34663329e-01, 1.72261990e-06, 1.40814023e-03],
       [2.18985306e-03, 5.35193998e-05, 1.35555770e-01, 2.15107093e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.39545909e-01]]), 'experience': {'S': 0, 'A': 0, 'R': -1.0, 'S_prime': 0}, 'V_pi': array([[-0.84101987],
       [-0.36081363],
       [-0.37602802],
       [ 0.        ]])}


## Question 2: Value Prediction with SR


In [7]:
# Per-state reward vector as a (4, 1) column so it can be right-multiplied by
# a 4x4 SR matrix.
# NOTE(review): presumably the expected one-step reward of each state under
# the policy -- confirm against the maze's reward definition.
per_state_reward = np.array([-1, -0.75, -0.75, 0.0])
r_pi = per_state_reward.reshape(-1, 1)
print(r_pi)


[[-1.  ]
 [-0.75]
 [-0.75]
 [ 0.  ]]


In [8]:
# Value prediction from the SR: V_pi = Phi_pi @ r_pi, computed once with the
# closed-form SR and once with the learned SR.
exact_V_pi = exact_Phi_pi @ r_pi
approx_V_pi = approx_Phi_pi @ r_pi
for value_column in (exact_V_pi, approx_V_pi):
    print(value_column)

[[-5.41266795]
 [-3.87715931]
 [-3.87715931]
 [ 0.        ]]
[[-5.23536273]
 [-3.26785635]
 [-3.98313092]
 [ 0.        ]]


In [9]:
# Value estimate stored by the learner itself under the last step index, to
# set beside the SR-based predictions printed above.
learned_V_pi = result[terminal_step - 1]['V_pi']
print(learned_V_pi)

[[-4.84891614]
 [-2.67169042]
 [-3.65181079]
 [ 0.        ]]
