# Assignment 6: Track 2 Project Report

## Team: dayan-hunt
* Nikhil Kakodkar: 260578689 | nikhil.kakodkar@mail.mcgill.ca
* Karim Koreitem: 260460964 | karim.koreitem@mail.mcgill.ca

In [4]:
%matplotlib inline
import numpy as np
from numpy.linalg import inv
from math import sqrt
import matplotlib.pyplot as plt
from sr_main import run_experiment, compute_transition_mtx

In [5]:
def init_default_config():
    config = {'switch_reward': False,
              'terminal_step' : 50000,
              'switch_reward_at_step' : 25000,
              'episode_length' : 20,
              'maze_type' : 'twobytwo',
              'maze_params' : {
                  'row' : 2,
                  'col' : 2,
                  'start_row' : 1,
                  'start_col' : 1
              },
              'learning_alg_params' : {
                  'alpha' : 0.01,
                  'gamma' : 0.9,
              },              
             }
    return config

def init_default_config_tenbyten():
    config = {'switch_reward': False,
              'terminal_step' : 50000,
              'switch_reward_at_step' : 25000,
              'episode_length' : 20,
              'maze_type' : 'tenbyten',
              'maze_params' : {
                  'row' : 10,
                  'col' : 10,
                  'start_row' : 1,
                  'start_col' : 1
              },
              'learning_alg_params' : {
                  'alpha' : 0.01,
                  'gamma' : 0.9,
              },              
             }
    return config

## Question 1: Learning SR using TD(0)

In [6]:
config = init_default_config_tenbyten()
result = run_experiment(config)
terminal_step = config['terminal_step']
approx_Phi_pi = result[terminal_step-1]['Phi_pi']

In [9]:
# Calculate exact phi_pi for 10x10 gridworld
state_len = config['maze_params']['row']*config['maze_params']['col']
exact_Phi_pi = np.zeros((state_len, state_len))
gamma = config['learning_alg_params']['gamma']

P_pi_twobytwo = np.array([[0.5, 0.25, 0.25, 0.],
                          [0.25, 0.5, 0., 0.25],
                          [0.25, 0., 0.5, 0.25],
                          [0., 0., 0., 1.]])

P_pi_tenbyten = compute_transition_mtx(config)

exact_Phi_pi = inv(np.identity(state_len)-gamma*P_pi_tenbyten)

In [10]:
print(exact_Phi_pi)
print(approx_Phi_pi)
print(np.sum(np.abs(exact_Phi_pi-approx_Phi_pi)))

[[2.74710869e+00 1.13535506e+00 4.96026726e-01 ... 6.26343000e-04
  3.36244745e-04 1.51310135e-03]
 [1.13535506e+00 2.10778035e+00 8.64460340e-01 ... 7.24522639e-04
  3.99060340e-04 1.84934610e-03]
 [4.96026726e-01 8.64460340e-01 1.98757006e+00 ... 9.17067301e-04
  5.26374194e-04 2.55321850e-03]
 ...
 [6.26343000e-04 7.24522639e-04 9.17067301e-04 ... 1.89800603e+00
  6.59457077e-01 1.80563231e+00]
 [3.36244745e-04 3.99060340e-04 5.26374194e-04 ... 6.59457077e-01
  1.63854846e+00 4.13290933e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+01]]
[[2.51713965 1.14752993 0.51632091 ... 0.         0.         0.        ]
 [0.91312156 2.05232677 0.96432905 ... 0.         0.         0.        ]
 [0.39548418 0.73046517 1.91757682 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.    

## Question 2: Value Prediction with SR


In [11]:
r1_pi = np.array([-1, -0.75, -0.75, 0.0])
r1_pi = r1_pi.reshape(4,1)
r2_pi = np.array([0., 0.25, 0.25, 0.])
r2_pi = r2_pi.reshape(4,1)

print(r1_pi)
print(r2_pi)



[[-1.  ]
 [-0.75]
 [-0.75]
 [ 0.  ]]
[[0.  ]
 [0.25]
 [0.25]
 [0.  ]]


In [12]:
exact_V_pi_R1 = exact_Phi_pi.dot(r1_pi)
approx_V_pi = approx_Phi_pi.dot(r1_pi)
td_V_pi = result[terminal_step-1]['V_pi']
print(exact_V_pi_R1)
print(approx_V_pi)
print(td_V_pi)

ValueError: shapes (100,100) and (4,1) not aligned: 100 (dim 1) != 4 (dim 0)

## Question 3: Tracking with SR

In [None]:
config['switch_reward'] = True
result = run_experiment(config)

state_len = config['maze_params']['row']*config['maze_params']['col']
terminal_step = config['terminal_step']
switch_step = config['switch_reward_at_step']
approx_Phi_pi = result[terminal_step-1]['Phi_pi']

In [None]:
exact_V_R1 = exact_V_pi_R1 
exact_V_R2 = exact_Phi_pi.dot(r2_pi)

V_pi_sr = np.zeros((state_len,1))
V_pi_td = np.zeros((state_len,1))

err_sr = []
err_regular_td = []

for s in range(terminal_step):
    if s <= switch_step:
        V_pi_sr = result[s]['Phi_pi'].dot(r1_pi)
        err_sr.append(np.sum(np.abs(V_pi_sr - exact_V_R1)))
        
        V_pi_td = result[s]['V_pi']
        err_regular_td.append(np.sum(np.abs(V_pi_td - exact_V_R1)))
    else:
        V_pi_sr = result[s]['Phi_pi'].dot(r2_pi)
        err_sr.append(np.sum(np.abs(V_pi_sr - exact_V_R2)))
    
        V_pi_td = result[s]['V_pi']
        err_regular_td.append(np.sum(np.abs(V_pi_td - exact_V_R2)))

In [None]:
# Plotting value function error for the two algorithms:
plt.ylabel('Value function error')
plt.xlabel('Number of steps')
plt.title('Value Function error comparison between SR and regular TD(0)')
plt.plot(err_sr, label='SR')
plt.plot(err_regular_td, label='TD0')
plt.axvline(x=switch_step, color='grey', linestyle='--')
plt.legend()

## Question 4: Eigendecomposition

In [None]:
eig_vals, eig_vecs = np.linalg.eig(exact_Phi_pi)
print(eig_vecs)

In [None]:
for vec in eig_vecs.T:
    new_vec = vec.reshape([2,2])
    print(new_vec)
    plt.figure()
    plt.imshow(new_vec)