In [287]:
import numpy as np

In [288]:
# --- Grid layout: cells are addressed as (i, j) index pairs -----------------
GRID_SIZE = 5                      # both indices must lie in [0, GRID_SIZE)
START_STATE = (0, 0)               # NOTE(review): not referenced by the cells shown here
FOOD_STATE = (4, 4)                # terminal cell; entering it yields the +10 reward
ACTIONS = [(0, -1), (0, 1), (-1, 0), (1, 0)]   # (di, dj) offsets added to a cell
FORBIDDEN_FURNITURES = [(2, 1), (2, 2), (2, 3), (3, 2)]   # cells that can never be occupied
MONSTERS = [(0, 3), (4, 1)]        # cells whose entry reward is -8

# --- Learning hyper-parameters ----------------------------------------------
GAMMA = 0.925                      # discount applied to the next cell's value
MAX_ITERATIONS = 50                # NOTE(review): unused in the visible cells; confirm intent
DELTA = 0.0005                     # max-norm change threshold used as the stopping test
ALPHA = 0.05                       # TD step size

# --- Movement noise: probability of each outcome of a chosen action ---------
p = {
    'specified': 0.7,
    'right': 0.12,
    'left': 0.12,
    'sleepy': 0.06,
}

In [289]:
# Reference state values, indexed [i, j] like the grid. calculate_stats()
# compares the averaged TD(0) estimate against this table. The zero entries
# sit on the FORBIDDEN_FURNITURES cells and on the food cell (4, 4).
# NOTE(review): presumably produced by a separate value-iteration run — confirm.
value_iteration_v = np.asarray(
    (
        (2.6638, 2.9969, 2.8117, 3.6671, 4.8497),
        (2.9713, 3.5101, 4.0819, 4.8497, 7.1648),
        (2.5936, 0.0, 0.0, 0.0, 8.4687),
        (2.0992, 1.0849, 0.0, 8.6097, 9.5269),
        (1.0849, 4.9465, 8.4687, 9.5269, 0.0),
    ),
    dtype=float,
)

In [290]:
# Named (di, dj) offsets; directions refer to the grid as a printed 2-D array
# (i grows downward, j grows to the right).
_LEFT, _RIGHT, _UP, _DOWN = (0, -1), (0, 1), (-1, 0), (1, 0)

# Fixed policy evaluated by td_zero(): policy_mat[i][j] is the action taken in
# cell (i, j). None marks cells with no action (the furniture cells and the
# food cell).
policy_mat = [
    [_RIGHT, _DOWN, _LEFT, _DOWN, _DOWN],
    [_RIGHT, _RIGHT, _RIGHT, _RIGHT, _DOWN],
    [_UP, None, None, None, _DOWN],
    [_UP, _LEFT, None, _RIGHT, _DOWN],
    [_UP, _RIGHT, _RIGHT, _RIGHT, None],
]

In [291]:
def is_valid_state(i, j):
    """Return True when cell (i, j) can be occupied.

    A cell is rejected if either index falls outside [0, GRID_SIZE) or if the
    pair is listed in FORBIDDEN_FURNITURES.
    """
    off_grid = i < 0 or j < 0 or i >= GRID_SIZE or j >= GRID_SIZE
    return not (off_grid or (i, j) in FORBIDDEN_FURNITURES)

In [292]:
def random_initial_state():
    """Draw a uniformly random occupiable cell by rejection sampling.

    Cells are redrawn until is_valid_state() accepts one. The food cell is not
    excluded here, so callers that need a non-terminal start must filter it.

    Returns:
        tuple: an (i, j) pair.
    """
    def draw_cell():
        # Row index is drawn before the column index.
        row = np.random.randint(GRID_SIZE)
        col = np.random.randint(GRID_SIZE)
        return (row, col)

    cell = draw_cell()
    while not is_valid_state(*cell):
        cell = draw_cell()
    return cell

In [293]:
def getReward(state):
    """Return the reward attached to `state`.

    10 for the food cell, -8 for a monster cell, and -0.05 for every other
    cell. The food check takes precedence over the monster check.
    """
    if state == FOOD_STATE:
        return 10
    return -8 if state in MONSTERS else -0.05

In [294]:
def getTransitionProbabilities(action, p):
    """List the possible displacements for `action` with their probabilities.

    Args:
        action: a (di, dj) offset. Anything that does not equal (0, -1),
            (0, 1) or (-1, 0) is handled as (1, 0).
        p: mapping with the keys 'specified', 'left', 'right' and 'sleepy'.

    Returns:
        list: four (offset, probability) pairs, in the order
        intended move, veer left, veer right, stay in place.
    """
    # Each row: (intended offset, offset when veering left, offset when veering right).
    turn_table = (
        ((0, -1), (1, 0), (-1, 0)),
        ((0, 1), (-1, 0), (1, 0)),
        ((-1, 0), (0, -1), (0, 1)),
    )
    # Used for (1, 0) and for any action the table does not match.
    fallback = ((1, 0), (0, 1), (0, -1))

    heading, veer_left, veer_right = next(
        (row for row in turn_table if action == row[0]),
        fallback,
    )
    stay_put = (0, 0)

    return [
        (heading, p['specified']),
        (veer_left, p['left']),
        (veer_right, p['right']),
        (stay_put, p['sleepy']),
    ]

In [295]:
def _run_td_episode(value_func):
    """Play one episode under policy_mat, applying TD(0) updates to value_func in place."""
    # Start from a random occupiable cell, redrawing while it is the terminal cell.
    current_state = random_initial_state()
    while current_state == FOOD_STATE:
        current_state = random_initial_state()

    while current_state != FOOD_STATE:
        i, j = current_state

        # Sample the actual displacement from the noisy outcome of the policy's action.
        transitions = getTransitionProbabilities(policy_mat[i][j], p)
        moves, probs = zip(*transitions)
        move = moves[np.random.choice(len(moves), p=probs)]
        next_state = (i + move[0], j + move[1])

        # Moves that leave the grid or hit furniture leave the agent where it was.
        if not is_valid_state(*next_state):
            next_state = current_state

        # TD(0): nudge V(s) toward r + GAMMA * V(s'); the reward is that of the cell entered.
        reward = getReward(next_state)
        td_target = reward + (GAMMA * value_func[next_state])
        td_error = td_target - value_func[current_state]
        value_func[current_state] = value_func[current_state] + (ALPHA * td_error)

        current_state = next_state


def td_zero(num_runs=50):
    """Evaluate the fixed policy `policy_mat` with tabular TD(0), repeated over independent runs.

    Each run starts from an all-zero value table and keeps playing episodes
    until a single episode changes the table by less than DELTA in max-norm.

    Args:
        num_runs: number of independent runs to perform. The default of 50
            matches the count that was previously hard-coded.

    Returns:
        tuple: (all_value_funcs, episode_counts) where all_value_funcs is a
        list with one (GRID_SIZE, GRID_SIZE) array per run and episode_counts
        is a list with the number of episodes each run played before stopping.
    """
    all_value_funcs = []
    episode_counts = []

    for _ in range(num_runs):
        value_func = np.zeros((GRID_SIZE, GRID_SIZE))
        count = 0

        while True:
            count += 1
            before_episode = value_func.copy()

            _run_td_episode(value_func)

            # Stop once one whole episode barely moved any entry of the table.
            if np.max(np.abs(value_func - before_episode)) < DELTA:
                break

        all_value_funcs.append(value_func)
        episode_counts.append(count)

    return all_value_funcs, episode_counts

In [296]:
def calculate_stats(all_value_funcs, episode_counts):
    """Summarise a batch of TD(0) runs.

    Args:
        all_value_funcs: sequence of value tables, one per run.
        episode_counts: sequence of episode counts, one per run.

    Returns:
        tuple: (element-wise mean value table, mean episode count,
        standard deviation of the episode counts as computed by np.std,
        largest absolute gap between the mean table and value_iteration_v).
    """
    mean_values = np.mean(all_value_funcs, axis=0)
    episodes_mean, episodes_std = np.mean(episode_counts), np.std(episode_counts)
    worst_gap = np.abs(mean_values - value_iteration_v).max()
    return mean_values, episodes_mean, episodes_std, worst_gap

In [297]:
# Seed NumPy's global RNG (the one np.random.randint / np.random.choice draw
# from) so that re-running this cell on a fresh kernel reproduces the figures.
np.random.seed(0)

# Run the TD(0) experiment and summarise it against the reference value table.
all_value_funcs, episode_counts = td_zero()
avg_value_func, avg_episodes, std_episodes, max_norm_diff_final = calculate_stats(all_value_funcs, episode_counts)

print("******** TD(0) Results ********")
print(f"Step size (alpha): {ALPHA}")
print(f"Average value function:\n{avg_value_func}")
print(f"Max-norm difference from optimal value function: {max_norm_diff_final}")
print(f"Average episodes to converge: {avg_episodes}")
print(f"Standard deviation of episodes: {std_episodes}")

******** TD(0) Results ********
Step size (alpha): 0.05
Average value function:
[[2.57689945 2.94936797 2.73430937 3.34233355 4.62226352]
 [2.92017038 3.47198838 4.05922131 4.77991045 7.20378061]
 [2.51610129 0.         0.         0.         8.43594195]
 [1.99876111 0.85255582 0.         8.49809292 9.41378751]
 [0.87748029 4.72251056 8.34175815 9.37925267 0.        ]]
Max-norm difference from optimal value function: 0.324766452910056
Average episodes to converge: 7487.92
Standard deviation of episodes: 6383.217821255985
