## Defining Grid Environment

### Simple grid with predefined Obstacles, starting Position & Goal

In [None]:
import numpy as np
import random

In [None]:
# define class grid
class GridWorld:
    def __init__(self, size=(5, 5), start=(0, 0), goal=(4, 4), obstacles=[]):
        self.size = size
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        self.state = self.start

    def reset(self):
        self.state = self.start
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 'up':
            x -= 1
        elif action == 'down':
            x += 1
        elif action == 'left':
            y -= 1
        elif action == 'right':
            y += 1
        new_state = (max(0, min(x, self.size[0] - 1)), max(0, min(y, self.size[1] - 1)))
        if new_state in self.obstacles:
            new_state = self.state
        self.state = new_state
        reward = 1 if new_state == self.goal else -1
        return new_state, reward

    def is_terminal(self, state):
        return state == self.goal

## Defining Q-learning & SARSA Functions

### Parameter Initialization

In [None]:
# Parameters
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 0.1
ACTIONS = ['up', 'down', 'left', 'right']

# Q-table
def initialize_q_table(env):
    q_table = {}
    for x in range(env.size[0]):
        for y in range(env.size[1]):
            q_table[(x, y)] = {action: 0 for action in ACTIONS}
    return q_table

### Helper Functions

In [None]:
# Epsilon-greedy policy
def epsilon_greedy(q_table, state):
    if random.uniform(0, 1) < EPSILON:
        return random.choice(ACTIONS)
    else:
        return max(q_table[state], key=q_table[state].get)

In [None]:
# display grid with position, obstacles and goal
def print_grid(env, agent_position=None):
    grid = [[" " for _ in range(env.size[1])] for _ in range(env.size[0])]
    x, y = env.goal
    grid[x][y] = "G"  # Goal
    for obs in env.obstacles:
        ox, oy = obs
        grid[ox][oy] = "X"  # Obstacle
    if agent_position:
        ax, ay = agent_position
        grid[ax][ay] = "A"  # Agent
    for row in grid:
        print(" | ".join(row))
    print("\n")

### TD(0) with Q-learning and SARSA

In [None]:
def q_learning_td0(env, q_table, episodes=500):
    for episode in range(episodes):
        state = env.reset()

        if episode % 100 == 0:
            print(f"\nEpisode {episode}: Starting new episode...\n")
            print_grid(env, agent_position=state)

        while not env.is_terminal(state):
            action = epsilon_greedy(q_table, state)
            next_state, reward = env.step(action)
            next_action = max(q_table[next_state], key=q_table[next_state].get)
            old_value = q_table[state][action]
            q_table[state][action] += ALPHA * (reward + GAMMA * q_table[next_state][next_action] - q_table[state][action])

            if episode % 100 == 0:
                print(f"Updated Q-value for state {state}, action '{action}': {old_value} -> {q_table[state][action]}")
            state = next_state

    return q_table

In [None]:
# SARSA TD(0)
def sarsa_td0(env, q_table, episodes=500):
    for episode in range(episodes):
        state = env.reset()
        action = epsilon_greedy(q_table, state)

        if episode % 100 == 0:
            print(f"\nEpisode {episode}: Starting new episode...\n")
            print_grid(env, agent_position=state)

        while not env.is_terminal(state):
            next_state, reward = env.step(action)
            next_action = epsilon_greedy(q_table, next_state)
            old_value = q_table[state][action]
            q_table[state][action] += ALPHA * (reward + GAMMA * q_table[next_state][next_action] - q_table[state][action])

            if episode % 100 == 0:
                print(f"Updated Q-value for state {state}, action '{action}': {old_value} -> {q_table[state][action]}")
            state, action = next_state, next_action

    return q_table

### TD(N) with Q-learning and SARSA (N=2)

In [35]:
def q_learning_td2(env, q_table, episodes=500):
    for episode in range(episodes):
        state = env.reset()
        trajectory = []

        if episode % 100 == 0:
            print(f"\nEpisode {episode}: Starting new episode...\n")
            print_grid(env, agent_position=state)

        while not env.is_terminal(state):
            action = epsilon_greedy(q_table, state)
            next_state, reward = env.step(action)

            trajectory.append((state, action, reward))
            if len(trajectory) > 2:
                trajectory.pop(0)

            if len(trajectory) == 2:
                (state_t, action_t, reward_t1) = trajectory[0]
                (state_t2, action_t2, reward_t2) = trajectory[1]

                old_value = q_table[state_t][action_t]
                q_table[state_t][action_t] += ALPHA * (
                    reward_t1 + GAMMA * reward_t2 + GAMMA**2 * q_table[state_t2][action_t2] - old_value
                )

                if episode % 100 == 0:
                    print(f"Updated Q-value for state {state_t}, action '{action_t}': {old_value} -> {q_table[state_t][action_t]}")

            state = next_state

    return q_table

In [None]:
# SARSA TD(2)
def sarsa_td2(env, q_table, episodes=500):
    for episode in range(episodes):
        state = env.reset()
        action = epsilon_greedy(q_table, state)
        trajectory = [(state, action, 0)]
        if episode % 100 == 0:
            print(f"\nEpisode {episode}: Starting new episode...\n")
            print_grid(env, agent_position=state)
        while not env.is_terminal(state):
            next_state, reward = env.step(action)
            next_action = epsilon_greedy(q_table, next_state)
            trajectory.append((next_state, next_action, reward))
            if len(trajectory) > 2:
                state_t, action_t, _ = trajectory.pop(0)
                _, _, reward_t1 = trajectory[0]
                state_t2, action_t2, _ = trajectory[1]
                old_value = q_table[state_t][action_t]
                q_table[state_t][action_t] += ALPHA * (reward_t1 + GAMMA * reward + GAMMA**2 * q_table[state_t2][action_t2] - old_value)
                if episode % 100 == 0:
                    print(f"Updated Q-value for state {state_t}, action '{action_t}': {old_value} -> {q_table[state_t][action_t]}")
            state, action = next_state, next_action
    return q_table

### Path post Training

In [None]:
def display_agent_path(env, q_table):
    state = env.reset()
    path = [state]

    print("\nAgent's path to goal after training:\n")
    while not env.is_terminal(state):
        print_grid(env, agent_position=state)

        # choose best action based on learned Q-table
        action = max(q_table[state], key=q_table[state].get)
        state, _ = env.step(action)
        path.append(state)

        # end path if stuck in a loop (to handle bad policies)
        if len(path) > 100:  # Arbitrary loop detection limit
            print("Loop detected, stopping path display.")
            break

    print("Final Path:", path)
    print_grid(env, agent_position=state)

## Deploying the Agent

In [None]:
env = GridWorld(size=(5, 5), start=(0, 0), goal=(4, 4), obstacles=[(1, 1), (2, 2), (3, 3)])
q_table = initialize_q_table(env)

In [None]:
for i in q_table.items():
  print(i)

((0, 0), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((0, 1), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((0, 2), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((0, 3), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((0, 4), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((1, 0), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((1, 1), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((1, 2), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((1, 3), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((1, 4), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((2, 0), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((2, 1), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((2, 2), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((2, 3), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((2, 4), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((3, 0), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((3, 1), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((3, 2), {'up': 0, 'down': 0, 'left': 0, 'right': 0})
((3, 3), {'up': 0, 'down': 0

### Train with Q-learning TD(0)

In [31]:
print("\nTraining with Q-learning TD(0):\n")
q_learning_td0_q_table = q_learning_td0(env, q_table, episodes=500)


Training with Q-learning TD(0):


Episode 0: Starting new episode...

A |   |   |   |  
  | X |   |   |  
  |   | X |   |  
  |   |   | X |  
  |   |   |   | G


Updated Q-value for state (0, 0), action 'up': 0 -> -0.1
Updated Q-value for state (0, 0), action 'down': 0 -> -0.1
Updated Q-value for state (1, 0), action 'up': 0 -> -0.1
Updated Q-value for state (0, 0), action 'left': 0 -> -0.1
Updated Q-value for state (0, 0), action 'left': -0.1 -> -0.19
Updated Q-value for state (0, 0), action 'right': 0 -> -0.1
Updated Q-value for state (0, 1), action 'up': 0 -> -0.1
Updated Q-value for state (0, 1), action 'left': 0 -> -0.10900000000000001
Updated Q-value for state (0, 0), action 'up': -0.1 -> -0.199
Updated Q-value for state (0, 0), action 'down': -0.1 -> -0.19
Updated Q-value for state (1, 0), action 'down': 0 -> -0.1
Updated Q-value for state (2, 0), action 'up': 0 -> -0.1
Updated Q-value for state (1, 0), action 'left': 0 -> -0.1
Updated Q-value for state (1, 0), action 'right': 

In [32]:
# Train with SARSA TD(0)
print("\nTraining with SARSA TD(0):\n")
sarsa_td0_q_table = sarsa_td0(env, q_table, episodes=500)


Training with SARSA TD(0):


Episode 0: Starting new episode...

A |   |   |   |  
  | X |   |   |  
  |   | X |   |  
  |   |   | X |  
  |   |   |   | G


Updated Q-value for state (0, 0), action 'up': -5.126229559267231 -> -5.139861018789744
Updated Q-value for state (0, 0), action 'down': -4.736160171658178 -> -4.736247917643473
Updated Q-value for state (1, 0), action 'down': -4.15226403501236 -> -4.152328834916348
Updated Q-value for state (2, 0), action 'down': -3.5032355933913846 -> -3.503307253452103
Updated Q-value for state (3, 0), action 'right': -2.782169104442851 -> -2.8346227989359365
Updated Q-value for state (3, 1), action 'left': -2.5630067215263423 -> -2.6571016133793397
Updated Q-value for state (3, 0), action 'down': -2.7821729333959047 -> -2.7822412392871967
Updated Q-value for state (4, 0), action 'right': -1.9809511025653632 -> -1.9809559925110227
Updated Q-value for state (4, 1), action 'right': -1.09000000224662 -> -1.090000002030856
Updated Q-value for state

### Train with Q-learning TD(2)

In [36]:
q_table = initialize_q_table(env)

print("\nTraining with Q-learning TD(2):\n")
q_learning_td2_q_table = q_learning_td2(env, q_table, episodes=500)


Training with Q-learning TD(2):


Episode 0: Starting new episode...

A |   |   |   |  
  | X |   |   |  
  |   | X |   |  
  |   |   | X |  
  |   |   |   | G


Updated Q-value for state (0, 0), action 'up': 0 -> -0.19
Updated Q-value for state (0, 0), action 'up': -0.19 -> -0.361
Updated Q-value for state (0, 0), action 'down': 0 -> -0.19
Updated Q-value for state (1, 0), action 'up': 0 -> -0.19
Updated Q-value for state (0, 0), action 'left': 0 -> -0.19
Updated Q-value for state (0, 0), action 'left': -0.19 -> -0.361
Updated Q-value for state (0, 0), action 'right': 0 -> -0.19
Updated Q-value for state (0, 1), action 'up': 0 -> -0.19
Updated Q-value for state (0, 1), action 'up': -0.19 -> -0.361
Updated Q-value for state (0, 1), action 'down': 0 -> -0.19
Updated Q-value for state (0, 1), action 'down': -0.19 -> -0.361
Updated Q-value for state (0, 1), action 'left': 0 -> -0.20539000000000002
Updated Q-value for state (0, 0), action 'down': -0.19 -> -0.361
Updated Q-value for state 

In [37]:
sarsa_td0_q_table = sarsa_td2(env, q_table, episodes=500)


Episode 0: Starting new episode...

A |   |   |   |  
  | X |   |   |  
  |   | X |   |  
  |   |   | X |  
  |   |   |   | G


Updated Q-value for state (0, 0), action 'down': -7.298327537545865 -> -7.23445142003769
Updated Q-value for state (1, 0), action 'down': -6.659225772551028 -> -6.604616236830273
Updated Q-value for state (2, 0), action 'right': -5.876007854893968 -> -5.954363705650982
Updated Q-value for state (2, 1), action 'left': -5.2013955744981155 -> -5.269264363003364
Updated Q-value for state (2, 0), action 'right': -5.954363705650982 -> -5.849841597563477
Updated Q-value for state (2, 1), action 'down': -4.9136832833958 -> -4.791271890098189
Updated Q-value for state (3, 1), action 'down': -3.7149908947851102 -> -3.560640328102451
Updated Q-value for state (4, 1), action 'right': -2.209344877061347 -> -2.178410389355212
Updated Q-value for state (4, 2), action 'right': -0.3351669480969355 -> -0.31165025328724194

Episode 100: Starting new episode...

A |   |   |   | 