In [2]:
import numpy as np
import random

ModuleNotFoundError: No module named 'numpy'

In [None]:
class Gridworld:
    """Deterministic square gridworld with a single terminal (goal) state.

    States are integers in ``[0, size * size)`` laid out row-major, so state
    ``s`` sits at ``divmod(s, size)``.  Every episode starts in state 0 and
    ends when the agent enters the last state, ``size * size - 1``.  Each
    non-terminal move yields a reward of -1; moves that would leave the grid
    keep the agent on the boundary cell.
    """

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid with the agent in state 0."""
        self.size = size
        self.n_states = size * size
        self.n_actions = 4  # Up, Down, Left, Right
        self.state = 0
        # Only the last state (the goal) is terminal; state 0 is the start.
        self.terminal_states = [self.n_states - 1]

        # Action index -> (row delta, column delta).
        self.actions = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1)    # Right
        }

    def state_to_pos(self, s):
        """Convert state index to (row, col) position."""
        return divmod(s, self.size)

    def pos_to_state(self, row, col):
        """Convert (row, col) position to state index."""
        return row * self.size + col

    def step(self, action):
        """Apply ``action`` and return ``(new_state, reward, done, info)``.

        Stepping from a terminal state is a no-op that returns reward 0 and
        ``done=True``.  An ``action`` that is not a key of ``self.actions``
        raises ``KeyError``.
        """
        if self.state in self.terminal_states:
            return self.state, 0, True, {}

        row, col = self.state_to_pos(self.state)
        d_row, d_col = self.actions[action]
        # Clamp with plain int arithmetic so the state stays a Python int,
        # matching what reset() returns.
        last = self.size - 1
        new_row = min(max(row + d_row, 0), last)
        new_col = min(max(col + d_col, 0), last)
        new_state = self.pos_to_state(new_row, new_col)
        reward = -1
        done = new_state in self.terminal_states
        self.state = new_state
        return new_state, reward, done, {}

    def reset(self):
        """Put the agent back in state 0 and return that state."""
        self.state = 0
        return self.state

In [None]:
# Shared configuration for the tabular experiments in this notebook.
grid_size = 5                     # side length of the square grid
n_states = grid_size * grid_size  # one state per cell (25 for a 5x5 grid)
n_actions = 4                     # Up, Down, Left, Right
alpha = 0.1                       # learning rate
gamma = 0.9                       # discount factor
epsilon = 0.1                     # exploration probability
n_episodes = 500

# Exploration draws come from the stdlib `random` module, so seed it once
# here to make a Restart & Run All reproducible.
SEED = 0
random.seed(SEED)

# Action-value table with one row per state and one column per action.
Q = np.zeros((n_states, n_actions))

In [None]:

def epsilon_greedy(Q, state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is
    returned; otherwise the index of the largest entry of ``Q[state]``
    (the first one on ties, as ``np.argmax`` does).

    The number of actions is taken from the width of ``Q`` rather than from
    a module-level variable, so the function works for any Q-table passed in.
    """
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, Q.shape[1] - 1)
    return int(np.argmax(Q[state]))
    
def sarsa(env, alpha=0.1, gamma=0.9, epsilon=0.1, n_episodes=500):
    """Learn an action-value table with the SARSA update rule.

    Args:
        env: environment exposing ``n_states``, ``n_actions``, ``reset()``
            and ``step(action) -> (next_state, reward, done, info)``.
        alpha: step size applied to each TD error.
        gamma: discount factor applied to the next state-action value.
        epsilon: exploration probability passed to ``epsilon_greedy``.
        n_episodes: number of episodes to run.

    Returns:
        Array of shape ``(env.n_states, env.n_actions)`` holding the learned
        action values.
    """
    Q = np.zeros((env.n_states, env.n_actions))
    for _ in range(n_episodes):
        s = env.reset()
        a = epsilon_greedy(Q, s, epsilon)

        finished = False
        while not finished:
            s_next, r, finished, _info = env.step(a)
            # The next action is chosen before the update because the target
            # uses the value of the action that will actually be taken.
            a_next = epsilon_greedy(Q, s_next, epsilon)

            td_target = r + gamma * Q[s_next, a_next]
            Q[s, a] += alpha * (td_target - Q[s, a])

            s, a = s_next, a_next
    return Q

In [None]:
def q_learning(env, alpha=0.1, gamma=0.99, epsilon=0.1, n_episodes=500):
    """Learn an action-value table with the Q-learning update rule.

    Args:
        env: environment exposing ``n_states``, ``n_actions``, ``reset()``
            and ``step(action) -> (next_state, reward, done, info)``.
        alpha: step size applied to each TD error.
        gamma: discount factor applied to the best next-state value.
        epsilon: exploration probability passed to ``epsilon_greedy``.
        n_episodes: number of episodes to run.

    Returns:
        Array of shape ``(env.n_states, env.n_actions)`` holding the learned
        action values.
    """
    Q = np.zeros((env.n_states, env.n_actions))
    for _ in range(n_episodes):
        s = env.reset()
        finished = False
        while not finished:
            a = epsilon_greedy(Q, s, epsilon)
            s_next, r, finished, _info = env.step(a)
            # The target bootstraps from the largest value in the next
            # state's row, whatever action is taken next.
            best_next = Q[s_next].max()
            td_target = r + gamma * best_next
            Q[s, a] += alpha * (td_target - Q[s, a])
            s = s_next
    return Q


In [None]:
def td_zero_prediction(env, policy, alpha=0.1, gamma=0.99, n_episodes=500,
                       max_steps=None):
    """Estimate state values of a fixed policy with one-step TD updates.

    Args:
        env: environment exposing ``n_states``, ``reset()`` and
            ``step(action) -> (next_state, reward, done, info)``.
        policy: indexable mapping from state to the action taken there.
        alpha: step size applied to each TD error.
        gamma: discount factor applied to the next state's value.
        n_episodes: number of episodes to run.
        max_steps: optional cap on the number of steps per episode.  With
            the default ``None`` an episode only ends when the environment
            reports ``done``, so a deterministic policy that never reaches a
            terminal state would loop forever; pass a positive integer to
            truncate such episodes.

    Returns:
        1-D array of length ``env.n_states`` with the estimated values.
    """
    V = np.zeros(env.n_states)
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and (max_steps is None or steps < max_steps):
            action = policy[state]
            next_state, reward, done, _ = env.step(action)
            V[state] += alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
            steps += 1
    return V

    

In [None]:
env = Gridworld(size=5)

# Train both control algorithms on the same environment instance
# (SARSA first, then Q-learning).
Q_sarsa = sarsa(env)
Q_ql = q_learning(env)

# Greedy policies: for every state, the action with the largest learned value.
policy_sarsa = Q_sarsa.argmax(axis=1)
policy_ql = Q_ql.argmax(axis=1)

# Evaluate the greedy SARSA policy with TD(0).
# NOTE(review): the evaluation loop only ends an episode when the environment
# reports done, so a greedy policy that cycles would keep this call running.
V_sarsa = td_zero_prediction(env, policy_sarsa)

In [None]:
def print_policy_grid(env, policy, agent_state=None):
    """Print the grid with one symbol per cell, followed by a blank line.

    Symbol precedence per cell: 'A' if the cell is ``agent_state``, 'S' for
    state 0, 'G' for the last state, '-' for any other terminal state, and
    otherwise the arrow of the action ``policy`` assigns to that state.

    Args:
        env: object exposing ``size``, ``n_states``, ``terminal_states`` and
            ``pos_to_state(row, col)``.
        policy: indexable mapping from state to action index (0-3).
        agent_state: optional state to mark as the agent's position.
    """
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    def cell_symbol(s):
        # Checks are ordered by precedence; the first match wins.
        if agent_state is not None and s == agent_state:
            return 'A'
        if s == 0:
            return 'S'
        if s == env.n_states - 1:
            return 'G'
        if s in env.terminal_states:
            return '-'
        return arrows[policy[s]]

    for r in range(env.size):
        cells = [cell_symbol(env.pos_to_state(r, c)) for c in range(env.size)]
        print(' '.join(cells))
    print()

env = Gridworld(size=5)

# Train both agents on this environment and derive each greedy policy from
# the Q-table learned in this cell, so the printed grids match the tables.
Q = sarsa(env)
Q_ql = q_learning(env)
policy = np.argmax(Q, axis=1)        # greedy SARSA policy
policy_ql = np.argmax(Q_ql, axis=1)  # greedy Q-learning policy

print('SARSA-learned policy:')
# env.state is wherever training left the agent, so 'A' marks that cell.
print_policy_grid(env, policy, agent_state=env.state)

# Roll out the greedy SARSA policy from the start state, printing each step.
# Policy and environment are deterministic, so a rollout that has not
# finished after env.n_states steps has revisited a state and would never
# finish; stop there instead of looping forever.
state = env.reset()
done = False
steps = 0
while not done and steps < env.n_states:
    print_policy_grid(env, policy, agent_state=state)
    action = policy[state]
    state, _, done, _ = env.step(action)
    steps += 1
if not done:
    print("Greedy SARSA policy did not reach the goal (it revisits a state).")

print("Q-learning policy:")
print_policy_grid(env, policy_ql)


SARSA-learned policy:
S → → → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → A

A → → → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S A → → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S → A → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S → → A ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S → → → A
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S → → → ↓
↓ ↓ → ↓ A
↓ → ↓ → ↓
→ → ↓ ↓ ↓
→ → → → G

S → → → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → A
→ → ↓ ↓ ↓
→ → → → G

S → → → ↓
↓ ↓ → ↓ ↓
↓ → ↓ → ↓
→ → ↓ ↓ A
→ → → → G

Q-learning policy:
S ↓ → → ↓
↓ → → ↓ ↓
↓ ↓ → → ↓
↓ ↓ ↓ ↓ ↓
→ → → → G

