<a href="https://colab.research.google.com/github/smitjiwani/reinforcement-learning/blob/main/rl5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import random

class GridWorld:
  """Square grid environment with one start cell and one goal cell.

  A state is a pair of coordinates, each expected to lie in ``range(size)``.
  An action is an index into ``self.action``, a list of coordinate offsets.
  """

  def __init__(self, size=4, start = (0,0), goal = (3,3)):
    """Store the grid dimensions and place the agent on ``start``."""
    self.size, self.start, self.goal = size, start, goal
    self.state = start
    # Offsets added to (state[0], state[1]) for action indices 0..3.
    self.action = [(0, 1), (1, 0), (0, -1), (-1, 0)]

  def reset(self):
    """Put the agent back on the start cell and return that state."""
    self.state = self.start
    return self.state

  def step(self, action):
    """Apply the offset for ``action`` and return ``(state, reward, done)``.

    A move that would leave the grid is ignored (the state is unchanged).
    The reward is 1 when the resulting state equals the goal, otherwise -0.1;
    ``done`` is True exactly when the resulting state equals the goal.
    """
    di, dj = self.action[action]
    i, j = self.state[0] + di, self.state[1] + dj
    inside_grid = 0 <= i < self.size and 0 <= j < self.size
    if inside_grid:
      self.state = (i, j)
    done = self.state == self.goal
    reward = 1 if done else -0.1
    return self.state, reward, done

# **Monte Carlo**

In [None]:

def monte_carlo(env, episode=5000, gamma=0.9, epsilon=0.1):
    """Estimate action values with first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    estimates; after each episode every (state, action) pair is updated once,
    using the discounted return that followed its FIRST occurrence.

    Args:
        env: object exposing ``size``, ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``. States must be
            ``(i, j)`` tuples with both coordinates in ``range(env.size)``.
        episode: number of episodes to sample.
        gamma: discount factor applied to future rewards.
        epsilon: probability of picking a uniformly random action.

    Returns:
        dict mapping ``((i, j), action)`` to the running mean of first-visit
        returns. Pairs never visited keep their initial value ``0``; visited
        pairs hold plain Python floats.
    """
    n_actions = 4
    Q = {((i, j), a): 0 for i in range(env.size) for j in range(env.size) for a in range(n_actions)}
    # Number of returns averaged into each Q entry (replaces the per-pair
    # list of returns so each update is O(1) instead of O(len(list))).
    visit_counts = {key: 0 for key in Q}

    for _ in range(episode):
        state = env.reset()
        trajectory = []   # (state, action, reward) per time step
        first_visit = {}  # (state, action) -> index of its earliest time step
        while True:
            if random.random() < epsilon:
                action = random.choice(range(n_actions))
            else:
                # Ties resolve to the lowest action index.
                action = max(range(n_actions), key=lambda a: Q[(state, a)])
            next_state, reward, done = env.step(action)
            first_visit.setdefault((state, action), len(trajectory))
            trajectory.append((state, action, reward))
            state = next_state
            if done:
                break

        # Walk backwards so G accumulates the discounted return from step t.
        G = 0
        for t in range(len(trajectory) - 1, -1, -1):
            state, action, reward = trajectory[t]
            G = gamma * G + reward
            # Only the earliest occurrence contributes. Tracking "seen" pairs
            # during the backward walk would instead select the LAST visit.
            if first_visit[(state, action)] == t:
                key = (state, action)
                visit_counts[key] += 1
                Q[key] += (G - Q[key]) / visit_counts[key]

    return Q

# **Temporal Difference**

In [None]:

def sarsa(env, episodes=5000, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with on-policy SARSA and an epsilon-greedy policy.

    Args:
        env: object exposing ``size``, ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``; states are
            ``(i, j)`` tuples with both coordinates in ``range(env.size)``.
        episodes: number of episodes to run.
        alpha: step size of each update.
        gamma: discount factor.
        epsilon: probability of picking a uniformly random action.

    Returns:
        dict mapping ``((i, j), action)`` to the learned value.
    """
    Q = {}
    for i in range(env.size):
        for j in range(env.size):
            for a in range(4):
                Q[((i, j), a)] = 0

    def choose(s):
        # Epsilon-greedy over Q; greedy ties resolve to the lowest index.
        if random.random() < epsilon:
            return random.choice(range(4))
        return max(range(4), key=lambda a: Q[(s, a)])

    for _ in range(episodes):
        state = env.reset()
        action = choose(state)
        done = False
        while not done:
            next_state, reward, done = env.step(action)
            # The action actually taken next is the one bootstrapped from.
            next_action = choose(next_state)
            td_target = reward + gamma * Q[(next_state, next_action)]
            Q[(state, action)] += alpha * (td_target - Q[(state, action)])
            state, action = next_state, next_action
    return Q

In [None]:

def q_learning(env, episodes=5000, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with off-policy Q-learning.

    Behaviour is epsilon-greedy; each update bootstraps from the largest
    action value of the next state regardless of the action taken there.

    Args:
        env: object exposing ``size``, ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``; states are
            ``(i, j)`` tuples with both coordinates in ``range(env.size)``.
        episodes: number of episodes to run.
        alpha: step size of each update.
        gamma: discount factor.
        epsilon: probability of picking a uniformly random action.

    Returns:
        dict mapping ``((i, j), action)`` to the learned value.
    """
    Q = dict.fromkeys(
        (((i, j), a) for i in range(env.size) for j in range(env.size) for a in range(4)),
        0,
    )
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # Explore with probability epsilon, otherwise act greedily
            # (ties resolve to the lowest action index).
            action = (
                random.choice(range(4))
                if random.random() < epsilon
                else max(range(4), key=lambda a: Q[(state, a)])
            )
            next_state, reward, done = env.step(action)
            best_next_q = max(Q[(next_state, a)] for a in range(4))
            key = (state, action)
            td_error = reward + gamma * best_next_q - Q[key]
            Q[key] += alpha * td_error
            state = next_state
    return Q

In [None]:

# Seed for the stdlib RNG. All three trainers draw their exploration
# randomness from the `random` module, so re-seeding before each run makes
# every Q-table reproducible on Restart & Run All and independent of how
# many random numbers the previous trainer consumed.
SEED = 42

# Initialize environment (each trainer calls env.reset() at the start of
# every episode, so one instance can be shared).
env = GridWorld()

# Train with Monte Carlo
random.seed(SEED)
Q_mc = monte_carlo(env)

# Train with SARSA
random.seed(SEED)
Q_sarsa = sarsa(env)

# Train with Q-learning
random.seed(SEED)
Q_q_learning = q_learning(env)

In [None]:

# Print a sample of each Q-table: the first 17 entries in the dict's
# insertion order.
for heading, q_table in (
    ("Sample Q-values for Monte Carlo:", Q_mc),
    ("Sample Q-values for SARSA:", Q_sarsa),
    ("Sample Q-values for Q-learning:", Q_q_learning),
):
    print(heading, dict(list(q_table.items())[:17]))


Sample Q-values for Monte Carlo: {((0, 0), 0): np.float64(0.0963000168336826), ((0, 0), 1): np.float64(0.1296902888395616), ((0, 0), 2): np.float64(0.020337589160000103), ((0, 0), 3): np.float64(0.020052882544594693), ((0, 1), 0): np.float64(0.16640000332856175), ((0, 1), 1): np.float64(0.25584389684000014), ((0, 1), 2): np.float64(0.042687242000000104), ((0, 1), 3): np.float64(0.0968351750000001), ((0, 2), 0): np.float64(-0.2709997424145312), ((0, 2), 1): np.float64(0.4580000000000002), ((0, 2), 2): np.float64(-0.999999973038651), ((0, 2), 3): 0, ((0, 3), 0): np.float64(-0.9999612734804219), ((0, 3), 1): np.float64(0.08001434315539939), ((0, 3), 2): np.float64(-0.9999995363461562), ((0, 3), 3): np.float64(-0.9999833295031642), ((1, 0), 0): np.float64(0.26552576374580134)}
Sample Q-values for SARSA: {((0, 0), 0): 0.08991569068500306, ((0, 0), 1): 0.13732403987598207, ((0, 0), 2): -0.002837208823490987, ((0, 0), 3): -0.0025913679190554266, ((0, 1), 0): -0.05930870420369839, ((0, 1), 1):