<a href="https://colab.research.google.com/github/rajanaids-hub/Reinforcement_Learning_Lab/blob/main/Custom_Grid_Agent_Exp2_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML, display
import random

# =====================================================================
# 🎓 STUDENT EXPERIMENT SETTINGS 🎓
# =====================================================================
# Tunable settings for the experiment below. Change a value and re-run to
# see how the agent's training and final path are affected.
GRID_SIZE = 5           # Side length of the square grid (5 -> a 5x5 world, 25 states)
EPISODES = 500          # Number of training episodes (each runs until the goal or a pit is hit)
ALPHA = 0.5             # Learning rate: how much of each update error is applied to a Q-value
GAMMA = 0.9             # Discount factor: weight given to the best Q-value of the next state
EPSILON = 0.2           # Exploration rate: probability of a random action (20% chance to wander)
# =====================================================================

# 1. Build Our Own Custom Environment
class CustomGridEnv:
    """
    A custom grid world where the agent must avoid pits to reach the goal.
    Coordinates are (row, column). Top-Left is (0,0).

    The agent starts in the top-left cell and the goal is the bottom-right
    cell. Entering the goal or a pit ends the episode.

    Args:
        size: Side length of the square grid (must be at least 1).
        pits: Optional iterable of (row, col) pit coordinates. When omitted,
            DEFAULT_PITS is used. Entries that fall outside the grid, or that
            sit on the start or goal cell, are ignored so the layout stays
            valid when ``size`` is changed.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """

    # Pit layout used when the caller does not supply one.
    DEFAULT_PITS = ((1, 1), (2, 3), (3, 1))

    # Rewards handed out by step().
    GOAL_REWARD = 10.0    # 🎉 HUGE REWARD for winning!
    PIT_REWARD = -10.0    # 💥 HUGE PENALTY for falling in a pit!
    STEP_REWARD = -1.0    # ⏳ Small penalty just for wasting time (taking a step)

    def __init__(self, size=GRID_SIZE, pits=None):
        if size < 1:
            raise ValueError(f"size must be at least 1, got {size}")

        self.size = size
        self.start_pos = (0, 0)                 # Top-Left
        self.goal_pos = (size - 1, size - 1)    # Bottom-Right

        # Keep only pits that are real hazards on a grid of this size: inside
        # the grid and not on top of the start or goal cell.
        layout = self.DEFAULT_PITS if pits is None else pits
        reserved = (self.start_pos, self.goal_pos)
        self.pits = [
            (r, c)
            for r, c in layout
            if 0 <= r < size and 0 <= c < size and (r, c) not in reserved
        ]

        self.state_pos = self.start_pos

    def _get_state_id(self, pos):
        """Converts a 2D (row, col) coordinate into a 1D state ID for the Q-Table."""
        row, col = pos
        return row * self.size + col

    def reset(self):
        """Put the agent back at the start and return the start state's ID."""
        self.state_pos = self.start_pos
        return self._get_state_id(self.state_pos)

    def step(self, action):
        """Move the agent one cell and calculate the reward.

        Actions are 0: UP, 1: RIGHT, 2: DOWN, 3: LEFT. A move that would leave
        the grid is clamped, so the agent stays where it is. Any other action
        value also leaves the agent in place but still costs a step.

        Returns:
            A ``(state_id, reward, done)`` tuple for the cell the agent is in
            after the move.
        """
        r, c = self.state_pos
        last = self.size - 1

        # Apply movement logic, clamping at the grid edges.
        if action == 0:
            r = max(0, r - 1)       # 0: UP
        elif action == 1:
            c = min(last, c + 1)    # 1: RIGHT
        elif action == 2:
            r = min(last, r + 1)    # 2: DOWN
        elif action == 3:
            c = max(0, c - 1)       # 3: LEFT

        self.state_pos = (r, c)
        state_id = self._get_state_id(self.state_pos)

        # Check what happened after the move. The goal is tested first, so it
        # always wins over any other cell type.
        if self.state_pos == self.goal_pos:
            return state_id, self.GOAL_REWARD, True
        if self.state_pos in self.pits:
            return state_id, self.PIT_REWARD, True
        return state_id, self.STEP_REWARD, False

# 2. Build the Simple Q-Learning Agent
class SimpleAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        n_states: Number of rows in the Q-table (one per state ID).
        n_actions: Number of columns in the Q-table (one per action).
    """

    def __init__(self, n_states, n_actions):
        self.n_actions = n_actions
        # Every Q-value starts at zero.
        self.q_table = np.zeros((n_states, n_actions))

    def choose_action(self, state, epsilon):
        """Epsilon-Greedy choice: explore or exploit.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise one of the highest-valued actions for ``state`` is returned.

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        if random.random() < epsilon:
            return random.randrange(self.n_actions)

        # Break ties randomly to prevent getting stuck early on.
        q_row = self.q_table[state]
        best_actions = np.where(q_row == q_row.max())[0]
        # Choose from a plain list so both branches return a Python int.
        return int(random.choice(best_actions.tolist()))

    def learn(self, state, action, reward, next_state,
              alpha=None, gamma=None, done=False):
        """Q-Learning Update Rule.

        Args:
            state: State ID the action was taken from.
            action: Action index that was taken.
            reward: Reward received for the transition.
            next_state: State ID reached after the action.
            alpha: Learning rate. Falls back to the module-level ``ALPHA``
                setting when omitted.
            gamma: Discount factor. Falls back to the module-level ``GAMMA``
                setting when omitted.
            done: Set to True when ``next_state`` is terminal; the update then
                uses the reward alone instead of bootstrapping from
                ``next_state``'s Q-values.
        """
        if alpha is None:
            alpha = ALPHA
        if gamma is None:
            gamma = GAMMA

        current_q = self.q_table[state, action]
        # A terminal state has no future, so its value is zero.
        max_future_q = 0.0 if done else np.max(self.q_table[next_state])

        # Move the old estimate a fraction ``alpha`` of the way to the target.
        new_q = current_q + alpha * (reward + gamma * max_future_q - current_q)
        self.q_table[state, action] = new_q

# 3. Visualization Code
def animate_custom_grid(path, env):
    """Draws our custom grid world and animates the agent walking through it.

    Pits, the goal and the agent are drawn with coloured shapes and plain
    ASCII labels rather than emoji, because Matplotlib's default font has no
    emoji glyphs and would render them as empty boxes with a warning.

    Args:
        path: Sequence of (row, col) positions, one per animation frame.
        env: Environment exposing ``size``, ``pits``, ``goal_pos`` and
            ``start_pos``.

    The animation is converted to HTML/JS and shown with ``display``; nothing
    is returned.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    n = env.size

    def shade_cell(r, c, color):
        """Fill the cell at row ``r``, column ``c`` with a translucent colour."""
        ax.add_patch(plt.Rectangle((c - 0.5, r - 0.5), 1, 1, color=color, alpha=0.6))

    def update(frame):
        ax.clear()

        # Blank grid with a light blue background.
        ax.matshow(np.zeros((n, n)), cmap='Blues', alpha=0.1)

        agent_pos = path[frame]

        # Draw the static elements of the world.
        for r in range(n):
            for c in range(n):
                pos = (r, c)

                if pos in env.pits:
                    shade_cell(r, c, 'black')
                    label, label_color = "PIT", 'white'
                elif pos == env.goal_pos:
                    shade_cell(r, c, 'limegreen')
                    label, label_color = "GOAL", 'black'
                elif pos == env.start_pos:
                    label, label_color = "START", 'gray'
                else:
                    label, label_color = None, None

                # Skip the label under the agent so the two texts never
                # overlap; the cell shading still shows what the cell is.
                if label is not None and pos != agent_pos:
                    ax.text(c, r, label, va='center', ha='center',
                            color=label_color, fontsize=10, weight='bold')

        # Highlight the agent on top of whatever cell it occupies.
        agent_row, agent_col = agent_pos
        ax.add_patch(plt.Circle((agent_col, agent_row), 0.3, color='red', alpha=0.5))
        ax.text(agent_col, agent_row, "A", va='center', ha='center',
                color='black', fontsize=16, weight='bold')

        # Formatting: minor ticks sit on the cell borders and carry the grid
        # lines; major ticks are hidden.
        ax.set_xticks(np.arange(-0.5, n, 1), minor=True)
        ax.set_yticks(np.arange(-0.5, n, 1), minor=True)
        ax.grid(which="minor", color="black", linestyle='-', linewidth=2)
        ax.set_xticks([])
        ax.set_yticks([])

        ax.set_title(f"Custom Grid Agent | Step: {frame + 1}/{len(path)}", pad=20)

    anim = animation.FuncAnimation(fig, update, frames=len(path), interval=400, repeat=False)
    # Close the figure so only the animation is shown, not a static plot too.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

# =====================================================================
# 🚀 MAIN EXECUTION
# =====================================================================
if __name__ == "__main__":
    # Build the world and an agent with one Q-table row per grid cell and one
    # column per move (UP, RIGHT, DOWN, LEFT).
    env = CustomGridEnv(size=GRID_SIZE)
    agent = SimpleAgent(n_states=env.size * env.size, n_actions=4)

    print(f"🧠 Training the agent for {EPISODES} episodes...")

    # --- TRAINING LOOP ---
    # Each episode runs from the start cell until the goal or a pit is hit.
    for episode in range(EPISODES):
        state = env.reset()
        done = False

        while not done:
            action = agent.choose_action(state, EPSILON)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state)
            state = next_state

    print("✅ Training complete! Let's watch it play.")

    # --- TESTING LOOP ---
    # Put the agent back at start and turn off random exploration (epsilon = 0)
    state = env.reset()
    path = [env.state_pos]

    # Safety break to prevent infinite loops if agent fails. A path that never
    # revisits a cell is at most size*size steps long, so the cap scales with
    # the grid instead of being fixed for the 5x5 case.
    max_steps = env.size * env.size

    for _ in range(max_steps):
        action = agent.choose_action(state, epsilon=0.0)
        state, _reward, done = env.step(action)
        path.append(env.state_pos)
        if done:
            break

    animate_custom_grid(path, env)

🧠 Training the agent for 500 episodes...
✅ Training complete! Let's watch it play.


  display(HTML(anim.to_jshtml()))
  display(HTML(anim.to_jshtml()))
  display(HTML(anim.to_jshtml()))
