<a href="https://colab.research.google.com/github/rajanaids-hub/Reinforcement_Learning_Lab/blob/main/Gymnasium_Q_Learning_Agent_exp1_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML, display
import random

# NOTE: To run this locally or in Colab, you need the gymnasium library:
# pip install gymnasium[toy-text]

import gymnasium as gym

# =====================================================================
# 🎓 STUDENT EXPERIMENT SETTINGS 🎓
# =====================================================================
# Tweak these four values and re-run to see how the agent's learning changes.

# Number of games (episodes) the agent plays during training.
EPISODES: int = 2000

# Learning rate: how fast the agent learns from new events.
ALPHA: float = 0.8

# Discount factor: how much the agent cares about the future.
GAMMA: float = 0.95

# Exploration rate: increased to 50% to help the agent find the goal.
EPSILON: float = 0.5
# =====================================================================

def train_agent():
    """
    Trains a tabular Q-Learning agent on the FrozenLake environment.

    Reads the module-level settings EPISODES, ALPHA, GAMMA and EPSILON.

    Returns:
        numpy.ndarray: The learned Q-table, one row per state and one column
        per action (shape ``(env.observation_space.n, env.action_space.n)``).
    """
    # 1. Create the Environment
    # is_slippery=False means the agent goes exactly where it chooses.
    # (Challenge: Set it to True later to see what happens when the ice is slippery!)
    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)

    # try/finally guarantees the environment is closed even if training
    # is interrupted or an exception is raised mid-episode.
    try:
        # 2. Initialize the Q-Table
        # env.observation_space.n gives us the total number of states (16)
        # env.action_space.n gives us the total number of actions (4: Left, Down, Right, Up)
        q_table = np.zeros((env.observation_space.n, env.action_space.n))

        print(f"🧠 Training Q-Learning Agent for {EPISODES} episodes...")

        for _ in range(EPISODES):
            # Reset the environment to start a new game.
            # Modern Gymnasium returns a tuple: (state, info_dictionary)
            state, _ = env.reset()

            done = False
            while not done:
                # --- CHOOSE ACTION (Epsilon-Greedy) ---
                if random.uniform(0, 1) < EPSILON:
                    action = env.action_space.sample()  # Gym's built-in random action picker
                else:
                    # BREAK TIES RANDOMLY: If all Q-values are 0, pick a random direction
                    # instead of always picking 0 (Left) and getting stuck on the wall!
                    max_q = np.max(q_table[state, :])
                    best_actions = np.where(q_table[state, :] == max_q)[0]
                    action = random.choice(best_actions)

                # --- TAKE ACTION ---
                # Gym's step() returns 5 values:
                # 1. next_state: Where did we end up?
                # 2. reward: Did we get a point? (1 for goal, 0 otherwise)
                # 3. terminated: Did the game end naturally? (Hit goal or fell in hole)
                # 4. truncated: Did we run out of time? (Hit max steps)
                # 5. info: Extra debug info
                next_state, reward, terminated, truncated, _ = env.step(action)

                # The game is "done" if we terminated OR truncated
                done = terminated or truncated

                # --- Q-LEARNING UPDATE RULE ---
                # Notice how this differs from SARSA: We don't care what action we ACTUALLY take next.
                # We boldly assume we will take the absolute BEST action next (np.max).
                current_q = q_table[state, action]

                # A terminal state has no future, so its future value is 0.
                # On truncation (time limit) the game could have continued,
                # so we still bootstrap from next_state.
                if terminated:
                    max_future_q = 0.0
                else:
                    max_future_q = np.max(q_table[next_state, :])

                # Calculate new Q value
                new_q = current_q + ALPHA * (reward + GAMMA * max_future_q - current_q)
                q_table[state, action] = new_q

                state = next_state

        print("✅ Training Complete!")
    finally:
        env.close()

    return q_table

def test_and_animate(q_table):
    """
    Tests the trained agent and captures the frames to create a video.

    Plays one episode acting greedily on ``q_table`` (ties broken at random),
    records a rendered frame after every step, and displays the frames as an
    inline HTML/JavaScript animation.

    Args:
        q_table: Q-table indexed as ``q_table[state, action]``, as returned
            by ``train_agent``.

    Returns:
        None. The animation is shown via ``IPython.display.display``.
    """
    print("🎬 Recording the trained agent's performance...")

    # We must use render_mode="rgb_array" to capture image frames for our video
    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")

    # try/finally guarantees the environment is closed even if stepping or
    # rendering raises.
    try:
        state, _ = env.reset()

        # Start the video with the frame of the initial state
        frames = [env.render()]

        done = False
        while not done:
            # Pure exploitation: purely use the learned Q-Table
            max_q = np.max(q_table[state, :])
            best_actions = np.where(q_table[state, :] == max_q)[0]
            action = random.choice(best_actions)

            state, _, terminated, truncated, _ = env.step(action)

            # Capture the frame after moving
            frames.append(env.render())
            done = terminated or truncated
    finally:
        env.close()

    # --- Create the Animation ---
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.axis('off')  # Hide the axes (on this figure's axes, not the implicit "current" one)

    # Convert frames to matplotlib image plots
    ims = [[ax.imshow(frame, animated=True)] for frame in frames]

    # Stitch them together into a video
    anim = animation.ArtistAnimation(fig, ims, interval=500, blit=True, repeat_delay=1000)

    # Close exactly this figure so the notebook does not also show a static
    # copy of it below the animation.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

# =====================================================================
# 🚀 MAIN EXECUTION
# =====================================================================
# Runs only when this file is executed directly (script or notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    # 1. Train the agent and get the learned brain (Q-Table)
    learned_q_table = train_agent()

    # 2. Put the brain in a new agent and film it!
    test_and_animate(learned_q_table)

🧠 Training Q-Learning Agent for 2000 episodes...
✅ Training Complete!
🎬 Recording the trained agent's performance...
