<a href="https://colab.research.google.com/github/saileepanchbhai/Advance-Machine-Learning-Lab/blob/main/RL2_%E2%80%93_Calculating_Optimal_Quantities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium



In [None]:
# Used for numerical operations and to create the Q-table (matrix of state-action values)
import numpy as np
#Import Gymnasium library
# Used to create and interact with Reinforcement Learning environments
import gymnasium as gym
# Import random module
# Used for implementing epsilon-greedy action selection (exploration)
import random


In [None]:
# Build the Frozen Lake grid world ("FrozenLake-v1", a 4x4 grid).
# is_slippery=True makes movement stochastic: the agent may slide and
# end up moving in a direction other than the one it selected.
env = gym.make("FrozenLake-v1", is_slippery=True)

# Sizes of the discrete observation and action spaces.
#   states  -> 16 for the 4x4 grid, numbered 0 to 15
#   actions -> 4: 0 = Left, 1 = Down, 2 = Right, 3 = Up
state_space, action_space = env.observation_space.n, env.action_space.n

print("State space:", state_space)

State space: 16


In [None]:
# Q-table: a (states x actions) matrix of state-action value estimates.
#   rows    -> states  (0 to 15 in the 4x4 Frozen Lake)
#   columns -> actions (0=Left, 1=Down, 2=Right, 3=Up)
# Every entry starts at 0; the training loop overwrites them as it learns.
Q = np.zeros(shape=(state_space, action_space), dtype=np.float64)

In [None]:
# =============================
# Q-Learning hyperparameters
# =============================

# --- Update rule ---------------------------------------------------------
# alpha (learning rate, 0..1): how strongly a new estimate overrides the old
#   one; higher values learn faster but can be unstable.
# gamma (discount factor): weight given to future rewards; values close to 1
#   make the agent value long-term reward strongly.
alpha, gamma = 0.8, 0.95

# --- Exploration schedule (epsilon-greedy) -------------------------------
# epsilon is the probability of picking a random action. It starts at 1.0
# (pure exploration), is multiplied by epsilon_decay after every episode, and
# is never allowed below min_epsilon so the agent always explores a little.
epsilon = 1.0
epsilon_decay = 0.995
min_epsilon = 0.01

# --- Training budget -----------------------------------------------------
# episodes : how many episodes the agent plays during training.
# max_steps: cap on steps per episode, so an episode that never reaches the
#            goal cannot loop forever.
episodes = 5000
max_steps = 100

In [None]:
# -----------------------------
# Training the Agent
# -----------------------------

# Fixed seed so that a fresh "Restart & Run All" reproduces the same run.
SEED = 42
random.seed(SEED)            # drives the epsilon-greedy coin flip below
env.action_space.seed(SEED)  # drives env.action_space.sample()

# NOTE: this cell updates the existing Q-table and epsilon in place.
# Re-run the Q-table and hyperparameter cells first to train from scratch.

# Loop over all training episodes
for episode in range(episodes):

    # Start a new episode; Gymnasium's reset() returns (observation, info).
    # The environment RNG is seeded on the first reset only; later resets pass
    # seed=None so the same random stream simply continues.
    state, info = env.reset(seed=SEED if episode == 0 else None)

    # done = True means the episode is over (terminated or truncated)
    done = False

    # Loop for maximum steps allowed in one episode
    for step in range(max_steps):

        # -----------------------------
        # Epsilon-Greedy Action Selection
        # -----------------------------
        # With probability epsilon → random action (exploration)
        # Otherwise → best known action from the Q-table (exploitation)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))

        # Take the selected action. Gymnasium's step() returns 5 values and
        # keeps "terminated" (a true end state was reached) separate from
        # "truncated" (the episode was cut short, e.g. by a time limit).
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # -----------------------------
        # Q-Learning Update Rule (Bellman Equation)
        # -----------------------------
        # Q(s,a) = Q(s,a) + α [ R + γ max(Q(s',a')) - Q(s,a) ]
        # A terminal state has no future, so the bootstrap term is dropped
        # when the episode terminated. On truncation the next state is still
        # a regular state, so its value estimate is kept.
        future_value = 0.0 if terminated else np.max(Q[next_state])
        td_error = reward + gamma * future_value - Q[state, action]
        Q[state, action] += alpha * td_error

        # Move to next state
        state = next_state

        # If episode finished → stop this episode
        if done:
            break

    # -----------------------------
    # Reduce Exploration Rate
    # -----------------------------
    # Gradually shift from exploration to exploitation, never below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Training finished
print("Training Completed")

# Extract the greedy policy from the learned Q-table:
# for each state, the action with the highest Q-value
print("Optimal Policy:")
print(np.argmax(Q, axis=1))

Training Completed
Optimal Policy:
[0 3 3 3 0 0 3 0 3 1 0 0 0 2 1 0]
