<a href="https://colab.research.google.com/github/sujithh1110/reinforcement-learning/blob/main/lab12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Fast Exploration Strategies Comparison (Tabular Q-Learning)

import gymnasium as gym
import numpy as np
import random

# -----------------------------
# Environment (very fast)
# -----------------------------
# A single shared environment instance; train() below resets and steps it.
# is_slippery=False selects the deterministic variant of FrozenLake.
env = gym.make("FrozenLake-v1", is_slippery=False)  # deterministic version
# Sizes of the discrete observation and action spaces; train() uses them as
# the (rows, columns) shape of the Q-table.
n_states = env.observation_space.n
n_actions = env.action_space.n

# -----------------------------
# Q-Learning Parameters
# -----------------------------
# Number of training episodes run per strategy.
EPISODES = 2000
# Learning rate: scales the temporal-difference error in the Q-update.
ALPHA = 0.1
# Discount factor applied to the best next-state Q-value.
GAMMA = 0.99

# Epsilon schedule for the epsilon-greedy strategy: epsilon starts at
# EPS_START and is multiplied by EPS_DECAY after every episode, never
# dropping below EPS_END.
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 0.999

# Q-values are divided by TEMP before the softmax is taken.
TEMP = 1.0   # softmax temperature


# -----------------------------
# Strategies
# -----------------------------
def epsilon_greedy(q_table, state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest value in ``q_table[state]`` is
    returned (ties resolve to the lowest index, as ``np.argmax`` does).

    Args:
        q_table: Table indexed as ``q_table[state]`` that yields one Q-value
            per action.
        state: Index of the current state.
        epsilon: Probability of taking a random action.

    Returns:
        The chosen action index as a plain ``int``.
    """
    action_values = q_table[state]
    if random.random() < epsilon:
        # Size the draw from the Q-table row itself so the function does not
        # rely on a module-level action count.
        return random.randrange(len(action_values))
    # np.argmax yields a NumPy integer; convert so both branches return int.
    return int(np.argmax(action_values))


def softmax_action(q_table, state, temp):
    """Sample an action from a softmax (Boltzmann) distribution over Q-values.

    The Q-values of ``state`` are divided by ``temp`` and turned into
    probabilities with a softmax; one action index is then drawn from that
    distribution.

    Args:
        q_table: Table indexed as ``q_table[state]`` that yields one Q-value
            per action.
        state: Index of the current state.
        temp: Softmax temperature; must be strictly positive.

    Returns:
        The sampled action index as a plain ``int``.

    Raises:
        ValueError: If ``temp`` is not strictly positive.
    """
    if temp <= 0:
        raise ValueError(f"temp must be positive, got {temp!r}")

    prefs = np.asarray(q_table[state], dtype=float) / temp
    # Shifting by the maximum leaves the softmax result unchanged but keeps
    # np.exp from overflowing to inf (which would yield NaN probabilities).
    exp_prefs = np.exp(prefs - np.max(prefs))
    probs = exp_prefs / np.sum(exp_prefs)
    return int(np.random.choice(len(probs), p=probs))


def random_action(q_table, state):
    """Return a uniformly random action index.

    ``q_table`` and ``state`` are accepted so the signature lines up with the
    other strategies, but neither is read; the number of actions comes from
    the module-level ``n_actions``.
    """
    action_count = n_actions
    return random.randrange(action_count)


# -----------------------------
# Training loop for each strategy
# -----------------------------
def train(strategy_name):
    """Run tabular Q-learning on the module-level ``env`` with one strategy.

    A fresh zero-initialised Q-table is trained for ``EPISODES`` episodes of
    at most 100 steps each. Actions are chosen by the strategy named in
    ``strategy_name``; for ``"epsilon_greedy"`` epsilon starts at
    ``EPS_START`` and is decayed once per episode down to ``EPS_END``.
    Progress is printed every 500 episodes.

    Args:
        strategy_name: One of ``"epsilon_greedy"``, ``"softmax"`` or
            ``"random"``.

    Returns:
        A ``(q_table, rewards)`` tuple: the learned Q-table and a list with
        the total reward collected in each episode.

    Raises:
        ValueError: If ``strategy_name`` is not a known strategy.
    """
    # All selectors share one call signature so the strategy is resolved once
    # here rather than through an if/elif chain on every step.
    selectors = {
        "epsilon_greedy": lambda q, s, eps: epsilon_greedy(q, s, eps),
        "softmax": lambda q, s, eps: softmax_action(q, s, TEMP),
        "random": lambda q, s, eps: random_action(q, s),
    }
    # Fail before any training starts; previously an unknown name left
    # `action` unbound and crashed inside the first episode.
    if strategy_name not in selectors:
        raise ValueError(
            f"Unknown strategy {strategy_name!r}; "
            f"expected one of {sorted(selectors)}"
        )
    select_action = selectors[strategy_name]
    decays_epsilon = strategy_name == "epsilon_greedy"

    print(f"\n=== Training with {strategy_name} ===")
    q_table = np.zeros((n_states, n_actions))
    epsilon = EPS_START
    rewards = []

    for ep in range(EPISODES):
        state, _ = env.reset()
        total_reward = 0

        # Each episode is capped at 100 steps.
        for _step in range(100):
            action = select_action(q_table, state, epsilon)

            next_state, reward, terminated, truncated, _ = env.step(action)

            # A terminal transition has no future return, so the target must
            # not bootstrap from next_state. A truncated episode still does.
            future_value = 0.0 if terminated else np.max(q_table[next_state])
            q_table[state, action] += ALPHA * (
                reward + GAMMA * future_value - q_table[state, action]
            )

            state = next_state
            total_reward += reward

            if terminated or truncated:
                break

        # Epsilon is only meaningful (and only decayed) for epsilon-greedy.
        if decays_epsilon:
            epsilon = max(EPS_END, epsilon * EPS_DECAY)

        rewards.append(total_reward)

        if ep % 500 == 0:
            print(f"Episode {ep}: avg reward (last 100) = {np.mean(rewards[-100:]):.2f}")

    return q_table, rewards


# -----------------------------
# Train all 3 strategies
# -----------------------------
# Train each strategy in turn, keeping its Q-table and per-episode rewards.
q_eps, r_eps = train("epsilon_greedy")
q_soft, r_soft = train("softmax")
q_rand, r_rand = train("random")

# Summarise each run by its mean reward over the final 200 episodes.
final_window = slice(-200, None)
avg_eps = np.mean(r_eps[final_window])
avg_soft = np.mean(r_soft[final_window])
avg_rand = np.mean(r_rand[final_window])

print("\n=== FINAL RESULTS (avg last 200 episodes) ===")
print(f"Epsilon Greedy : {avg_eps:.2f}")
print(f"Softmax        : {avg_soft:.2f}")
print(f"Random         : {avg_rand:.2f}")



=== Training with epsilon_greedy ===
Episode 0: avg reward (last 100) = 0.00
Episode 500: avg reward (last 100) = 0.35
Episode 1000: avg reward (last 100) = 0.46
Episode 1500: avg reward (last 100) = 0.79

=== Training with softmax ===
Episode 0: avg reward (last 100) = 0.00
Episode 500: avg reward (last 100) = 0.05
Episode 1000: avg reward (last 100) = 0.06
Episode 1500: avg reward (last 100) = 0.05

=== Training with random ===
Episode 0: avg reward (last 100) = 0.00
Episode 500: avg reward (last 100) = 0.00
Episode 1000: avg reward (last 100) = 0.01
Episode 1500: avg reward (last 100) = 0.01

=== FINAL RESULTS (avg last 200 episodes) ===
Epsilon Greedy : 0.84
Softmax        : 0.08
Random         : 0.03
