In [None]:
!pip install gym

In [None]:
import random
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

In [None]:
# Build the environment the agent interacts with, looked up by its registered id.
env = gym.make("FrozenLake-v1")

In [None]:
# Tabular action-value function: one row per discrete state, one column per
# discrete action, every entry starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
Q = np.zeros(shape=(state_space_size, action_space_size))

In [None]:
# Training budget.
num_episodes = 10_000
max_steps_per_episode = 100

# Q-learning hyperparameters.
alpha = 0.1        # learning rate: weight given to the new estimate in the Q update
gamma = 0.99       # discount applied to the best next-state value
epsilon = 1        # current exploration rate (probability threshold for a random action)
max_epsilon = 1    # exploration rate at the start of the decay schedule
min_epsilon = 0.1  # floor the exploration rate decays towards
csi = 0.01         # decay rate of the exploration schedule

In [None]:
# Illustration of a linear exploration-rate schedule, y = -ξx + ɛ.
#
# The parameters below are local to this figure on purpose: the cell must not
# overwrite the training hyperparameters (`epsilon`, `csi`), otherwise the
# result of training would depend on whether this cell was run.
# NOTE(review): the training loop decays epsilon exponentially per episode
# towards `min_epsilon`, not linearly per step — confirm this figure is meant
# purely as an illustration.
slope = 0.01   # ξ (slope)
intercept = 1  # ɛ (intercept)

# 100 evenly spaced points covering 0..99 inclusive.
x = np.linspace(0, 99, 100)
y = -1 * slope * x + intercept  # Linear equation: y = -ξx + ɛ

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, label=r'$y = -\xi x + \epsilon$', color='blue', linewidth=2)

# Reference line at y = 0.01, the value the line reaches at x = 99.
ax.axhline(y=0.01, color='red', linestyle='--', label='y = 0.01')

# Title and axis labels so the figure stands on its own.
ax.set_title("Exploration Rate: y = -ξx + ɛ", fontsize=16)
ax.set_xlabel("Current Step", fontsize=14)
ax.set_ylabel("Exploration Rate", fontsize=14)

# Grid, explicit axes through the origin, and legend.
ax.grid(True, linestyle='--', alpha=0.7)
ax.axhline(0, color='black', linewidth=0.8)  # X-axis
ax.axvline(0, color='black', linewidth=0.8)  # Y-axis
ax.legend(fontsize=12)

plt.show()


In [None]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads:  env, Q, num_episodes, max_steps_per_episode, alpha, gamma,
#         max_epsilon, min_epsilon, csi
# Writes: Q (updated in place), epsilon, rewards_all_episodes
rewards_all_episodes = []

# Always start from the maximum exploration rate so that re-running this cell
# does not begin with whatever decayed value a previous run left behind.
epsilon = max_epsilon

for episode in range(num_episodes):
    # gym < 0.26 returns the observation from reset(); gym >= 0.26 returns an
    # (observation, info) tuple. Accept both so the state is always usable as
    # a row index into Q.
    reset_result = env.reset()
    s = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    current_episode_reward = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy action selection.
        r = random.uniform(0, 1)
        if r > epsilon:
            a = np.argmax(Q[s, :])  # Exploit: best known action for this state
        else:
            a = env.action_space.sample()  # Explore: random action

        # gym < 0.26: step() -> (obs, reward, done, info)
        # gym >= 0.26: step() -> (obs, reward, terminated, truncated, info)
        step_result = env.step(a)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        # Q-learning update: blend the old estimate with the bootstrapped target
        # reward + gamma * max_a' Q(new_state, a').
        Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (reward + gamma * np.max(Q[new_state, :]))

        # Advance to the next state and accumulate this episode's reward.
        s = new_state
        current_episode_reward += reward

        if done:
            break

    # Exponentially decay the exploration rate towards min_epsilon, once per episode.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-csi * episode)

    # Record the total reward collected in this episode.
    rewards_all_episodes.append(current_episode_reward)


In [None]:
# Summarise learning progress as the mean reward per block of episodes rather
# than dumping every per-episode reward into the cell output.
block_size = 1000
for start in range(0, len(rewards_all_episodes), block_size):
    block = rewards_all_episodes[start:start + block_size]
    print(f"Episodes {start + 1}-{start + len(block)}: mean reward {np.mean(block):.3f}")


In [None]:
# Display the Q-table after training: one row per state, one column per action.
Q