# CartPole Reinforcement Learning
This notebook demonstrates a step-by-step Q-learning approach to solve the classic CartPole environment.

## 1. Environment Setup
In this section, we import the required libraries and create the CartPole environment.

In [None]:
import numpy as np  # Provides numerical functions and array support.
import gymnasium as gym  # Supplies the CartPole environment and other reinforcement learning tools.
from collections import defaultdict  # Allows dictionaries with default values for new keys.
# Shared environment handle: the agent, the training loop, and the evaluation
# cell below all use this single `env` object.
env = gym.make("CartPole-v1")  # Build the "CartPole-v1" environment through Gymnasium's factory.

## 2. Defining the Agent
Here we create a simple Q-learning agent that discretizes the continuous state space.

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent that bins a continuous observation vector.

    Observations are normalized against the environment's declared bounds and
    mapped to a tuple of integer bin indices, which is used as the key of the
    Q-table. Actions are chosen epsilon-greedily, and epsilon is decayed once
    per finished episode (whenever ``learn`` is called with ``done=True``).

    Args:
        env: Environment exposing ``observation_space.low``/``.high`` and
            ``action_space.n``/``.sample()``.
        buckets: Number of bins per observation dimension (one entry per dimension).
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        epsilon_min: Lower bound for epsilon.
        unbounded_limit: Finite magnitude substituted for observation bounds that
            are infinite or effectively infinite. Without this, normalizing by
            ``high - low`` produces NaN (conversion error) or a constant ratio,
            so those dimensions would never influence the discrete state.
    """

    # Declared bounds larger in magnitude than this (including +/-inf) are treated as unbounded.
    _UNBOUNDED_THRESHOLD = 1e6

    def __init__(self, env, buckets=(6, 12, 6, 12), alpha=0.1, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, unbounded_limit=3.5):
        self.env = env  # Kept for access to the action and observation spaces.
        self.buckets = buckets  # Bins per observation dimension.
        self.alpha = alpha  # Learning rate: how strongly a new target overrides the old estimate.
        self.gamma = gamma  # Discount factor for future rewards.
        self.epsilon = epsilon  # Current exploration probability.
        self.epsilon_decay = epsilon_decay  # Per-episode multiplicative decay of epsilon.
        self.epsilon_min = epsilon_min  # Floor for epsilon.
        self.unbounded_limit = unbounded_limit  # Stand-in magnitude for unbounded dimensions.
        self.Q = defaultdict(lambda: np.zeros(self.env.action_space.n))  # state -> array of action values.
        # Bounds never change during training, so compute them once instead of on every step.
        self._obs_low, self._obs_span = self._finite_bounds()

    def _finite_bounds(self):
        """Return ``(low, span)`` arrays with unbounded dimensions replaced by finite limits."""
        space = self.env.observation_space
        # float64 avoids overflow when subtracting float32 bounds near the dtype's maximum.
        low = np.asarray(space.low, dtype=np.float64)
        high = np.asarray(space.high, dtype=np.float64)
        limit = float(self.unbounded_limit)
        low = np.where(np.abs(low) <= self._UNBOUNDED_THRESHOLD, low, -limit)
        high = np.where(np.abs(high) <= self._UNBOUNDED_THRESHOLD, high, limit)
        span = high - low
        span = np.where(span > 0, span, 1.0)  # Guard against division by zero on degenerate dimensions.
        return low, span

    def discretize(self, obs):
        """Map a continuous observation to a tuple of bin indices (usable as a dict key).

        Each dimension ``i`` is split into ``buckets[i]`` equal-width bins over its
        (finite) bounds; values outside the bounds fall into the first or last bin.
        """
        ratios = (np.asarray(obs, dtype=np.float64) - self._obs_low) / self._obs_span
        state = []
        for i, ratio in enumerate(ratios.tolist()):
            n_bins = self.buckets[i]
            # Scale by n_bins (not n_bins - 1) so every bin has the same width and the
            # last bin is reachable for more than the exact upper bound; then clamp.
            state.append(int(min(max(ratio * n_bins, 0), n_bins - 1)))
        return tuple(state)

    def choose_action(self, state):
        """Return a random action with probability epsilon, otherwise the greedy action."""
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()  # Explore.
        return int(np.argmax(self.Q[state]))  # Exploit the best known action.

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition and decay epsilon at episode end.

        When ``done`` is truthy, the next state's value is not bootstrapped (the
        target is just ``reward``) and epsilon is decayed towards ``epsilon_min``.
        """
        next_value = 0.0 if done else float(np.max(self.Q[next_state]))
        td_target = reward + self.gamma * next_value  # Bellman target.
        td_delta = td_target - self.Q[state][action]  # Temporal-difference error.
        self.Q[state][action] += self.alpha * td_delta
        if done:  # One decay step per finished episode.
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)


## 3. Training Loop
We repeatedly interact with the environment so the agent can learn good actions.

In [None]:
# Train a fresh agent on the shared environment for a fixed number of episodes.
agent = QLearningAgent(env)
n_episodes = 500
for episode in range(n_episodes):
    # Start a new episode from the environment's initial observation.
    obs, _ = env.reset()
    state = agent.discretize(obs)
    total_reward = 0  # Return collected in this episode.
    done = False
    while not done:
        # Act, observe the outcome, and fold the transition into the Q-table.
        action = agent.choose_action(state)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        next_state = agent.discretize(next_obs)
        done = terminated or truncated  # Either flag ends the episode.
        agent.learn(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state
    # Report the most recent episode's return after every 50th episode.
    if episode % 50 == 49:
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")


## 4. Evaluation
After training, we run one episode without exploration to evaluate the policy.

In [None]:
# Roll out a single greedy episode (no random actions) and report its return.
obs, _ = env.reset()
state = agent.discretize(obs)
total_reward = 0
done = False
while not done:
    action_values = agent.Q[state]  # Learned values for the current discrete state.
    action = int(np.argmax(action_values))  # Always take the highest-valued action.
    obs, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    done = terminated or truncated
    state = agent.discretize(obs)  # Re-bin the new observation for the next lookup.
print(f"Evaluation reward: {total_reward}")
env.close()  # Release the environment's resources.
