# 🎯 Reinforcement Learning Demo: CartPole Balance

## Problem Statement
**Goal**: Train an AI agent to balance a pole on a moving cart

**Environment**: CartPole-v1 from Gymnasium (the Farama Foundation's maintained successor to OpenAI Gym)
- **State**: [cart_position, cart_velocity, pole_angle, pole_angular_velocity]
- **Actions**: 2 discrete actions (0=Push Left, 1=Push Right)
- **Reward**: +1 for each timestep the pole remains upright
- **Episode End**: Pole angle beyond ±12°, cart position beyond ±2.4, or 500 steps reached
- **Success**: Average score ≥ 475 over 100 consecutive episodes

**Learning Algorithm**: Q-Learning with state discretization

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
from IPython.display import clear_output
import time

# Reaching this line means every import above resolved without raising.
print("✅ Libraries loaded successfully!")

## Step 1: Environment Setup

In [None]:
# Instantiate the CartPole task; `env` is the handle used to reset and inspect it.
env = gym.make('CartPole-v1')

# Summarize the environment in a single multi-line print.
print(
    "🎯 CartPole Environment Created!",
    f"   State Space: {env.observation_space}",
    f"   Action Space: {env.action_space} (0=Left, 1=Right)",
    f"   Max Episode Steps: 500",
    sep="\n",
)

# Smoke-test: reset once and show the initial observation component by component.
state, _ = env.reset()
print(
    f"\n📊 Sample State: {state}",
    "   [cart_pos, cart_vel, pole_angle, pole_vel]",
    f"   Cart Position: {state[0]:.3f}",
    f"   Cart Velocity: {state[1]:.3f}",
    f"   Pole Angle: {state[2]:.3f} radians ({np.degrees(state[2]):.1f}°)",
    f"   Pole Velocity: {state[3]:.3f}",
    sep="\n",
)

## Step 2: Q-Learning Agent with State Discretization

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent over a discretized continuous state.

    Each state dimension is clipped to a fixed range and mapped to one of
    ``n_bins`` equal-width bins; the resulting tuple of bin indices is the key
    into a Q-table that lazily creates a zero vector (one entry per action)
    for every state it has not seen before.

    Args:
        n_actions: Number of discrete actions.
        lr: Learning rate applied to the temporal-difference error.
        gamma: Discount factor for the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` in ``learn``.
        epsilon_min: Floor that decay will not push ``epsilon`` below.
        n_bins: Number of bins per state dimension.
        state_bounds: Sequence of ``(low, high)`` pairs, one per state
            dimension. Defaults to ``DEFAULT_STATE_BOUNDS``.

    Raises:
        ValueError: If ``n_bins`` is less than 1 or a bound has ``high <= low``.
    """

    # Clipping range per state dimension used when no bounds are supplied.
    DEFAULT_STATE_BOUNDS = (
        (-2.4, 2.4),    # Cart position
        (-3.0, 3.0),    # Cart velocity
        (-0.2, 0.2),    # Pole angle
        (-3.0, 3.0),    # Pole velocity
    )

    def __init__(self, n_actions=2, lr=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995,
                 epsilon_min=0.01, n_bins=10, state_bounds=None):
        if n_bins < 1:
            raise ValueError("n_bins must be at least 1")
        bounds = self.DEFAULT_STATE_BOUNDS if state_bounds is None else state_bounds
        bounds = tuple((float(low), float(high)) for low, high in bounds)
        if any(high <= low for low, high in bounds):
            raise ValueError("each state bound must satisfy low < high")

        self.n_actions = n_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.n_bins = n_bins
        self.state_bounds = bounds

        # Q-table keyed by discrete-state tuple; unseen states start at all zeros.
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

        # Tracking metrics (not written by this class; available to the caller).
        self.episode_rewards = []
        self.episode_lengths = []
        self.epsilons = []

    def discretize_state(self, state):
        """Convert a continuous state into a tuple of bin indices.

        Every value is clipped to its ``(low, high)`` range and assigned to one
        of ``n_bins`` equal-width bins, so each index lies in
        ``[0, n_bins - 1]``. Dimensions beyond the configured bounds are
        ignored (``zip`` stops at the shorter sequence).

        Args:
            state: Iterable of numeric state components.

        Returns:
            Tuple of ``int`` bin indices, usable as a Q-table key.
        """
        indices = []
        for value, (low, high) in zip(state, self.state_bounds):
            # Work in plain Python floats so results do not depend on the
            # input's NumPy dtype.
            clipped = max(low, min(high, float(value)))
            fraction = (clipped - low) / (high - low)
            # fraction == 1.0 (value at or above `high`) would land one past
            # the last bin, so fold it into the top bin.
            indices.append(min(int(fraction * self.n_bins), self.n_bins - 1))
        return tuple(indices)

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for the discretized
        state is returned (``np.argmax`` picks the lowest index on ties).

        Args:
            state: Continuous state, as accepted by ``discretize_state``.

        Returns:
            The chosen action as a Python ``int``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)  # Explore
        # Cast so both branches return the same type (argmax yields np.int64).
        return int(np.argmax(self.q_table[self.discretize_state(state)]))  # Exploit

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update and decay epsilon.

        The target is ``reward + gamma * max_a Q(next_state, a)``; the
        bootstrap term is dropped when ``done`` is true.

        NOTE(review): epsilon decays on every call to this method. If the
        caller invokes ``learn`` once per environment step, exploration decays
        per step rather than per episode - confirm that is intended.

        Args:
            state: Continuous state in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Continuous state reached after the action.
            done: Whether the transition ended the episode.
        """
        state_key = self.discretize_state(state)
        next_key = self.discretize_state(next_state)

        current_q = self.q_table[state_key][action]

        # A finished episode has no future value to bootstrap from.
        next_max_q = 0.0 if done else np.max(self.q_table[next_key])

        # Q-learning update rule
        target = reward + self.gamma * next_max_q
        self.q_table[state_key][action] = current_q + self.lr * (target - current_q)

        # Decay only while above the floor, and clamp so a decay step cannot
        # undershoot it. An epsilon already at or below the floor (e.g. 0 for
        # evaluation) is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Build an agent with the default hyperparameters and report them.
agent = QLearningAgent()
print(
    "🧠 Q-Learning Agent Created!",
    f"   Learning Rate: {agent.lr}",
    f"   Discount Factor: {agent.gamma}",
    f"   Initial Exploration: {agent.epsilon}",
    f"   Epsilon Decay: {agent.epsilon_decay}",
    sep="\n",
)