# Reinforcement Learning Demo: CartPole Balance

## Problem: Balance a pole on a cart
- **Goal**: Keep the pole upright as long as possible
- **Actions**: Push cart left (0) or right (1)
- **Reward**: +1 for each step the pole stays up
- **Episode ends**: Pole tilts too far, cart moves too far from center, or the 500-step limit is reached

In [None]:
!pip install gymnasium numpy matplotlib pandas

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
import pandas as pd

In [None]:
# Build the CartPole environment and summarize its interface
env = gym.make('CartPole-v1')

# The summary lines are evaluated and printed in order, one per line.
for summary_line in (
    f"Actions: {env.action_space.n} (0=Left, 1=Right)",
    f"Observations: {env.observation_space.shape}",
    "State: [cart_position, cart_velocity, pole_angle, pole_velocity]",
):
    print(summary_line)

In [None]:
# Baseline: an agent that ignores observations and acts at random
def test_random_agent(episodes=5):
    """Run a uniformly random policy on the global ``env`` and report scores.

    Args:
        episodes: Number of episodes to play.

    Returns:
        A list with the total reward collected in each episode.
    """
    scores = []
    for episode in range(episodes):
        env.reset()
        total_reward = 0

        # Play at most 500 steps, stopping early once the episode is over.
        steps_left = 500
        finished = False
        while steps_left > 0 and not finished:
            random_action = env.action_space.sample()
            _, reward, terminated, truncated, _ = env.step(random_action)
            total_reward += reward
            finished = terminated or truncated
            steps_left -= 1

        scores.append(total_reward)
        print(f"Episode {episode + 1}: {total_reward} steps")

    print(f"Random agent average: {np.mean(scores):.1f} steps")
    return scores


random_scores = test_random_agent()

In [None]:
# Q-Learning Agent
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Continuous observations are mapped to a tuple of bin indices, and a
    table of action values is kept per discretized state.

    Args:
        n_actions: Number of discrete actions available.
        lr: Learning rate applied to each temporal-difference update.
        gamma: Discount factor for future rewards.
        epsilon: Initial probability of choosing a random action.
        epsilon_decay: Multiplicative factor applied to ``epsilon`` at the
            end of each episode.
        epsilon_min: Floor below which ``epsilon`` is never decayed.
    """

    # Number of equal-width bins per observation dimension.
    BINS = (10, 10, 10, 10)
    # Clipping range per dimension, in observation order:
    # [cart_position, cart_velocity, pole_angle, pole_velocity].
    RANGES = ((-2.4, 2.4), (-3, 3), (-0.2, 0.2), (-3, 3))

    def __init__(self, n_actions=2, lr=0.1, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01):
        self.n_actions = n_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # Unseen states start with all-zero action values.
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

    def discretize_state(self, state):
        """Map a continuous observation to a tuple of bin indices.

        Each value is clipped to its range and assigned to one of the
        equal-width bins, so every index in ``0 .. bins - 1`` covers the
        same share of the range.

        Args:
            state: Sequence of observation values, one per dimension.

        Returns:
            A tuple of ints usable as a key into ``q_table``.
        """
        discrete = []
        for val, n_bins, (low, high) in zip(state, self.BINS, self.RANGES):
            clipped = max(low, min(high, val))  # Clip to range
            index = int((clipped - low) / (high - low) * n_bins)
            # A value exactly at the upper bound would land in bin
            # ``n_bins``; fold it into the last valid bin.
            discrete.append(min(index, n_bins - 1))

        return tuple(discrete)

    def choose_action(self, state):
        """Pick an action with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest stored value for the
        discretized state is returned.

        Args:
            state: Continuous observation for the current step.

        Returns:
            The chosen action as a plain ``int``.
        """
        discrete_state = self.discretize_state(state)

        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        # Convert from a NumPy integer so both branches return the same type.
        return int(np.argmax(self.q_table[discrete_state]))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update and decay epsilon at episode end.

        Args:
            state: Observation before the action was taken.
            action: Action index that was taken.
            reward: Reward received for the transition.
            next_state: Observation after the action was taken.
            done: True when this transition ended the episode; the target
                then contains no bootstrapped future value.
        """
        discrete_state = self.discretize_state(state)
        discrete_next_state = self.discretize_state(next_state)

        current_q = self.q_table[discrete_state][action]
        next_max_q = 0 if done else np.max(self.q_table[discrete_next_state])
        target = reward + self.gamma * next_max_q

        self.q_table[discrete_state][action] = current_q + self.lr * (target - current_q)

        # Decay exploration once per finished episode. Decaying on every
        # step would drive epsilon to its floor within the first few dozen
        # episodes, long before the table has been explored.
        if done and self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


print("Q-Learning Agent created!")

In [None]:
# Train the Q-learning agent for 1000 episodes, recording progress
agent = QLearningAgent()
scores = []    # total reward per training episode
epsilons = []  # exploration rate recorded after each episode

print("Training agent...")
for episode in range(1000):
    observation, _ = env.reset()
    total_reward = 0

    # Interact for at most 500 steps or until the episode ends.
    steps_remaining = 500
    done = False
    while steps_remaining > 0 and not done:
        action = agent.choose_action(observation)
        next_observation, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Update the agent from this transition before moving on.
        agent.learn(observation, action, reward, next_observation, done)

        observation = next_observation
        total_reward += reward
        steps_remaining -= 1

    scores.append(total_reward)
    epsilons.append(agent.epsilon)

    # Report the average of the latest 100 episodes every 200 episodes.
    if (episode + 1) % 200 == 0:
        avg_score = np.mean(scores[-100:])
        print(f"Episode {episode + 1}: Avg = {avg_score:.1f}, Epsilon = {agent.epsilon:.3f}")

print("Training complete!")

In [None]:
# Test trained agent
def test_trained_agent(agent, episodes=10):
    """Evaluate ``agent`` greedily on the global ``env`` and report scores.

    Exploration is switched off for the duration of the evaluation and the
    agent's previous ``epsilon`` is restored afterwards, even if an episode
    raises an exception.

    Args:
        agent: Object exposing ``epsilon`` and ``choose_action(state)``.
        episodes: Number of evaluation episodes to play.

    Returns:
        A list with the total reward collected in each episode.
    """
    original_epsilon = agent.epsilon
    agent.epsilon = 0  # No exploration, pure exploitation

    scores = []
    try:
        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0

            for _ in range(500):
                action = agent.choose_action(state)
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward

                if terminated or truncated:
                    break

            scores.append(total_reward)
            print(f"Test {episode + 1}: {total_reward} steps")
    finally:
        # Always hand the agent back with its exploration rate intact, so a
        # failed evaluation does not leave it stuck in greedy mode.
        agent.epsilon = original_epsilon

    print(f"Trained agent average: {np.mean(scores):.1f} steps")
    return scores


trained_scores = test_trained_agent(agent)

In [None]:
# Visualize results: a 2x2 grid summarizing training and evaluation
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Training progress: raw score of every training episode
ax1.plot(scores, alpha=0.6)
ax1.set_title('Training Scores')
ax1.set_xlabel('Episode')
ax1.set_ylabel('Steps Survived')
ax1.grid(True)

# Moving average: smooths the noisy per-episode scores
# (the first `window - 1` entries are NaN and are not drawn)
window = 50
moving_avg = pd.Series(scores).rolling(window).mean()
ax2.plot(moving_avg, color='red', linewidth=2)
ax2.set_title(f'Moving Average (window={window})')
ax2.set_xlabel('Episode')
ax2.set_ylabel('Average Steps')
ax2.grid(True)

# Epsilon decay: exploration rate recorded after each training episode
ax3.plot(epsilons, color='green')
ax3.set_title('Exploration Rate (Epsilon)')
ax3.set_xlabel('Episode')
ax3.set_ylabel('Epsilon')
ax3.grid(True)

# Before vs After comparison
comparison_data = [random_scores, trained_scores]
labels = ['Random\n(Before)', 'Trained\n(After)']
ax4.boxplot(comparison_data)
# Name the boxes through the axis rather than boxplot's `labels` keyword,
# which Matplotlib 3.9 deprecated in favour of `tick_labels`; setting the
# tick labels directly works on both older and newer releases.
ax4.set_xticklabels(labels)
ax4.set_title('Performance Comparison')
ax4.set_ylabel('Steps Survived')
ax4.grid(True)

plt.tight_layout()
plt.show()

# Print improvement of the trained agent over the random baseline
improvement = np.mean(trained_scores) - np.mean(random_scores)
print(f"\nImprovement: {improvement:.1f} steps ({improvement/np.mean(random_scores)*100:.1f}% better)")
# An evaluation episode counts as a success when it scores at least 475.
print(f"Success rate: {sum(1 for s in trained_scores if s >= 475)/len(trained_scores)*100:.1f}%")

## Key Concepts Demonstrated:

1. **Trial and Error Learning**: Agent starts random, improves over time
2. **Q-Learning**: Learns state-action values through experience
3. **Exploration vs Exploitation**: Epsilon-greedy strategy balances trying random actions against exploiting the best-known action
4. **State Discretization**: Converts continuous observations to discrete states
5. **Reward Signal**: +1 per step guides the learning process

The agent learned to balance the pole much longer than an agent taking random actions!