In [None]:
import gymnasium as gym
import numpy as np
import random

env = gym.make("Blackjack-v1", sab=True)  # sab=True: play by the Sutton & Barto textbook rules

## Implement Q-Learning for Blackjack

#### Define Q-Learning Parameters

In [None]:
# Action-value table keyed by state; rows are created on first lookup.
Q_table = dict()

# Hyperparameters
episodes = 500_000      # How many training episodes to run
alpha = 0.1             # Step size applied to each Q-value correction
gamma = 0.9             # Weight given to estimated future reward

# Exploration schedule: begin at `epsilon`, shrink by the factor
# `epsilon_decay`, and never drop below `epsilon_min`.
epsilon_min = 0.1
epsilon_decay = 0.999
epsilon = 1.0

#### Q-Learning Algorithm

In [None]:
def get_Q(state, action=None):
    """Look up learned action values for ``state``, creating a zero row on first sight.

    Args:
        state: Hashable key into the module-level ``Q_table``.
        action: Optional action index (0 = Stick, 1 = Hit).

    Returns:
        When ``action`` is None, the stored two-element value array for
        ``state`` (the live array, not a copy). Otherwise the single value
        stored at index ``action``.
    """
    try:
        action_values = Q_table[state]
    except KeyError:
        # First visit: register the state with both action values at zero.
        action_values = Q_table[state] = np.zeros(2)

    if action is None:
        return action_values
    return action_values[action]

# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
# Mutates the module-level Q_table (via get_Q) and decays the global epsilon.
for episode in range(episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Epsilon-greedy action selection (get_Q creates the row for unseen states)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = int(np.argmax(get_Q(state)))  # Exploit

        # Take action and observe reward & next state
        next_state, reward, terminated, truncated, _ = env.step(action)
        # An episode is over when the environment terminates OR truncates it.
        done = terminated or truncated

        # Q-learning target. A terminal transition has no future return, so the
        # target is just the reward. Bootstrapping here would add the value of
        # whatever observation came back with the final step (which can be the
        # very state we just left) on top of the true outcome.
        if terminated:
            target = reward
        else:
            target = reward + gamma * np.max(get_Q(next_state))

        # Move the current estimate a fraction alpha toward the target.
        q_row = get_Q(state)
        q_row[action] += alpha * (target - q_row[action])

        # Move to next state
        state = next_state

    # Decay exploration rate once per episode, bounded below by epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Print progress every 50,000 episodes
    if episode % 50000 == 0:
        print(f"Episode {episode}: Epsilon {epsilon:.4f}")

print("Training complete!")


Episode 0: Epsilon 0.9990
Episode 50000: Epsilon 0.1000
Episode 100000: Epsilon 0.1000
Episode 150000: Epsilon 0.1000
Episode 200000: Epsilon 0.1000
Episode 250000: Epsilon 0.1000
Episode 300000: Epsilon 0.1000
Episode 350000: Epsilon 0.1000
Episode 400000: Epsilon 0.1000
Episode 450000: Epsilon 0.1000
Training complete!


#### Test the Trained Agent

In [5]:
# Play the learned greedy policy (no exploration, no learning) and tally outcomes.
wins, losses, draws = 0, 0, 0
test_episodes = 10_000

for _ in range(test_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Read-only lookup so evaluation never adds rows to the learned table.
        # A state never seen in training falls back to action 0 (Stick), which
        # is what argmax over an all-zero row would pick.
        q_values = Q_table.get(state)
        action = int(np.argmax(q_values)) if q_values is not None else 0

        next_state, reward, terminated, truncated, _ = env.step(action)
        # Stop on either end-of-episode signal from the environment.
        done = terminated or truncated
        state = next_state

    # Only the final step's reward is inspected to classify the game.
    if reward > 0:
        wins += 1
    elif reward < 0:
        losses += 1
    else:
        draws += 1

print(f"Results after {test_episodes} games:")
print(f"Wins: {wins} ({wins/test_episodes:.2%})")
print(f"Losses: {losses} ({losses/test_episodes:.2%})")
print(f"Draws: {draws} ({draws/test_episodes:.2%})")


Results after 10000 games:
Wins: 3946 (39.46%)
Losses: 5200 (52.00%)
Draws: 854 (8.54%)
