In [7]:
import gymnasium as gym

# Build the Blackjack environment; `sab=True` is forwarded to the environment
# (Sutton & Barto rule set according to the Gymnasium Blackjack docs).
env = gym.make(id="Blackjack-v1", sab=True)

# utils: readable label for each discrete action id
action_mapper = {
    0: "Stick",
    1: "Hit",
}

### Initialize the game 

In [8]:
from collections import defaultdict
import numpy as np 
import random

# Seed every random source this notebook draws from (Python's `random`, NumPy's
# global generator and the environment) so that Restart Kernel -> Run All deals
# the same hand and selects the same action each time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Q-table: maps a state to an array with one Q-value per action (Stick, Hit).
# States that have not been seen yet start at zero for both actions.
q_values = defaultdict(lambda: np.zeros(2))

# Start a new episode: reset returns the first observation and an info dict.
obs, info = env.reset(seed=SEED)
done = False  # episode-finished flag (not read by the cells shown here)

### E-greedy algorithm
With probability ε: choose a random action a ∈ A  
With probability 1 − ε: choose a = argmax<sub>a</sub> Q(s, a)

In [9]:
# Exploration rate ε: probability of taking a random action instead of the
# greedy one. 0.5 means a random action on half of the decisions.
# NOTE(review): this value used to be annotated as "5%", which would be 0.05 —
# confirm which one is intended.
e = 0.5


# e-greedy algorithm
def e_greedy(obs, q_values, e=e):
    """Choose an action for state `obs` with the ε-greedy rule.

    With probability `e` a uniformly random action (0 or 1) is returned;
    otherwise the action with the largest Q-value stored for `obs` is returned.

    Both random draws come from NumPy's global generator, so a single
    `np.random.seed(...)` call is enough to make the choice reproducible.

    Parameters
    ----------
    obs : hashable
        State key used to index `q_values`.
    q_values : mapping
        Maps a state to an array of per-action Q-values (must offer `.argmax()`).
    e : float, optional
        Exploration probability. The default is the value the module-level `e`
        had when this cell was executed; rebinding `e` later has no effect on
        the default until the cell is re-run.

    Returns
    -------
    int
        Index of the chosen action.
    """
    if np.random.random() < e:
        # Explore: pick uniformly between the two actions, ignoring the Q-table.
        return int(np.random.randint(2))
    # Exploit: greedy action; argmax resolves ties to the lowest index.
    return int(q_values[obs].argmax())
    
# Select the action for the opening state using the e-greedy policy.
action = e_greedy(obs, q_values)
user_hand = env.unwrapped.player  # the player's cards as held by the unwrapped env

# Pull out the values that get reported, then print them.
player_total, dealer_value = obs[0], obs[1]
action_label = action_mapper.get(action)

print('User Cumulative Sum', player_total, '| Dealer current sum', dealer_value)
print('User chose to...', action_label, '...very greedily..')

User Cumulative Sum 17 | Dealer current sum 3
User chose to... Stick ...very greedily..


In [10]:
# One environment step of an episode.
#
# env.step(action) returns a 5-tuple:
#   next_obs    : (user_sum, dealer_sum, usable_ace)
#   reward      : -1 or 0 or 1
#   termination : True or False | the episode is done (game over)
#   truncated   : True or False | the episode was cut short (e.g. time limit reached)
#   info        : dict | additional information about the environment
step_result = env.step(action)  # play whichever action e_greedy selected
next_obs, reward, termination, truncated, info = step_result

# Report the state after the step and the cards the player holds.
print('User Cumulative Sum', next_obs[0], '| Dealer current hand', next_obs[1])
print('User hand', env.unwrapped.player)

User Cumulative Sum 17 | Dealer current hand 3
User hand [7, 10]


### Update Rule in Q-Learning 

Q(S<sub>t</sub>, A<sub>t</sub>) ← Q(S<sub>t</sub>, A<sub>t</sub>) + α(R<sub>t+1</sub> + γ ⋅ max<sub>a′</sub> Q(S<sub>t+1</sub>, a′) − Q(S<sub>t</sub>, A<sub>t</sub>))


1. Future value: max<sub>a′</sub> Q(S<sub>t+1</sub>, a′), taken as 0 when S<sub>t+1</sub> is terminal

In [16]:
# γ: how much weight the estimated future value receives
discount_factor = 0.1

# Future value of the next state: best Q-value available there, or 0 once the
# game has ended (a finished game has nothing left to earn).
if not termination:
    print('Game is not over. Future Q value is used to update Q')
    future_q_value = np.max(q_values[next_obs])
else:
    print('Game is over')
    future_q_value = 0

# td difference = (reward + γ · future value) − current estimate Q(obs, action)
td_target = reward + discount_factor*future_q_value
td_difference = td_target - q_values[obs][action]
td_difference

Game is over


np.float64(1.0)

Update the Q-table: move Q(S<sub>t</sub>, A<sub>t</sub>) a step of size α along the TD difference.

In [17]:
# α: step size of a single update
learning_rate = 0.001

# Shift the stored estimate by α times the TD difference.
# Not idempotent: each re-run of this cell applies the update once more.
q_increment = learning_rate * td_difference
q_values[obs][action] = q_values[obs][action] + q_increment

In [23]:
# Display the Q-table; it only holds entries for states that have been looked up so far.
q_values

defaultdict(<function __main__.<lambda>()>,
            {(17, 3, 0): array([0.001, 0.   ])})