In [30]:
import gymnasium as gym

# Create the environment
env = gym.make("Blackjack-v1", sab=True)

# utils
action_mapper = {0: "Stick", 1: "Hit"}

### Initialize the game 

In [31]:
from collections import defaultdict
import numpy as np 
import random

# initializing a dictionary to hold the Q-values for each state-action pair
q_values = defaultdict(lambda: np.zeros(2))

# one forward pass 
obs, info = env.reset() # initialize the environment
done = False

In [32]:
q_values

defaultdict(<function __main__.<lambda>()>, {})

### E-greedy algorithm
With probability ε: choose a random action a ∈ A  
With probability 1 − ε: choose a = argmax<sub>a</sub> Q(s, a)

In [33]:
# how greedy 5% 
e = 0.5

# e-greedy algorithm 
def e_greedy(obs, q_values, e=e):
    if np.random.random() < e:
        return random.choice([0,1])
    else:
        return int(q_values[obs].argmax())
    
action = e_greedy(obs, q_values)
user_hand = env.unwrapped.player

print('User Cumulative Sum', obs[0], '| Dealer current sum', obs[1])
print('User chose to...', action_mapper.get(action), '...very greedily..')

User Cumulative Sum 17 | Dealer current sum 10
User chose to... Stick ...very greedily..


In [34]:
# 1 episode

"""
next_obs: (user_sum, dealer_sum, usable_ace)
reward: -1 or 0 or 1
termination: True or False | if the episode is done like game over
truncated: True or False | if the episode is truncated like time limit reached
info: dict | additional information about the environment 
"""
next_obs, reward, termination, truncated, info = env.step(action) # take action 0 (stick)

print('User Cumulative Sum', next_obs[0], '| Dealer current hand', next_obs[1])
print('User hand', env.unwrapped.player)

User Cumulative Sum 17 | Dealer current hand 10
User hand [7, 10]


### Update Rule in Q-Learning 

Q(S<sub>t</sub>, A<sub>t</sub>) ← Q(S<sub>t</sub>, A<sub>t</sub>) + α(R<sub>t+1</sub> + γ ⋅ Q(S<sub>t+1</sub>, A′) − Q(S<sub>t</sub>, A<sub>t</sub>))


1. Q(S<sub>t+1</sub>, A′)

In [35]:
if termination:
    print('Game is over')
    future_q_value = 0

else:
    print('Game is not over. Future Q value is used to update Q')
    future_q_value = np.max(q_values[next_obs])

discount_factor = 0.1

# td difference 
td_difference = reward + discount_factor*future_q_value - q_values[obs][action]
td_difference

Game is over


-1.0

update the q table 

In [36]:
learning_rate = 0.001
q_values[obs][action] += learning_rate * td_difference

-0.001

In [66]:
obs

(17, 10, 0)

In [1]:
import json 
import numpy as np
from collections import defaultdict

# initializing a dictionary to hold the Q-values for each state-action pair
obs = (17, 10, 0)
q_values = defaultdict(lambda: np.zeros(2))

q_values[obs][0] = -0.001

# Convert q_values to a serializable form
serializable_q_values = {
    str(key): value.tolist() for key, value in q_values.items()
}

# Save to JSON
with open('./policy.json', 'w') as f:
    json.dump(serializable_q_values, f, indent=2)


with open('./policy.json', 'r') as f:
    policy = json.load(f)

print('Current state -> ', obs)
print('Ideal Policy Raw -> ', policy.get(str(obs)))
print('Ideal Policy Action -> ', np.argmax(policy.get(str(obs))))

Current state ->  (17, 10, 0)
Ideal Policy Raw ->  [-0.001, 0.0]
Ideal Policy Action ->  1


In [25]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Load the Excel file
df = pd.read_excel("../policies/PPO_action_grid_v0_2025_0330.xlsx", index_col=[0, 1])
df.columns = [int(c) if not pd.isna(c) else None for c in df.columns]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
my hand,2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
dealer show card,,,,,,,,,,,,,,,,,,,,
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [27]:
from collections import defaultdict
import numpy as np

q_values = defaultdict(lambda: {})

for dealer_card in df.index:
    for player_sum in df.columns:
        value = df.at[dealer_card, player_sum]
        if pd.notna(value):
            obs = (player_sum, dealer_card, 0)  # assuming no usable ace
            q_values[obs] = np.array([0., 0.])


In [28]:
q_values

defaultdict(<function __main__.<lambda>()>,
            {(3, (2, 1.0), 0): array([0., 0.]),
             (4, (2, 1.0), 0): array([0., 0.]),
             (5, (2, 1.0), 0): array([0., 0.]),
             (6, (2, 1.0), 0): array([0., 0.]),
             (7, (2, 1.0), 0): array([0., 0.]),
             (8, (2, 1.0), 0): array([0., 0.]),
             (9, (2, 1.0), 0): array([0., 0.]),
             (10, (2, 1.0), 0): array([0., 0.]),
             (11, (2, 1.0), 0): array([0., 0.]),
             (12, (2, 1.0), 0): array([0., 0.]),
             (13, (2, 1.0), 0): array([0., 0.]),
             (14, (2, 1.0), 0): array([0., 0.]),
             (15, (2, 1.0), 0): array([0., 0.]),
             (16, (2, 1.0), 0): array([0., 0.]),
             (17, (2, 1.0), 0): array([0., 0.]),
             (18, (2, 1.0), 0): array([0., 0.]),
             (19, (2, 1.0), 0): array([0., 0.]),
             (20, (2, 1.0), 0): array([0., 0.]),
             (21, (2, 1.0), 0): array([0., 0.]),
             (3, (3, 1.0), 0): a