In [None]:
import numpy as np
from collections import defaultdict
import gymnasium as gym

class BlackjackQLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy over two actions.

    Q-values live in a ``defaultdict`` keyed by state; a state that has never
    been looked up starts with a zero vector of length 2.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.95, epsilon=1.0, epsilon_decay=0.99999, min_epsilon=0.01):
        """Store the hyper-parameters and create an empty Q-table.

        Args:
            learning_rate: Step size applied to each temporal-difference update.
            discount_factor: Weight given to the best next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor epsilon is multiplied by on every
                ``get_action`` call.
            min_epsilon: Lower bound epsilon is clamped to.
        """
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.q_values = defaultdict(lambda: np.zeros(2))

    def get_action(self, state):
        """Pick an action epsilon-greedily, then decay epsilon once.

        With probability ``epsilon`` a uniformly random action from {0, 1} is
        returned; otherwise the action with the largest Q-value for ``state``.
        """
        explore = np.random.random() < self.epsilon
        if explore:
            chosen = np.random.choice([0, 1])
        else:
            chosen = np.argmax(self.q_values[state])

        # Decay happens per call (i.e. per action), not per episode.
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.min_epsilon else self.min_epsilon
        return chosen

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the given transition.

        When ``done`` is true the target is just ``reward``; otherwise it is
        ``reward`` plus the discounted maximum Q-value of ``next_state``.
        """
        row = self.q_values[state]
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(self.q_values[next_state])
        td_error = target - row[action]
        row[action] += self.lr * td_error

class BlackjackEnv(gym.Env):
    """Blackjack played from a finite multi-deck shoe with a running card count.

    Cards are drawn without replacement from ``total_decks`` 52-card decks
    (face cards are stored as 10, aces as 1). Each drawn card updates a
    Hi-Lo style running count: aces and ten-valued cards subtract one,
    cards 2 through 6 add one, 7 through 9 leave it unchanged.

    Actions: ``1`` = hit, ``0`` = stand.

    Observation (see ``_get_obs``): ``(player_sum, dealer_first_card,
    usable_ace, running_count, remaining_decks)``.

    Deliberate deviations from the Gymnasium ``Env`` API, kept because the
    training/evaluation code unpacks these shapes:
      * ``reset`` returns only the observation (no ``info`` dict);
      * ``step`` returns ``(observation, reward, terminated, money)``.

    NOTE(review): both dealer cards are counted as soon as they are dealt,
    while the observation only exposes ``dealer[0]``; the running count thus
    already reflects the dealer's second card. Confirm this is intended.
    """

    def __init__(self, render_mode=None, natural=False, sab=False, total_decks=5):
        """Build the shoe and declare the action/observation spaces.

        Args:
            render_mode: Stored on the instance; not used by the code in this class.
            natural: If true (and ``sab`` is false), a winning natural pays 1.5.
            sab: If true, a player natural against a non-natural dealer hand
                is rewarded 1.0; takes precedence over ``natural``.
            total_decks: Number of 52-card decks in the shoe.

        Raises:
            ValueError: If ``total_decks`` is smaller than 1.
        """
        if total_decks < 1:
            raise ValueError(f"total_decks must be at least 1, got {total_decks}")

        self.action_space = gym.spaces.Discrete(2)

        # Each deck holds 20 cards that lower the count (A, 10, J, Q, K) and
        # 20 that raise it (2-6), so the count stays within +/- 20 per deck.
        max_count = 20 * total_decks
        # The space mirrors the 5-tuple produced by _get_obs.
        self.observation_space = gym.spaces.Tuple(
            (
                gym.spaces.Discrete(32),  # player hand total
                gym.spaces.Discrete(11),  # dealer's first card (1-10)
                gym.spaces.Discrete(2),  # usable ace flag
                gym.spaces.Discrete(2 * max_count + 1, start=-max_count),  # running count
                gym.spaces.Box(low=0.0, high=float(total_decks), shape=(), dtype=np.float32),  # decks left
            )
        )
        self.natural = natural
        self.sab = sab
        self.render_mode = render_mode
        self.running_count = 0
        self.betting_unit = 1
        self.money = 50
        self.current_bet = 1
        one_suit = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
        self.original_deck = one_suit * 4 * total_decks
        self.deck = self.original_deck.copy()

    def step(self, action):
        """Play one action for the current round.

        Hitting draws a card for the player; busting ends the round with
        reward -1. Standing lets the dealer draw until reaching at least 17
        and scores the round via ``cmp`` of the two hand scores, with the
        ``sab`` / ``natural`` adjustments applied on top.

        Returns:
            ``(observation, reward, terminated, money)`` where ``money`` is
            the bankroll after adding ``reward * current_bet``.
        """
        assert self.action_space.contains(action), f"invalid action: {action!r}"
        if action:
            self.player.append(self.draw_card(self.np_random))
            if self.is_bust(self.player):
                terminated = True
                reward = -1.0
            else:
                terminated = False
                reward = 0.0
        else:
            terminated = True
            while self.sum_hand(self.dealer) < 17:
                self.dealer.append(self.draw_card(self.np_random))
            reward = self.cmp(self.score(self.player), self.score(self.dealer))
            if self.sab and self.is_natural(self.player) and not self.is_natural(self.dealer):
                reward = 1.0
            elif not self.sab and self.natural and self.is_natural(self.player) and reward == 1.0:
                reward = 1.5

        self.money += reward * self.current_bet
        return self._get_obs(), reward, terminated, self.money

    def _get_obs(self):
        """Return (player sum, dealer's first card, usable ace, running count, decks left)."""
        return (self.sum_hand(self.player), self.dealer[0], self.usable_ace(self.player), self.running_count, self.getRemainingDecks())

    def reset(self, seed=None, options=None):
        """Restore the bankroll, a full shoe and a zero count, then deal a round.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for Gymnasium API compatibility and forwarded
                to ``gym.Env.reset``; not otherwise used here.

        Returns:
            The observation of the freshly dealt round (observation only).
        """
        super().reset(seed=seed, options=options)
        self.money = 50
        self.deck = self.original_deck.copy()
        self.running_count = 0
        return self.new_round()

    def new_round(self):
        """Deal two cards to the dealer, then two to the player, from the current shoe."""
        self.dealer = self.draw_hand(self.np_random)
        self.player = self.draw_hand(self.np_random)
        return self._get_obs()

    def draw_card(self, np_random):
        """Remove a uniformly random card from the shoe and update the running count.

        Raises:
            ValueError: If the shoe is empty.
        """
        if not self.deck:
            raise ValueError("cannot draw from an empty shoe; call reset() to reshuffle")
        card_index = np_random.choice(len(self.deck))
        card = self.deck.pop(card_index)
        if card in (1, 10):
            self.running_count -= 1
        elif 2 <= card <= 6:
            self.running_count += 1
        return card

    def draw_hand(self, np_random):
        """Draw a two-card hand from the shoe."""
        return [self.draw_card(np_random), self.draw_card(np_random)]

    def getRemainingDecks(self):
        """Return the number of decks left, rounded to the nearest half deck.

        Uses Python's ``round``, so exact ties round to the even integer
        before halving.
        """
        return round(len(self.deck) / 52 * 2) / 2

    @staticmethod
    def cmp(a, b):
        """Return 1.0 if ``a > b``, -1.0 if ``a < b``, else 0.0."""
        return float(a > b) - float(a < b)

    @staticmethod
    def usable_ace(hand):
        """Return whether the hand holds an ace that can count as 11 without busting."""
        return 1 in hand and sum(hand) + 10 <= 21

    @staticmethod
    def sum_hand(hand):
        """Return the hand total, counting one ace as 11 when that does not bust."""
        if BlackjackEnv.usable_ace(hand):
            return sum(hand) + 10
        return sum(hand)

    @staticmethod
    def is_bust(hand):
        """Return whether the hand total exceeds 21."""
        return BlackjackEnv.sum_hand(hand) > 21

    @staticmethod
    def score(hand):
        """Return the hand total, or 0 for a busted hand."""
        return 0 if BlackjackEnv.is_bust(hand) else BlackjackEnv.sum_hand(hand)

    @staticmethod
    def is_natural(hand):
        """Return whether the hand is exactly an ace plus a ten-valued card."""
        return sorted(hand) == [1, 10]

# def train_agent(env, agent, num_episodes):
#     for episode in range(num_episodes):
#         if env.getRemainingDecks() < 1 or env.money < 1:
#             state = env.reset()
#         else:
#             state = env.new_round()
#         done = False
#         while not done:
#             action = agent.get_action(state)
#             next_state, reward, done, _ = env.step(action)
#             agent.update(state, action, reward, next_state, done)
#             state = next_state
#         if episode % 10000 == 0:
#             print(f"Episode {episode} completed")

def train_agent(env, agent, num_episodes):
    """Train ``agent`` on ``env`` for ``num_episodes`` episodes.

    An episode starts with ``env.reset()`` and consists of consecutive rounds
    played until fewer than one deck remains or the bankroll drops below 1.
    The hand dealt by ``reset()`` is played as the first round of the episode
    instead of being thrown away and re-dealt.

    Args:
        env: Environment exposing ``reset()``, ``new_round()``, ``step(action)``
            returning a 4-tuple whose first three items are
            ``(next_state, reward, done)``, ``getRemainingDecks()`` and ``money``.
        agent: Object exposing ``get_action(state)`` and
            ``update(state, action, reward, next_state, done)``.
        num_episodes: Number of episodes to run.
    """
    for episode in range(num_episodes):
        state = env.reset()  # reset() already deals the first hand
        first_round = True
        while env.getRemainingDecks() >= 1 and env.money >= 1:
            if not first_round:
                state = env.new_round()
            first_round = False

            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
        if episode % 10000 == 0:
            print(f"Episode {episode} completed")

def evaluate_agent(env, agent, num_episodes=10000):
    """Return the agent's average per-hand reward under a greedy policy.

    Plays ``num_episodes`` hands, always taking the action with the highest
    Q-value. The hand dealt by the initial ``env.reset()`` is the first hand
    evaluated; afterwards the environment is reset whenever fewer than one
    deck remains or the bankroll drops below 1, and a new round is dealt
    otherwise.

    Args:
        env: Environment exposing ``reset()``, ``new_round()``, ``step(action)``
            returning a 4-tuple whose first three items are
            ``(state, reward, done)``, ``getRemainingDecks()`` and ``money``.
        agent: Object with a ``q_values`` mapping from state to action values.
        num_episodes: Number of hands to play; must be positive.

    Returns:
        Sum of all step rewards divided by ``num_episodes``.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError("num_episodes must be a positive integer")

    total_reward = 0
    state = env.reset()  # reset() already deals the first hand
    for hand_index in range(num_episodes):
        if hand_index > 0:
            if env.getRemainingDecks() < 1 or env.money < 1:
                state = env.reset()
            else:
                state = env.new_round()
        done = False
        while not done:
            action = np.argmax(agent.q_values[state])
            state, reward, done, _ = env.step(action)
            total_reward += reward
    return total_reward / num_episodes


In [9]:
# Notebook driver cell: build the environment and agent, train, evaluate the
# greedy policy, then play a short random-action demo.

# Create environment and agent (both with their default constructor arguments)
env = BlackjackEnv()
agent = BlackjackQLearningAgent()

# Train the agent; progress is printed every 10,000 episodes
train_agent(env, agent, num_episodes=100000)

# Evaluate the agent with evaluate_agent's default of 10,000 hands
env.reset()
average_reward = evaluate_agent(env, agent)
print(f"Average reward over 10,000 episodes: {average_reward}")

# Earlier draft of the demo loop below, kept for reference:
# money = env.money
# print("intital money = ", money)
# for i in range(10):
#     done = False
#     observation = env.new_round()  # starts a new round
#     while not done:
#         action = env.action_space.sample()
#         observation, reward, done, money = env.step(action)
#         print("Iteration-",i, ":",observation, reward, done, money, "action taken = ", action)

# Demo: play 10 rounds with actions sampled from the action space (the trained
# agent is not consulted here). The environment is not reset first, so the
# bankroll and shoe continue from wherever evaluation left them.
money = env.money
print("intital money = ", money)
for i in range(10):
    done = False
    observation = env.new_round()
    # Alternative: start every round from a fresh shoe and bankroll.
    # observation = env.reset()
    while not done:
        action = env.action_space.sample()
        # step() returns the updated bankroll as its fourth element.
        observation, reward, done, money = env.step(action)
        print("Iteration-", i, ":",observation, reward, done, money, "action taken = ", action)  

Episode 0 completed
Episode 10000 completed
Episode 20000 completed
Episode 30000 completed
Episode 40000 completed
Episode 50000 completed
Episode 60000 completed
Episode 70000 completed
Episode 80000 completed
Episode 90000 completed
Average reward over 10,000 episodes: -0.1337
intital money =  47.0
Iteration- 0 : (18, 8, False, -10, 3.0) 0.0 False 47.0 action taken =  1
Iteration- 0 : (18, 8, False, -10, 3.0) -1.0 True 46.0 action taken =  0
Iteration- 1 : (25, 6, False, -9, 3.0) -1.0 True 45.0 action taken =  1
Iteration- 2 : (17, 5, False, -5, 2.5) 0.0 False 45.0 action taken =  1
Iteration- 2 : (20, 5, False, -4, 2.5) 0.0 False 45.0 action taken =  1
Iteration- 2 : (20, 5, False, -1, 2.5) 1.0 True 46.0 action taken =  0
Iteration- 3 : (21, 2, True, 1, 2.5) 0.0 True 46.0 action taken =  0
Iteration- 4 : (22, 9, False, 1, 2.5) -1.0 True 45.0 action taken =  1
Iteration- 5 : (20, 8, False, 3, 2.5) 0.0 False 45.0 action taken =  1
Iteration- 5 : (20, 8, False, 2, 2.5) 0.0 True 45.0 a