<a href="https://colab.research.google.com/github/shubham-pyc/RL/blob/main/Blackjack_Q_Table_Backup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gym

# Gym Environment

In [None]:
# Build the Blackjack environment. Environment source, for reference:
# https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py
#   natural=True      -> pay out 1.5x on a "natural" blackjack win
#   new_step_api=True -> avoids warnings and allows future compatibility
environment = gym.make("Blackjack-v1", natural=True, new_step_api=True)

# Agent

In [None]:
class Agent:
    """Blackjack agent backed by a table of running-average action values.

    ``quality_table[state]`` holds ``[quality of stand, quality of hit]`` and
    ``states_seen`` counts how many times each (state, action) entry has been
    updated. Both arrays have shape ``(32, 11, 2, 2)``.
    """

    def __init__(self):
        table_shape = (32, 11, 2, 2)
        self.quality_table = np.zeros(shape=table_shape)
        self.states_seen = np.zeros(shape=table_shape)

    def act(self, observation):
        """Return an action (0 or 1) drawn uniformly at random.

        The observation is converted to an index but not consulted; the
        greedy alternative would be
        ``np.argmax(self.quality_table[observation])``.
        """
        observation = self._clean_state(observation)
        return np.random.randint(0, 2)

    def update(self, state, action, reward):
        """Fold ``reward`` into the running average stored for (state, action)."""
        index = self._clean_state(state) + (action,)

        # Count the visit first so the very first reward is divided by 1.
        self.states_seen[index] += 1
        visits = self.states_seen[index]

        # Incremental mean: shift the estimate by 1/visits of the error.
        error = reward - self.quality_table[index]
        self.quality_table[index] += error / visits

    def _clean_state(self, observation):
        """Convert an observation into a tuple usable as an array index.

        Example: (10, 5, True) -> np.array([10, 5, 1]) -> (10, 5, 1)
        """
        return tuple(np.asarray(observation))

# Play one simulation with our agent

In [None]:
agent = Agent()
done = False

state = environment.reset()
states, actions, rewards = [], [], []

# Play one hand to completion, recording every (state, action, reward) step.
while not done:
    action = agent.act(state)
    next_state, reward, done, _, _ = environment.step(action)

    states.append(state)
    actions.append(action)
    rewards.append(reward)

    state = next_state

# Replay the recorded steps into the agent. The loop variables are kept
# distinct from `state` and `reward` so that the final hand and final score
# printed below are not overwritten by the replay.
for (seen_state, taken_action, step_reward) in zip(states, actions, rewards):
    agent.update(seen_state, taken_action, step_reward)

print(f"Our hand sum: {state[0]}")
print(f"Our score: {reward}")

Our hand sum: 21
Our score: 1.0


# Play multiple simulations with our agent

In [None]:
# Start with variables that should persist between simulations
agent = Agent()
total_wins = 0
total_losses = 0
total_draws = 0  # games that end with a reward of exactly 0

total_reward = 0
total_rewards = []  # running bankroll, one entry per simulation

# Loop for N simulations
for iteration in range(1_000_000):

    state = environment.reset()

    done = False
    states, actions, rewards = [], [], []

    # Play one hand to completion, recording every step.
    while not done:
        action = agent.act(state)
        next_state, reward, done, _, _ = environment.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    # Replay the recorded steps into the agent. Distinct loop variables keep
    # `reward` bound to the final reward of the hand for the metrics below.
    # NOTE(review): each step is credited with its own immediate reward, not
    # the final outcome of the hand -- confirm this is the intended target.
    for (seen_state, taken_action, step_reward) in zip(states, actions, rewards):
        agent.update(seen_state, taken_action, step_reward)

    # Metric logging: classify by the sign of the final reward so a draw
    # (reward 0) is no longer counted as a win.
    if reward > 0:
        total_wins += 1
    elif reward < 0:
        total_losses += 1
    else:
        total_draws += 1

    total_reward += reward
    total_rewards.append(total_reward)

# Metric printing and plotting

In [None]:
print(f"Total Wins: {total_wins}")
print(f"Total Losses: {total_losses}")

# Each simulation appends exactly one entry to total_rewards, so its length is
# the number of games played. Dividing by it (rather than wins + losses) keeps
# the rate correct even when some games are neither a win nor a loss.
games_played = len(total_rewards)
print(f"Win Rate: {100 * total_wins / games_played:.3f}%")

In [None]:
# With only y-values given, matplotlib uses 0 .. len(total_rewards) - 1 as the
# x-axis, so the plot always matches the number of simulations actually run
# instead of a separately hard-coded count.
plt.plot(total_rewards)

plt.title("Total Reward as we play")
plt.xlabel("Iteration")
plt.ylabel("Money")
plt.show()

In [None]:
# One heatmap per usable-ace flag, showing for each (our hand, dealer's hand)
# cell which action currently has the higher stored quality.
hand_sums = np.arange(12, 22)
dealer_cards = np.arange(1, 11)

for ace_index, title in (
    (0, "No usable ace | 0 - stand, 1 - hit"),
    (1, "Usable ace | 0 - stand, 1 - hit"),
):
    # Slice to hands 12..21 and dealer cards 1..10, then pick the best action.
    best_actions = np.argmax(agent.quality_table[12:22, 1:, ace_index], axis=2)

    sns.heatmap(
        data=best_actions,
        annot=True,
        cbar=False,
        xticklabels=dealer_cards,
        yticklabels=hand_sums,
    )
    plt.title(title)
    plt.xlabel("Dealer's Hand")
    plt.ylabel("Our Hand")
    plt.show()