In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Hello, I've created this dataset and notebook for myself and others to gain insights into the game of Blackjack.

This notebook assumes basic knowledge of Blackjack and will attempt to look at some common theories/beliefs held by the general public.

# Read Dataset

For this notebook, we will limit ourselves to a subset of the data (10 million hands)

In [None]:
# Load a 10-million-row subset of the simulator output instead of the whole file.
csv_path = '../input/blackjack-hands/blackjack_simulator.csv'
data = pd.read_csv(csv_path, nrows=10_000_000)

# Preview the first rows (rendered as the cell's output).
data.head()

# House Edge

Let's determine how much we can expect to win (or lose since "the house always wins") on average for every $1 we bet

In [None]:
# Overall expected value: the mean of the `win` column across all loaded rows.
# NOTE(review): presumably `win` is the net result per hand in bet units — confirm against the dataset docs.
ev = data['win'].mean()

print(f'Expected Value: {ev:%}')

I guess the house always wins, right?

Maybe.  
Maybe not.

# Card Counting

Card counting is an attempt to predict how future hands will occur based on the previous cards that have already been played and thus removed from play.

As cards are played, players add or subtract a number to a running total.

The most common system (Hi-Lo) uses the following values to add/subtract:

| Card | Value |
| --- | --- |
| 2 | +1 |
| 3 | +1 |
| 4 | +1 |
| 5 | +1 |
| 6 | +1 |
| 7 | 0 |
| 8 | 0 |
| 9 | 0 |
| 10 | -1 |
| J | -1 |
| Q | -1 |
| K | -1 |
| A | -1 |

The running total is then divided by the number of decks remaining in the shoe to obtain the True Count (TC).

This means that when the TC is higher, it is more likely that Aces and ten-value cards (10's, J's, Q's, K's) will be coming out next.

Let's inspect the data a bit with regards to TC:

In [None]:
# Number of rows observed at each true count, ordered by true count.
num_of_true_counts = data['true_count'].value_counts().sort_index()

# Line chart of how often each true count occurs.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Number of rounds by True Count')
ax.set_xlabel('True Count (TC)')
ax.set_ylabel('Rounds')
ax.plot(num_of_true_counts)
plt.show()

## Trim Data

As we can see, the vast majority of our data is centered around a TC of 0, and very high or very low counts are rarely reached. We will trim the data to exclude outliers and reduce any potential noise.

In [None]:
# Keep only rows whose true count lies within [-10, 10] (both ends inclusive).
tc_in_range = data['true_count'].between(-10, 10)
data = data.loc[tc_in_range]

## Does Card Counting work?

Let's have a quick look at how True Count affects our winnings (or losings):

In [None]:
# Mean `win` at each true count, kept as a one-column DataFrame indexed by
# true_count because later cells read `ev_by_count.win` and `ev_by_count.index`.
ev_by_count = data.groupby('true_count')[['win']].mean()

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Expected Value (EV) by True Count')
ax.set_xlabel('True Count (TC)')
ax.set_ylabel('EV (%)')
ax.grid(axis='both', linestyle=':')
ax.set_xticks(range(-10, 11))
# Dashed red reference line at break-even (EV = 0).
ax.axhline(y=0, color='r', linestyle='--')
ax.plot(ev_by_count)
# Render y values as percentages, treating 1.0 as 100%.
ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

As we can see, higher counts are better for us.

Notice how the point where Blackjack becomes a completely fair game and we break even is almost exactly at a TC of +1.

It would seem we should bet more when the TC is above +1, and less when it's below.

## Why is a higher True Count better for the Player?

Let's test some theories:

### Theory 1: The Dealer busts more often

This is a common theory amongst many Blackjack players. The reasoning is that when there are more face cards, the dealer has a higher probability of going over when they need to draw extra cards to make a hand.

Let's have a look at how often the dealer busts at different counts.

In [None]:
# Dealer bust rate at each true count.
# Non-numeric dealer totals (such as the 'BJ' marker compared against elsewhere
# in this notebook) are coerced to NaN; NaN > 21 is False, so those hands are
# counted as "no bust".
dealer_final_numeric = pd.to_numeric(data.dealer_final_value, errors='coerce')
busts = pd.concat([data.true_count, dealer_final_numeric > 21], keys=['true_count', 'bust'], axis=1)
# The mean of a boolean column is the share of True rows. Unlike dividing the
# count of bust rows by the count of all rows, this gives 0 (not NaN) for a
# true count with no busts at all, so the plotted line never has a gap.
pct_bust = busts.groupby('true_count')['bust'].mean().reset_index()

plt.figure(figsize=(20, 10))
plt.plot(pct_bust.true_count, pct_bust.bust)
plt.title('Dealer bust rate by True Count')
plt.xlabel('True Count (TC)')
plt.ylabel('Bust Rate (%)')
plt.grid(axis='both', linestyle=':')
# Zoom the y axis to the 25%-30% band.
plt.ylim(ymin=0.25, ymax=0.3)
plt.xticks(range(-10, 11, 1))
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

**MYTH** <span style="color:red">**BUSTED**</span>

The dealer actually busts LESS on higher counts.

### Theory 2: We get more Blackjacks

More face cards means more Blackjacks right?

In [None]:
# Player blackjack rate at each true count.
# A hand is flagged as a blackjack when `player_final_value` equals the exact
# string "['BJ']".
player_bjs = pd.concat([data.true_count, data.player_final_value == "['BJ']"], keys=['true_count', 'player_bj'], axis=1)
# The mean of a boolean column is the share of True rows. Unlike dividing the
# count of blackjack rows by the count of all rows, this gives 0 (not NaN) for
# a true count with no blackjacks, so no NaN can leak into the later cell that
# reuses `player_bj_rate`.
player_bj_rate = player_bjs.groupby('true_count')['player_bj'].mean().reset_index()

plt.figure(figsize=(20, 10))
plt.plot(player_bj_rate.true_count, player_bj_rate.player_bj)
plt.title('Blackjack Rate by True Count')
plt.xlabel('True Count (TC)')
plt.ylabel('Blackjack Rate (%)')
plt.grid(axis='both', linestyle=':')
plt.xticks(range(-10, 11, 1))
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

**MYTH** <span style="color:green">**CONFIRMED**</span>

As a baseline let's see how often we get Blackjacks overall:

In [None]:
# Overall player blackjack rate: rows flagged "['BJ']" divided by all rows.
is_player_bj = data['player_final_value'] == "['BJ']"
bj_rate_all = int(is_player_bj.sum()) / len(data)
print(f'Blackjack Rate: {bj_rate_all:%}')

### But does it really matter?

Ok, so we get more Blackjacks. But they rarely happen anyways. They can't be that important right?

Let's see how much they account for our extra winnings from higher counts.

In [None]:
# Split the extra EV seen at each true count into the part explained by the
# change in blackjack frequency and the remainder.
# Index the blackjack rate by true count so every operation below aligns by
# label, instead of overwriting the index positionally and relying on both
# tables having the same length and row order.
player_bj_rate_increase = player_bj_rate.set_index('true_count')['player_bj'] - bj_rate_all
player_bj_ev_increase = player_bj_rate_increase * 1.5  # Blackjacks are paid 3:2 (equivalent to 1.5x bet)
# NOTE(review): `ev` is computed in an earlier cell, before the true-count trim,
# while `bj_rate_all` is computed after it — confirm the two baselines are meant to differ.
ev_increase = ev_by_count.win - ev
# Building the frame from both Series aligns them on their true-count index.
breakdown = pd.DataFrame({'blackjack': player_bj_ev_increase, 'total': ev_increase})
breakdown['other'] = breakdown['total'] - breakdown['blackjack']

plt.figure(figsize=(20, 10))
plt.bar(breakdown.index, breakdown['blackjack'], color='b')
# Stack the remainder on top of the blackjack share so each stack ends at the total.
plt.bar(breakdown.index, breakdown['other'], color='r', bottom=breakdown['blackjack'])
plt.title('Breakdown of Extra EV gained from Higher True Count')
plt.xlabel('True Count')
plt.ylabel('EV Increase (%)')
plt.grid(axis='both', linestyle=':')
plt.xticks(range(-10, 11, 1))
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.legend(['Extra EV gained from Higher Blackjack Rate', 'Total Extra EV gained'])
plt.show()

Whoa, the majority of our extra EV comes from the extra Blackjacks we get!

# "2 is a Dealer's Ace"

Ah, the stories of the endless players who have been burned by a dealer showing a 2. Many of them are traumatized and now fear the 2 more than any other card.

Some swear that the dealer will inevitably get a 21 far more often with a 2.

In [None]:
# Share of hands in which the dealer finishes on exactly 21, per dealer up card.
# Non-numeric dealer totals (such as the 'BJ' marker compared against elsewhere
# in this notebook) are coerced to NaN and never equal 21, so they are not
# counted as 21s here.
dealer_final_numeric = pd.to_numeric(data.dealer_final_value, errors='coerce')
twenty_ones = pd.concat([data.dealer_up, dealer_final_numeric == 21], keys=['dealer_up', 'twenty_one'], axis=1)
# The mean of a boolean column is the share of True rows; it gives 0 rather
# than NaN for an up card that never ends on 21.
twenty_one_rate = twenty_ones.groupby('dealer_up')['twenty_one'].mean().reset_index()

plt.figure(figsize=(20, 10))
plt.bar(twenty_one_rate.dealer_up, twenty_one_rate.twenty_one)
# This chart shows the 21 rate; the title and y label previously said "Bust Rate".
plt.title('Dealer 21 Rate by Dealer Up Card')
plt.xlabel('Up Card')
plt.ylabel('21 Rate (%)')
plt.grid(axis='y', linestyle=':')
plt.xticks(range(2, 12, 1))
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

Looks like there's some truth to this one! (although only marginally more than cards 3-6)

One important thing to note is that Blackjacks are excluded from this. Since I'm attempting to measure the times when a dealer "magically" pulls out a 21 when the player is hoping for a bust, I'm only counting 21's in which the dealer draws extra cards after the player plays their hand. In the event of a dealer Blackjack, the hand ends immediately: the player doesn't get a chance to actually play their hand, and the dealer has no potential for busting by drawing cards.

# The Dealer has a Bust card. Don't take any cards, let them bust!

Have you heard this one before? Bust cards are considered cards 2-6. Some players say the dealer is always supposed to bust on these and get upset if the dealer doesn't. Let's see how it plays out.

In [None]:
# Dealer bust rate for each dealer up card.
# Non-numeric dealer totals are coerced to NaN; NaN > 21 is False, so those
# hands are counted as "no bust".
dealer_final_numeric = pd.to_numeric(data.dealer_final_value, errors='coerce')
busts = pd.concat([data.dealer_up, dealer_final_numeric > 21], keys=['dealer_up', 'bust'], axis=1)
# The mean of a boolean column is the share of True rows; it gives 0 rather
# than NaN for an up card with no busts.
bust_rate = busts.groupby('dealer_up')['bust'].mean().reset_index()

plt.figure(figsize=(20, 10))
plt.bar(bust_rate.dealer_up, bust_rate.bust)
plt.title('Dealer Bust Rate by Dealer Up Card')
plt.xlabel('Up Card')
plt.ylabel('Bust Rate (%)')
plt.grid(axis='y', linestyle=':')
plt.xticks(range(2, 12, 1))
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

The dealer certainly busts more often with cards 2-6. However, even with the worst card, the dealer still busts less than 50% of the time. Players shouldn't expect the dealer to bust.

# Ok, so how good/bad actually are these cards for the dealer?

Should you be happy or sad when the dealer flips that card?

In [None]:
# Mean `win` for each dealer up card, as a flat table with `dealer_up` and `win` columns.
ev_by_dealer_up = data.groupby('dealer_up', as_index=False)['win'].mean()

fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(ev_by_dealer_up['dealer_up'], ev_by_dealer_up['win'])
ax.set_title('Expected Value (EV) by Dealer Up Card')
ax.set_xlabel('Up Card')
ax.set_ylabel('Expected Value (EV)')
ax.grid(axis='y', linestyle=':')
ax.set_xticks(range(2, 12))
# Solid red reference line at break-even (EV = 0).
ax.axhline(y=0, color='r', linestyle='-')
# Render y values as percentages, treating 1.0 as 100%.
ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

We should hope for the dealer to get something between 2-8.

9's, 10's, and A's are bad news.

# Insurance is a Sucker's Bet! (or is it...)

Insurance is a side bet which is offered if the dealer is showing an Ace. It is completely independent of the player's hand. All that matters is whether the dealer has a Blackjack or not.

It's often considered a sucker's bet by many.

Since it pays 2:1, in order for this to be a good bet, we need to be correct at least 1/3 (33.3%) of the time.

I've marked a red line to indicate that break-even point.

In [None]:
# Hands where the dealer's up card is 11 (the Ace, per the surrounding text),
# reduced to the two columns needed here.
dealer_up_ace = data.loc[data.dealer_up == 11, ['true_count', 'dealer_final_value']]
# Dealer blackjack rate at each true count. The mean of the boolean flag is
# the share of 'BJ' rows and gives 0 rather than NaN for a true count where
# the dealer never had a blackjack, so sparse extreme counts do not leave
# gaps in the line. Kept as a one-column DataFrame indexed by true_count.
dealer_has_bj = dealer_up_ace['dealer_final_value'] == 'BJ'
bj_rate = dealer_has_bj.groupby(dealer_up_ace['true_count']).mean().to_frame()


plt.figure(figsize=(20, 10))
plt.plot(bj_rate)
plt.title('Blackjack Rate when Dealer has an Ace as Up Card')
plt.xlabel('True Count')
plt.ylabel('Blackjack Rate (%)')
plt.grid(axis='both', linestyle=':')
plt.xticks(range(-10, 11, 1))
# Break-even for a 2:1 payout is exactly 1/3, so draw the line there rather
# than at the rounded 0.333.
plt.axhline(y=1 / 3, color='r', linestyle='--')
# Render y values as percentages, treating 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

Ah-ha! So if the true count is above +3, then insurance really isn't a sucker's bet at all.

# Conclusion

It would seem that counting cards is actually an effective strategy.

However, not all conventional wisdom seems to be accurate.

*Moral of the story:*  
> **Don't always believe everything people tell you. Trust the data instead.**