```
         Copyright Rein Halbersma 2020-2021.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Dynamic programming for the game of Blackjack

In [1]:
import gym
import numpy as np
import pandas as pd

import gym_blackjack_v1 as bj
from doctrina.algorithms import dp
from doctrina.spaces import state_table

# Build the Blackjack environment; a winning blackjack pays 1.5 times the bet.
# NOTE(review): 'Blackjack-v1' is presumably registered by importing
# gym_blackjack_v1 above -- confirm.
env = gym.make(
    'Blackjack-v1',
    winning_blackjack_payoff=1.5,
)

## The payoff

In [2]:
# Payoff for every pairing of final outcomes, labelled on both axes with the
# terminal labels.
payoff_table = pd.DataFrame(
    data=env.payoff,
    index=bj.terminal_labels,    # rows: player count
    columns=bj.terminal_labels,  # columns: dealer count
)

# One decimal per cell; negative payoffs in red, everything else in black.
payoff_table.style.format('{:.1f}').applymap(lambda x: f'color: {"red" if x < 0 else "black"}')

Unnamed: 0,BUST,<17,17,18,19,20,21,BJ
BUST,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
<17,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
17,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
18,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0
19,1.0,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0
20,1.0,1.0,1.0,1.0,1.0,0.0,-1.0,-1.0
21,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.0
BJ,1.5,1.5,1.5,1.5,1.5,1.5,1.5,0.0


## Value iteration

In [3]:
# Benchmark a complete run of Q-value iteration on the environment.
# The result is discarded here; the next cell calls the solver again to keep it.
%timeit dp.Q_value_iter(env)

1.7 ms ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
# The solver returns the policy and the action-value table first; any further
# return values are collected in `_` and not used.
policy_all, Q_all, *_ = dp.Q_value_iter(env)

# The final entry belongs to the terminal state, which is no longer needed
# from here on (see S&B, p. 54), so it is dropped from both arrays.
policy = policy_all[:-1]
Q = Q_all[:-1]

## The optimal policy

In [5]:
# Arrange the policy as a grid with one row per hand label and one column per
# card label.
policy_grid = pd.DataFrame(
    data=state_table(policy, env),
    index=bj.hand_labels,
    columns=bj.card_labels,
)

# Swap each action index for its label.
action_grid = policy_grid.applymap(lambda a: bj.action_labels[a])

# Blank out the 'S' cells so that only the remaining actions stand out.
action_grid.replace({'S': ' '})

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
H4,H,H,H,H,H,H,H,H,H,H
H5,H,H,H,H,H,H,H,H,H,H
H6,H,H,H,H,H,H,H,H,H,H
H7,H,H,H,H,H,H,H,H,H,H
H8,H,H,H,H,H,H,H,H,H,H
H9,H,H,H,H,H,H,H,H,H,H
H10,H,H,H,H,H,H,H,H,H,H
H11,H,H,H,H,H,H,H,H,H,H
H12,H,H,,,,H,H,H,H,H
H13,,,,,,H,H,H,H,H


## The advantage of hitting over standing

In [6]:
# Sanity check: the policy returned by the solver is greedy with respect to Q.
assert (policy == Q.argmax(axis=1)).all(), 'policy is not greedy with respect to Q'

# Advantage of hitting over standing in every state; positive favours hitting.
delta = Q[:, bj.Action.HIT] - Q[:, bj.Action.STAND]

# Decisions whose two action values differ by less than this margin are
# highlighted as close calls.
CLOSE_CALL_MARGIN = .01

(pd
    .DataFrame(
        state_table(delta, env),
        index=bj.hand_labels,
        columns=bj.card_labels
    )
    .style.format('{:.2%}')
    # Negative advantages (standing is better) in red, the rest in black.
    .applymap(lambda x: f'color: {"red" if x < 0 else "black"}')
    # Close calls are bold and underlined. A single pass evaluates the
    # predicate once and returns '' (no styling) for all other cells instead
    # of emitting CSS declarations with empty values.
    .applymap(
        lambda x: 'font-weight: bold; text-decoration: underline'
        if abs(x) < CLOSE_CALL_MARGIN else ''
    )
)

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
H4,17.79%,16.96%,16.17%,15.48%,16.48%,38.71%,35.12%,30.25%,23.19%,28.65%
H5,16.46%,15.69%,14.96%,14.32%,15.25%,35.59%,32.24%,27.65%,20.96%,26.89%
H6,15.20%,14.50%,13.81%,13.23%,14.07%,32.34%,29.33%,25.05%,18.71%,25.12%
H7,18.36%,17.57%,16.80%,15.99%,18.29%,40.66%,29.99%,25.78%,20.43%,24.71%
H8,27.10%,26.03%,24.98%,23.80%,26.87%,55.76%,45.06%,33.30%,26.87%,32.53%
H9,36.72%,35.35%,34.00%,32.52%,34.97%,64.72%,60.89%,49.10%,35.77%,41.63%
H10,47.53%,45.83%,44.15%,42.35%,44.15%,73.23%,70.85%,65.97%,52.22%,51.81%
H11,53.11%,51.26%,49.41%,47.45%,48.74%,76.75%,74.05%,70.14%,60.92%,56.07%
H12,3.94%,1.86%,-0.25%,-2.61%,-1.68%,26.25%,23.89%,20.31%,14.71%,21.91%
H13,-1.50%,-3.90%,-6.32%,-9.01%,-8.19%,20.63%,18.69%,15.60%,10.63%,18.69%


## The state-value function

In [7]:
# State value = the largest action value available in each state.
V = Q.max(axis=1)

# Same grid layout as the tables above: hand labels down, card labels across.
value_table = pd.DataFrame(
    data=state_table(V, env),
    index=bj.hand_labels,
    columns=bj.card_labels,
)

# Four decimals per cell; negative values in red, everything else in black.
value_table.style.format('{:.4f}').applymap(lambda x: f'color: {"red" if x < 0 else "black"}')

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
H4,-0.1149,-0.0826,-0.0494,-0.0124,0.0111,-0.0883,-0.1593,-0.2407,-0.3439,-0.4829
H5,-0.1282,-0.0953,-0.0615,-0.024,-0.0012,-0.1194,-0.1881,-0.2666,-0.3662,-0.5006
H6,-0.1408,-0.1073,-0.0729,-0.0349,-0.013,-0.1519,-0.2172,-0.2926,-0.3887,-0.5183
H7,-0.1092,-0.0766,-0.043,-0.0073,0.0292,-0.0688,-0.2106,-0.2854,-0.3714,-0.5224
H8,-0.0218,0.008,0.0388,0.0708,0.115,0.0822,-0.0599,-0.2102,-0.3071,-0.4441
H9,0.0744,0.1013,0.129,0.158,0.196,0.1719,0.0984,-0.0522,-0.2181,-0.3532
H10,0.1825,0.2061,0.2305,0.2563,0.2878,0.2569,0.198,0.1165,-0.0536,-0.2513
H11,0.2384,0.2603,0.283,0.3073,0.3337,0.2921,0.23,0.1583,0.0334,-0.2087
H12,-0.2534,-0.2337,-0.2111,-0.1672,-0.1537,-0.2128,-0.2716,-0.34,-0.4287,-0.5504
H13,-0.2928,-0.2523,-0.2111,-0.1672,-0.1537,-0.2691,-0.3236,-0.3872,-0.4695,-0.5825


## Marginal contributions

In [8]:
# Build each grid once and reuse it below, rather than calling state_table
# twice for the same input.
# NOTE(review): env.isd is presumably the initial state distribution -- confirm.
isd_table = state_table(env.isd, env)
V_table = state_table(V, env)

# Marginal probabilities: summing across a row gives one number per hand,
# summing down a column gives one number per card.
hand_prob = isd_table.sum(axis=1)
card_prob = isd_table.sum(axis=0)

# Value of each hand averaged over the card marginal, and of each card
# averaged over the hand marginal.
hand_value = V_table @ card_prob
card_value = hand_prob @ V_table

# Probability-weighted contribution of every hand / card to the overall value.
hand_contrib = hand_prob * hand_value
card_contrib = card_prob * card_value

# Both decompositions must add up to the same total.
assert np.isclose(hand_contrib.sum(), card_contrib.sum())

## Hand margins

In [9]:
# One row per hand, with its probability, value and contribution side by side.
hand_margins = pd.DataFrame(
    data=np.column_stack([hand_prob, hand_value, hand_contrib]),
    index=bj.hand_labels,
    columns=['prob', 'value', 'contrib'],
)

# Show everything as a percentage; negative entries in red, the rest in black.
hand_margins.style.format('{:.2%}').applymap(lambda x: f'color: {"red" if x < 0 else "black"}')

Unnamed: 0,prob,value,contrib
H4,0.59%,-19.96%,-0.12%
H5,1.18%,-21.92%,-0.26%
H6,1.78%,-23.87%,-0.42%
H7,2.37%,-21.38%,-0.51%
H8,2.96%,-12.69%,-0.38%
H9,3.55%,-2.68%,-0.10%
H10,4.14%,9.76%,0.40%
H11,4.73%,15.60%,0.74%
H12,8.88%,-31.60%,-2.81%
H13,8.28%,-34.75%,-2.88%


## Card margins

In [10]:
# One column per card, with probability, value and contribution as the rows.
card_margins = pd.DataFrame(
    data=np.vstack([card_prob, card_value, card_contrib]),
    index=['prob', 'value', 'contrib'],
    columns=bj.card_labels,
)

# Show everything as a percentage; negative entries in red, the rest in black.
card_margins.style.format('{:.2%}').applymap(lambda x: f'color: {"red" if x < 0 else "black"}')

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
prob,7.69%,7.69%,7.69%,7.69%,7.69%,7.69%,7.69%,7.69%,30.77%,7.69%
value,6.64%,9.38%,12.21%,15.30%,18.27%,12.15%,4.40%,-4.77%,-17.79%,-33.89%
contrib,0.51%,0.72%,0.94%,1.18%,1.41%,0.93%,0.34%,-0.37%,-5.47%,-2.61%


## The house advantage

In [11]:
# Overall value: the state values weighted by env.isd and summed.
# NOTE(review): the payoff table is stated from the player's side (a winning
# blackjack pays +1.5), so a negative number here would be an edge for the
# house -- confirm the intended sign convention.
house_advantage = np.sum(env.isd * V)

# Both marginal decompositions must reproduce the same total.
for marginal_total in (hand_contrib.sum(), card_contrib.sum()):
    assert np.isclose(house_advantage, marginal_total)

print(f'{house_advantage:.2%}')

-2.42%


## References
[Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2020.pdf), second edition, Richard S. Sutton and Andrew G. Barto (2018).  
[Optimal stopping of Markov chains or How to play Blackjack](https://www.dropbox.com/s/xrntclqyx36jhis/Blackjack_talk_2001.pdf), Craig L. Zirbel (2001).