```
         Copyright Rein Halbersma 2020.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Dynamic programming for the game of Blackjack

In [1]:
import gym
import pandas as pd

import gym_blackjack_v1 as bj
from doctrina import spaces
from doctrina.algorithms import dp

# Create the Blackjack environment; a winning blackjack pays +1.5 (3:2).
# NOTE(review): `model_based=True` presumably exposes the model that the
# dynamic-programming solver needs -- confirm against gym_blackjack_v1.
env = gym.make(
    'Blackjack-v1',
    winning_blackjack=+1.5,
    model_based=True,
)

## The payout

In [2]:
# Payout matrix as a labelled table:
# rows are the player's final count, columns are the dealer's final count.
pd.DataFrame(data=env.payout, index=bj.count_labels, columns=bj.count_labels)

Unnamed: 0,BUST,16,17,18,19,20,21,BJ
BUST,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
16,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
17,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
18,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0
19,1.0,1.0,1.0,1.0,0.0,-1.0,-1.0,-1.0
20,1.0,1.0,1.0,1.0,1.0,0.0,-1.0,-1.0
21,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.0
BJ,1.5,1.5,1.5,1.5,1.5,1.5,1.5,0.0


## Value iteration

In [3]:
# Solve the environment with value iteration.
# The fourth return value is bound to `n_iter` instead of `iter`, so the
# builtin `iter()` is not shadowed for the rest of the notebook session.
# NOTE(review): `delta` and `n_iter` are presumably the final update size and
# the number of sweeps performed -- confirm against doctrina.algorithms.dp.
V, policy, delta, n_iter = dp.V_value_iter(env)

## The state-value function

In [4]:
# Lay the flat value vector out on the state-space grid, skip the first hand
# entry, and keep one row per hand label and one column per card label.
state_shape = spaces.shape(env.state_space)
value_grid = V.reshape(state_shape)[1:len(bj.Hand), :len(bj.Card)]

# Display the values rounded to four decimals.
pd.DataFrame(data=value_grid, index=bj.hand_labels[1:], columns=bj.card_labels).round(4)

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
H2,-0.0759,-0.0498,-0.0221,0.0137,0.0389,-0.0273,-0.1032,-0.19,-0.3003,-0.4485
H3,-0.1005,-0.0689,-0.0363,0.0002,0.0245,-0.0574,-0.1309,-0.2151,-0.3218,-0.4655
H4,-0.1149,-0.0826,-0.0494,-0.0124,0.0111,-0.0883,-0.1593,-0.2407,-0.3439,-0.4829
H5,-0.1282,-0.0953,-0.0615,-0.024,-0.0012,-0.1194,-0.1881,-0.2666,-0.3662,-0.5006
H6,-0.1408,-0.1073,-0.0729,-0.0349,-0.013,-0.1519,-0.2172,-0.2926,-0.3887,-0.5183
H7,-0.1092,-0.0766,-0.043,-0.0073,0.0292,-0.0688,-0.2106,-0.2854,-0.3714,-0.5224
H8,-0.0218,0.008,0.0388,0.0708,0.115,0.0822,-0.0599,-0.2102,-0.3071,-0.4441
H9,0.0744,0.1013,0.129,0.158,0.196,0.1719,0.0984,-0.0522,-0.2181,-0.3532
H10,0.1825,0.2061,0.2305,0.2563,0.2878,0.2569,0.198,0.1165,-0.0536,-0.2513
H11,0.2384,0.2603,0.283,0.3073,0.3337,0.2921,0.23,0.1583,0.0334,-0.2087


## The optimal policy

In [5]:
# Lay the flat policy vector out on the state-space grid (skipping the first
# hand entry) and show each action as its upper-cased label. Cells labelled
# 'S' are blanked out so the remaining actions stand out in the table.
pd.DataFrame(
    policy.reshape(spaces.shape(env.state_space))[1:len(bj.Hand), :len(bj.Card)],
    index=bj.hand_labels[1:],
    columns=bj.card_labels
).apply(
    # Element-wise mapping done column by column with Series.map:
    # DataFrame.applymap is deprecated since pandas 2.1 (FutureWarning),
    # while DataFrame.map does not exist in older pandas releases.
    lambda column: column.map(lambda a: bj.action_labels[a].upper())
).replace({'S': ' '})

Unnamed: 0,2,3,4,5,6,7,8,9,T,A
H2,H,H,H,H,H,H,H,H,H,H
H3,H,H,H,H,H,H,H,H,H,H
H4,H,H,H,H,H,H,H,H,H,H
H5,H,H,H,H,H,H,H,H,H,H
H6,H,H,H,H,H,H,H,H,H,H
H7,H,H,H,H,H,H,H,H,H,H
H8,H,H,H,H,H,H,H,H,H,H
H9,H,H,H,H,H,H,H,H,H,H
H10,H,H,H,H,H,H,H,H,H,H
H11,H,H,H,H,H,H,H,H,H,H


## References
[Reinforcement Learning: An Introduction, second edition](http://incompleteideas.net/book/RLbook2020.pdf), Richard S. Sutton and Andrew G. Barto (2018).  
[Optimal stopping of Markov chains or How to play Blackjack](https://www.dropbox.com/s/xrntclqyx36jhis/Blackjack_talk_2001.pdf), Craig L. Zirbel (2001).