### Problem with DP code

The value function, the function that shows the estimated value of each state, shows that the value for hands with a usable ace have the same value for hands without a usalbe ace. However, this is clearly not the case shown in the Monte Carlo Sim. Even with logic you may figure out that the likelyhood of winning increase when you have a usable ace to bail you out of a bust. Say, you are at 27, with a usable ace you may count that as a 17 and play on.

# Value Function
3 dimensional array, where first index is \[cardsum(12 to 21) - 12\] (_for indexing_); second index is 0 if no usable ace and 1 if any; third index is dealer card \[(1 to 10)-1\](_for indexing_).

In [30]:
import numpy as np
import random
import cProfile

In [31]:
V = np.array([[[0.]*10]*2]*10)

# BlackJack Dynamic Programming Strategy

In [32]:
states = []
for i in range(10):
    for j in range(2):
        for k in range(10):
            states.append((i, j, k))

In [33]:
dealer_dist = [
    [13.6, 5.6, 14.3, 14.6, 13.8, 38.1],
    [34.9, 13.6, 14.1, 12.8, 12.8, 11.9],
    [38.1, 11.8, 13.4, 12.6, 12.2, 12.0],
    [39.6, 11.9, 13.3, 12.1, 11.8, 11.4],
    [42.1, 12.1, 12.4, 11.0, 11.6, 10.9],
    [43.5, 12.3, 11.8, 11.0, 11.2, 10.3],
    [26.3, 37.0, 14.1, 7.9, 7.8, 7.0],
    [25.0, 12.7,35.6, 12.8, 7.3, 6.6],
    [22.3, 12.0, 12.0, 35.6, 11.9, 6.2],
    [21.3, 11.25, 11.5, 11.1, 34.1, 11.3]
]

# cumulative win probability
# sum of all indexes before
for probabilities in dealer_dist:
    for i in range(1, 6):
        probabilities[i] += probabilities[i-1]
        

In [44]:
def blackjack_probability(s_, s, a, r):
    stay = 0
    hit = 1

    if a == stay:
        '''find Probability of win, loss, tie'''
        if s[0] < 5:
            # probability dealer busts
            if r == 1:
                return (dealer_dist[s[2]][0])/100

            else:
                return (1 - dealer_dist[s[2]][0]/100)
        
        else:
            index = s[0] - 4      # 17 is 1
            if r == 1:
                return (dealer_dist[s[2]][index-1]/ 100)
            
            elif r == 0:
                return ((dealer_dist[s[2]][index] - dealer_dist[s[2]-1][index-1])/100)
            
            else:
                return (1-(dealer_dist[s[2]][index]/100))
                
    
    if a == hit:
        '''Find probability s_'''
        if s[2] != s_[2]:
            '''dealer card is not the same'''
            return 0
        
        if s_ == (-1, -1, -1):
            '''find Probability of a bust'''
            return (4 + s[0])/13
            
        if s_[1] == 0 and s[1] == 1:
            '''player has to use his ace'''
            
            if s_[0] == s[0]:
                '''probability of a 10 value card'''
                return 4/13
            
            if s_[0] > s[0]:
                return 0
            
            return 1/13
        
        if s_[1] == s[1]:
            '''no ace transaction'''
            if s_[0] <= s[0]:
                '''new hand cannot be less than old hand'''
                return 0
        
        return 1/13

In [35]:
def pi_20(action, state):
    '''deterministic policy that returns the probability of action given state'''
    stay = 0
    hit = 1
    
    if state[0] >= 8:
        if action == stay:  
            return 1
        else:
            return 0
    else:
        if action == hit:
            return 0
        else:
            return 1

In [36]:
def evaluate_policy(V, theta):
    while True:
        delta = 0
        for s1, s2, s3 in states:
            v = V[s1][s2][s3]
            bellman_update(V, (s1, s2, s3))
            delta = max(delta, abs(v - V[s1][s2][s3]))
        if delta < theta:
            break
    return V

In [43]:
def bellman_update(V, state):
    """Mutate ``V`` according to the Bellman update equation."""
    stay = 0
    hit = 1
    
    hit_value = 0
    for s_ in states:
        Vs_ = V[s_[0]][s_[1]][s_[2]]
        hit_value += blackjack_probability(s_, state, hit, 0)*(0 + Vs_) 
    
    # probability of a bust
    hit_value += blackjack_probability((-1, -1, -1), state, hit, 0)*(-1 + 0) 
    
    # value of staying
    stay_value = 0
    for r in range(3):
        stay_value += blackjack_probability((-1, -1, -1), state, stay, (r-1))*(r-1)
    
    hand, ace, dealer = state
    V[hand][ace][dealer] = pi_20(hit, state)*hit_value + pi_20(stay, state)*stay_value
    return 

In [45]:
evaluate_policy(V, 0.00005)
cProfile.run("evaluate_policy(V, 0.00005)")

         41804 function calls in 0.044 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      400    0.000    0.000    0.000    0.000 <ipython-input-35-95ae22cb2c62>:1(pi_20)
        1    0.000    0.000    0.044    0.044 <ipython-input-36-576cca37a699>:1(evaluate_policy)
      200    0.037    0.000    0.044    0.000 <ipython-input-43-2c44fb10cedd>:1(bellman_update)
    40800    0.007    0.000    0.007    0.000 <ipython-input-44-6cdbcf21c358>:1(blackjack_probability)
        1    0.000    0.000    0.044    0.044 <string>:1(<module>)
      200    0.000    0.000    0.000    0.000 {built-in method builtins.abs}
        1    0.000    0.000    0.044    0.044 {built-in method builtins.exec}
      200    0.000    0.000    0.000    0.000 {built-in method builtins.max}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}




In [46]:
V

array([[[-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ],
        [-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ]],

       [[-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ],
        [-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ]],

       [[-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ],
        [-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ]],

       [[-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ],
        [-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ]],

       [[-0.728 , -0.302 , -0.238 , -0.208 , -0.158 , -0.13  , -0.474 ,
         -0.5   , -0.554 , -0.574 ],
        [-0

In [47]:
import plotly.graph_objects as go

y_values = []
x_values = []

for i in range(12, 22):
    for k in range(12, 22):
        x_values.append(i)

for i in range(10):
    for k in range(1, 11):
        y_values.append(k)


In [48]:
without_ace = np.array([[0.]*10]*10)
for i in range(len(V)):
    for j in range(len(V[0][0])):
        without_ace[i][j] = V[i][0][j]
        
z_values = np.array(without_ace).flatten()

without_ace_fig = go.Figure(data=[go.Mesh3d(x=np.array(x_values),
                   y=(np.array(y_values)),
                   z=(np.array(z_values)),
                   opacity=0.5,
                   color='rgba(244,22,100,0.6)'
                  )])

without_ace_fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=10, range=[12, 21],),
                     yaxis = dict(nticks=10, range=[1, 10],),
                     zaxis = dict(nticks=5, range=[-1.5, 1.5],),),
    width=700,
    margin=dict(r=20, l=10, b=10, t=10))

without_ace_fig.show()

In [49]:
with_ace = np.array([[0.]*10]*10)
for i in range(len(V)):
    for j in range(len(V[0][0])):
         with_ace[i][j] = V[i][1][j]


z_values = np.array(without_ace).flatten()

with_ace_fig = go.Figure(data=[go.Mesh3d(x=np.array(x_values),
                   y=(np.array(y_values)),
                   z=(np.array(z_values)),
                   opacity=0.5,
                   color='rgba(244,22,100,0.6)'
                  )])

with_ace_fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=10, range=[12, 21],),
                     yaxis = dict(nticks=10, range=[1, 10],),
                     zaxis = dict(nticks=5, range=[-1.5, 1.5],),),
    width=700,
    margin=dict(r=20, l=10, b=10, t=10))

with_ace_fig.show()