### Problem with DP code

The value function, the function that shows the estimated value of each state, shows that the value for hands with a usable ace have the same value for hands without a usalbe ace. However, this is clearly not the case shown in the Monte Carlo Sim. Even with logic you may figure out that the likelyhood of winning increase when you have a usable ace to bail you out of a bust. Say, you are at 27, with a usable ace you may count that as a 17 and play on.

# Value Function
3 dimensional array, where first index is \[cardsum(12 to 21) - 12\] (_for indexing_); second index is 0 if no usable ace and 1 if any; third index is dealer card \[(1 to 10)-1\](_for indexing_).

In [59]:
import numpy as np
import random
import cProfile

In [60]:
V = np.zeros((10, 2, 10))

# BlackJack Dynamic Programming Strategy

In [61]:
states = []
for i in range(10):
    for j in range(2):
        for k in range(10):
            states.append((i, j, k))

In [62]:
dealer_dist = [
    [13.6,  5.6 , 14.3, 14.6, 13.8, 38.1],
    [34.9, 13.6 , 14.1, 12.8, 12.8, 11.9],
    [38.1, 11.8 , 13.4, 12.6, 12.2, 12.0],
    [39.6, 11.9 , 13.3, 12.1, 11.8, 11.4],
    [42.1, 12.1 , 12.4, 11.0, 11.6, 10.9],
    [43.5, 12.3 , 11.8, 11.0, 11.2, 10.3],
    [26.3, 37.0 , 14.1,  7.9,  7.8,  7.0],
    [25.0, 12.7 , 35.6, 12.8,  7.3,  6.6],
    [22.3, 12.0 , 12.0, 35.6, 11.9,  6.2],
    [21.3, 11.25, 11.5, 11.1, 34.1, 11.3]
]

# cumulative win probability
# sum of all indexes before
for probabilities in dealer_dist:
    for i in range(1, 6):
        probabilities[i] += probabilities[i-1]    

In [81]:
pi = {}
for state in states:
    pi[state] = {
        0 : 0,
        1 : 1
    }
for h in range(10):
    for a in range(2):
        for d in range(10):
            if h < 5:
                pi[(h, a, d)][1] = 1
                pi[(h, a, d)][0] = 0
                
            if d == 9 or d == 0:
                pi[(h, a, d)][1] = 0
                pi[(h, a, d)][0] = 1
                
            if (d < 6 and d > 0) and h < 6:
                pi[(h, a, d)][1] = 1
                pi[(h, a, d)][0] = 0
            
            if a == 1:
                pi[(h, a, d)][1] = 1
                pi[(h, a, d)][0] = 0
                
            if h > 7:
                pi[(h, a, d)][1] = 0
                pi[(h, a, d)][0] = 1
                

In [64]:
for h in range(10):
    for a in range(2):
        for d in range(10):
            if h > 6:
                pi[(h, a, d)][1] = 0
                pi[(h, a, d)][0] = 1
            else:
                pi[(h, a, d)][1] = 1
                pi[(h, a, d)][0] = 0

In [65]:
pi

{(0, 0, 0): {0: 0, 1: 1},
 (0, 0, 1): {0: 0, 1: 1},
 (0, 0, 2): {0: 0, 1: 1},
 (0, 0, 3): {0: 0, 1: 1},
 (0, 0, 4): {0: 0, 1: 1},
 (0, 0, 5): {0: 0, 1: 1},
 (0, 0, 6): {0: 0, 1: 1},
 (0, 0, 7): {0: 0, 1: 1},
 (0, 0, 8): {0: 0, 1: 1},
 (0, 0, 9): {0: 0, 1: 1},
 (0, 1, 0): {0: 0, 1: 1},
 (0, 1, 1): {0: 0, 1: 1},
 (0, 1, 2): {0: 0, 1: 1},
 (0, 1, 3): {0: 0, 1: 1},
 (0, 1, 4): {0: 0, 1: 1},
 (0, 1, 5): {0: 0, 1: 1},
 (0, 1, 6): {0: 0, 1: 1},
 (0, 1, 7): {0: 0, 1: 1},
 (0, 1, 8): {0: 0, 1: 1},
 (0, 1, 9): {0: 0, 1: 1},
 (1, 0, 0): {0: 0, 1: 1},
 (1, 0, 1): {0: 0, 1: 1},
 (1, 0, 2): {0: 0, 1: 1},
 (1, 0, 3): {0: 0, 1: 1},
 (1, 0, 4): {0: 0, 1: 1},
 (1, 0, 5): {0: 0, 1: 1},
 (1, 0, 6): {0: 0, 1: 1},
 (1, 0, 7): {0: 0, 1: 1},
 (1, 0, 8): {0: 0, 1: 1},
 (1, 0, 9): {0: 0, 1: 1},
 (1, 1, 0): {0: 0, 1: 1},
 (1, 1, 1): {0: 0, 1: 1},
 (1, 1, 2): {0: 0, 1: 1},
 (1, 1, 3): {0: 0, 1: 1},
 (1, 1, 4): {0: 0, 1: 1},
 (1, 1, 5): {0: 0, 1: 1},
 (1, 1, 6): {0: 0, 1: 1},
 (1, 1, 7): {0: 0, 1: 1},
 (1, 1, 8): 

In [66]:
def blackjack_probability(s_, s, a, r):
    stay = 0
    hit = 1

    if a == stay:
        '''find Probability of win, loss, tie'''
        if s[0] < 5:
            # probability dealer busts
            if r == 1:
                return (dealer_dist[s[2]][0])/100

            else:
                return (1 - dealer_dist[s[2]][0]/100)
        
        else:
            index = s[0] - 4      # 17 is 1
            if r == 1:
                # probability of dealer recieving a lower score
                return (dealer_dist[s[2]][index-1]/ 100)
            
            elif r == 0:
                # probability of dealer getting player's score
                return ((dealer_dist[s[2]][index] - dealer_dist[s[2]][index-1])/100)
            
            else:
                # probability of dealer getting a higher score
                return (1-(dealer_dist[s[2]][index]/100))
                
    
    if a == hit:
        '''Find probability s_'''
        
        if s_ == (-1, -1, -1):
            '''find Probability of a bust'''
            if s[1] == 0:
                return (4 + s[0])/13
            else:
                # player cannot bust with an ace
                return 0
            
        if s[2] != s_[2]:
            '''dealer card is not the same'''
            return 0
        
            
        if s_[1] == 0 and s[1] == 1:
            '''player has to use his ace'''
            
            if s_[0] == s[0]:
                '''probability of a 10 value card'''
                return 4/13
            
            if s_[0] > s[0]:
                return 0
            
            return 1/13
        
        if s_[1] == s[1]:
            '''no ace transaction'''
            if s_[0] <= s[0]:
                '''new hand cannot be less than old hand'''
                return 0
        
        return 1/13

In [67]:
def pi20(action, state):
    '''deterministic policy that returns the probability of action given state'''
    stay = 0
    hit = 1
    
    # index 0 is a hand with value 12
    if state[0] >= 8:
        if action == stay:  
            return 1
        else:
            return 0
    else:
        if action == hit:
            return 0
        else:
            return 1


In [68]:
def evaluate_policy(V, theta, pi):
    while True:
        delta = 0
        for s1, s2, s3 in states:
            v = V[s1][s2][s3]
            if abs(v) > 2:
                return
            bellman_update(V, (s1, s2, s3), pi)
            delta = max(delta, abs(v - V[s1][s2][s3]))
        if delta < theta:
            break
    return V

In [69]:
def bellman_update(V, state, pi):
    """Mutate ``V`` according to the Bellman update equation."""
    stay = 0
    hit = 1
    
    hit_value = 0
    for s_ in states:
        Vs_ = V[s_[0]][s_[1]][s_[2]]
        hit_value += blackjack_probability(s_, state, hit, 0)*(0 + Vs_)
        
    
    # probability of a bust
    hit_value += blackjack_probability((-1, -1, -1), state, hit, -1)*(-1 + 0) 
    
    # value of staying
    stay_value = 0
    for r in [-1, 0, 1]:
        stay_value += blackjack_probability((-1, -1, -1), state, stay, r)*(r + 0)
    
    hand, ace, dealer = state
    V[hand][ace][dealer] = pi[state][hit]*hit_value + pi[state][stay]*stay_value
    return 

In [87]:
V

array([[[-0.728     , -1.01870831, -1.01565259, -0.99324397,
         -0.97592822, -0.9535196 , -0.8180493 , -0.80480784,
         -0.83943934, -0.574     ],
        [-0.41297241, -0.55466389, -0.55229912, -0.53495746,
         -0.52155709, -0.50421544, -0.39937724, -0.3891299 ,
         -0.41593064, -0.21095073]],

       [[-0.728     , -1.01852688, -1.01567914, -0.99479576,
         -0.97865859, -0.95777521, -0.83152564, -0.81918545,
         -0.85145978, -0.574     ],
        [-0.43547438, -0.58826863, -0.58590939, -0.56860826,
         -0.55523921, -0.53793808, -0.43334491, -0.42312152,
         -0.44985962, -0.23688282]],

       [[-0.728     , -1.01826048, -1.01560675, -0.99614604,
         -0.98110822, -0.96164751, -0.84399869, -0.83249919,
         -0.86257482, -0.574     ],
        [-0.45636907, -0.61940971, -0.61706669, -0.59988455,
         -0.58660744, -0.56942529, -0.46555142, -0.45539834,
         -0.48195256, -0.26096262]],

       [[-0.728     , -1.01792195, -1.01544918

In [83]:
V = np.zeros((10, 2, 10))
cProfile.run("evaluate_policy(V, 0.01, pi)")

         416004 function calls in 0.674 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   408000    0.138    0.000    0.138    0.000 <ipython-input-66-f3052806827b>:1(blackjack_probability)
        1    0.005    0.005    0.674    0.674 <ipython-input-68-f4506db57596>:1(evaluate_policy)
     2000    0.530    0.000    0.668    0.000 <ipython-input-69-6eee29b1f731>:1(bellman_update)
        1    0.000    0.000    0.674    0.674 <string>:1(<module>)
     4000    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
        1    0.000    0.000    0.674    0.674 {built-in method builtins.exec}
     2000    0.001    0.000    0.001    0.000 {built-in method builtins.max}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}




In [84]:
import plotly.graph_objects as go

y_values = []
x_values = []

for i in range(12, 22):
    for k in range(12, 22):
        x_values.append(i)

for i in range(10):
    for k in range(1, 11):
        y_values.append(k)


In [85]:
without_ace = np.zero((10, 10))
for i in range(len(V)):
    for j in range(len(V[0][0])):
        without_ace[i][j] = V[i][0][j]
        
z_values = np.array(without_ace).flatten()

without_ace_fig = go.Figure(data=[go.Mesh3d(x=np.array(x_values),
                   y=(np.array(y_values)),
                   z=(np.array(z_values)),
                   opacity=0.5,
                   color='rgba(244,22,100,0.6)'
                  )])

without_ace_fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=10, range=[12, 21],),
                     yaxis = dict(nticks=10, range=[1, 10],),
                     zaxis = dict(nticks=5, range=[-1.5, 1.5],),),
    width=700,
    margin=dict(r=20, l=10, b=10, t=10))

without_ace_fig.show()

In [86]:
with_ace = np.zeros((10, 10))
for i in range(len(V)):
    for j in range(len(V[0][0])):
         with_ace[i][j] = V[i][1][j]


z_values = np.array(with_ace).flatten()

with_ace_fig = go.Figure(data=[go.Mesh3d(x=np.array(x_values),
                   y=(np.array(y_values)),
                   z=(np.array(z_values)),
                   opacity=0.5,
                   color='rgba(244,22,100,0.6)'
                  )])

with_ace_fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=10, range=[12, 21],),
                     yaxis = dict(nticks=10, range=[1, 10],),
                     zaxis = dict(nticks=5, range=[-1.5, 1.5],),),
    width=700,
    margin=dict(r=20, l=10, b=10, t=10))

with_ace_fig.show()