In [154]:
import numpy as np
import pandas as pd
import heapq
import ast

In [152]:
def percentChange(current, previous): #returns percent change between current and previous
    if current == previous:
        return 0
    try:
        return (abs(current - previous) / previous) * 100.0
    except ZeroDivisionError:
        return float('inf')
    
#-----------------------#given a row in the Q table, returns action that the model believes is best (max)------------------------------------------------
def whichAction(row): 
    action = np.argmax(row)
    if action == 0:
        return 'Hit'
    elif action == 1:
        return 'Stand'
    elif action == 2:
        return 'Double'
    elif action == 3:
        return "Surrender"

#-----------------------nth best action according to the model within percent% of max action------------------------------------------------
def nextAction(row, n, percent = 0):
    if n > 4: #only 4 possible actions
        return "Error, only 4 actions are possible (n>4)"
    
    action = heapq.nlargest(n, range(len(row)), key = row.__getitem__)[n-1] #index of nth max
    action_val = heapq.nlargest(n, row)[1] #value of nth max
    
    if percent: #if we want to check that the nth action is within percent% of the max action
        if percentChange(action_val, np.max(row)) > percent: 
            return "No"
        
    if action == 0:
        return 'Hit (c)'
    elif action == 1:
        return 'Stand (c)'
    elif action == 2:
        return 'Double (c)'
    elif action == 3 and n == 2:
        return "Surrender"
    elif action == 3 and n == 3:
        return "Surrender (c)"
    
#-----------------------highlight cells with color according to action------------------------------------------------
def highlight_actions(val):
    if val == 'Hit':
        color = 'green'
    elif val == 'Stand':
        color = 'brown'
    elif val == 'Double':
        color = 'blue'
    elif val == 'Surrender':
        color = 'goldenrod'
    elif val == 'Hit (c)':
        color = 'darkgreen'
    elif val == 'Stand (c)':
        color = 'firebrick'
    elif val == 'Double (c)':
        color = 'mediumblue'
    elif val == 'Surrender (c)':
        color = 'darkgoldenrod'
    else:
        color = 'white'
    return 'background-color: %s' % color

#-------------------given a state in Q table, returns [row, column] coordinates in basic_strategy table----------------
def state_to_BS(state): 
    state_lst = ast.literal_eval(state)
    hand_total = state_lst[0]
    dealer_upcard = state_lst[1]
    if dealer_upcard == 11:
        dealer_upcard = 'A'
        
    ace = state_lst[2]
    
    BS_column = dealer_upcard
    
    if ace:
        if hand_total == 12:
            hand_total = 'A,A'
        else:
            hand_total = 'A,%i'%(hand_total-11)
    
    BS_row = hand_total  
    
    return [BS_row, BS_column]

#-------------------generate dictionary with keys = row number in Q, and value = corresponding state-----------------
no_ace_states = []
ace_states = []

for i in [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    for j in [2,3,4,5,6,7,8,9,10,11]:
        no_ace_states.append([i,j,0])

for i in [12,13,14,15,16,17,18,19,20]:
    for j in [2,3,4,5,6,7,8,9,10,11]:
        ace_states.append([i,j,1])

all_states = no_ace_states + ace_states

states_dict = {}

for i in np.arange(len(all_states)):
    states_dict[i] = str(all_states[i])

#-------------------generates basic strategy table according to Q and with any corrections desired-----------------
def generateBS(Q, basic_strategy, correction = 'None'):
    for num, row in enumerate(Q):
        state = states_dict[num]
        action = whichAction(row)
        bs_coords = state_to_BS(state)
        
        if correction != 'None':
            if correction == 'within 0.5%':
                p = 0.005
            elif correction == 'second action':
                p = 0
                
            if action != correct_bs.loc[bs_coords[0], bs_coords[1]]:
                #either just the next action ('second action') or next action within p% ('within 0.5%')
                second_action = nextAction(row, 2, p)
                if second_action != "No":
                    action = second_action

            if action == "Surrender" and action != correct_bs.loc[bs_coords[0], bs_coords[1]]:
                action = nextAction(row, 3)
        
        basic_strategy.loc[bs_coords[0], bs_coords[1]] = action
    return basic_strategy.style.applymap(highlight_actions)

In [101]:
#empty basic_strategy to fill with Q table
dealer_upcard = [2,3,4,5,6,7,8,9,10,'A']
no_ace_hand = [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
ace_hand = ['A,A', 'A,2', 'A,3', 'A,4', 'A,5', 'A,6', 'A,7', 'A,8', 'A,9']
basic_strategy = pd.DataFrame(columns = dealer_upcard, index = no_ace_hand)
basic_strategy_ace = pd.DataFrame(columns = dealer_upcard, index = ace_hand)
basic_strategy = basic_strategy.append(basic_strategy_ace)

## Correct Basic Strategy

In [102]:
#generate table with correct basic strategy

dealer_upcard = [2,3,4,5,6,7,8,9,10,'A']
no_ace_hand = [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
ace_hand = ['A,A', 'A,2', 'A,3', 'A,4', 'A,5', 'A,6', 'A,7', 'A,8', 'A,9']
basic_strategy = pd.DataFrame(columns = dealer_upcard, index = no_ace_hand)
basic_strategy_ace = pd.DataFrame(columns = dealer_upcard, index = ace_hand)
correct_bs = basic_strategy.append(basic_strategy_ace)

correct_bs.loc[4] = "Hit"
correct_bs.loc[5] = "Hit"
correct_bs.loc[6] = "Hit"
correct_bs.loc[7] = "Hit"
correct_bs.loc[8] = "Hit"
correct_bs.loc[9] = ["Hit", "Double", "Double", "Double", "Double", "Hit", "Hit", "Hit", "Hit", "Hit"]
correct_bs.loc[10] = ["Double","Double","Double","Double","Double","Double","Double","Double","Hit", "Hit"]
correct_bs.loc[11] = "Double"
correct_bs.loc[12] = ["Hit", "Hit", "Stand", "Stand", "Stand", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc[13] = ["Stand","Stand","Stand","Stand","Stand","Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc[14] = ["Stand","Stand","Stand","Stand","Stand","Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc[15] = ["Stand","Stand","Stand","Stand","Stand","Hit","Hit","Hit","Surrender","Surrender"]
correct_bs.loc[16] = ["Stand","Stand","Stand","Stand","Stand","Hit","Hit","Surrender","Surrender","Surrender"]
correct_bs.loc[17] = "Stand"
correct_bs.loc[18] = "Stand"
correct_bs.loc[19] = "Stand"
correct_bs.loc[20] = "Stand"
correct_bs.loc['A,A'] = ["Hit", "Hit", "Stand", "Stand", "Stand", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,2'] = ["Hit", "Hit", "Hit", "Double", "Double", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,3'] = ["Hit", "Hit", "Hit", "Double", "Double", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,4'] = ["Hit", "Hit", "Double", "Double", "Double", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,5'] = ["Hit", "Hit", "Double", "Double", "Double", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,6'] = ["Hit", "Double", "Double", "Double", "Double", "Hit","Hit","Hit","Hit","Hit"]
correct_bs.loc['A,7'] = ["Double", "Double", "Double", "Double", "Double", "Stand","Stand","Hit","Hit","Hit"]
correct_bs.loc['A,8'] = ["Stand", "Stand", "Stand", "Stand", "Double", "Stand","Stand","Stand","Stand","Stand"]
correct_bs.loc['A,9'] = "Stand"
correct_bs.style.applymap(highlight_actions)

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
4,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
5,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
6,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
7,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
8,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
9,Hit,Double,Double,Double,Double,Hit,Hit,Hit,Hit,Hit
10,Double,Double,Double,Double,Double,Double,Double,Double,Hit,Hit
11,Double,Double,Double,Double,Double,Double,Double,Double,Double,Double
12,Hit,Hit,Stand,Stand,Stand,Hit,Hit,Hit,Hit,Hit
13,Stand,Stand,Stand,Stand,Stand,Hit,Hit,Hit,Hit,Hit


## Results

In [143]:
#dictionary of 22 tested Q tables
QTables = {}
for i in np.arange(1,23):
    QTables[i] = np.load("/Users/dannyfarid/Desktop/Blackjack RL/QTables/Q_table-%i.npy"%i)

In [144]:
generateBS(QTables[19], basic_strategy)

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
4,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
5,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
6,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Double,Hit,Hit
7,Hit,Hit,Hit,Hit,Hit,Hit,Surrender,Hit,Hit,Hit
8,Hit,Hit,Hit,Double,Double,Hit,Hit,Hit,Hit,Hit
9,Double,Double,Double,Double,Double,Double,Hit,Hit,Hit,Hit
10,Double,Double,Double,Double,Double,Double,Double,Double,Hit,Hit
11,Hit,Double,Double,Double,Double,Hit,Hit,Double,Hit,Hit
12,Hit,Double,Stand,Stand,Stand,Double,Surrender,Surrender,Stand,Stand
13,Hit,Stand,Hit,Hit,Hit,Hit,Stand,Hit,Stand,Hit


In [145]:
#72% accurate

## Correcting (Taking Second Action if < 1% Change)

Here, we apply a correction where if the model's top chosen action is incorrect (according to the correct_bs dataframe), we check if its second top option value was within 0.5% of the top action value, and if it is then we take that action instead. We see that the accuracy jumps from 72% to 84%, meaning that for the cells the model got wrong, it was often very close to picking the correct one instead. This shows that perhaps with more training or parameter tuning, the model would increase its accuracy significantly.

In [146]:
#empty corrected basic strategy data frame to fill
basic_strategy_corrected = pd.DataFrame(columns = dealer_upcard, index = no_ace_hand)
basic_strategy_ace = pd.DataFrame(columns = dealer_upcard, index = ace_hand)
basic_strategy_corrected = basic_strategy_corrected.append(basic_strategy_ace)

In [147]:
generateBS(QTables[19], basic_strategy_corrected,'within 0.5%')

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
4,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
5,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
6,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit (c),Hit,Hit
7,Hit,Hit,Hit,Hit,Hit,Hit,Hit (c),Hit,Hit,Hit
8,Hit,Hit,Hit,Double,Double,Hit,Hit,Hit,Hit,Hit
9,Double,Double,Double,Double,Double,Double,Hit,Hit,Hit,Hit
10,Double,Double,Double,Double,Double,Double,Double,Double,Hit,Hit
11,Hit,Double,Double,Double,Double,Hit,Hit,Double,Hit,Hit
12,Hit,Hit (c),Stand,Stand,Stand,Hit (c),Double (c),Hit (c),Double (c),Hit (c)
13,Stand (c),Stand,Stand (c),Stand (c),Stand (c),Hit,Hit (c),Hit,Hit (c),Hit


In [109]:
#84% accuracy

## Corrected (Taking Second Action)

Here, we apply a slightly broader correction where if the model's top chosen action is incorrect (according to the correct_bs dataframe), we instead fill the table with the model's second top action regardless of how far/close it is to the top action. We see that the accuracy jumps from 72% to 93%, meaning that it is very likely for the correct action to be within the model's first two choices. In other words, if the model's first choice isn't correct, it is likely that its second one will be. This again exemplifies the potential of the model to return very high accuracy with optimal parameters and conditions.

In [148]:
generateBS(QTables[19], basic_strategy_corrected, 'second action')

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
4,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
5,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit
6,Hit,Hit,Hit,Hit,Hit,Hit,Hit,Hit (c),Hit,Hit
7,Hit,Hit,Hit,Hit,Hit,Hit,Hit (c),Hit,Hit,Hit
8,Hit,Hit,Hit,Hit (c),Hit (c),Hit,Hit,Hit,Hit,Hit
9,Hit (c),Double,Double,Double,Double,Hit (c),Hit,Hit,Hit,Hit
10,Double,Double,Double,Double,Double,Double,Double,Double,Hit,Hit
11,Double (c),Double,Double,Double,Double,Double (c),Double (c),Double,Double (c),Double (c)
12,Hit,Hit (c),Stand,Stand,Stand,Hit (c),Double (c),Hit (c),Double (c),Hit (c)
13,Stand (c),Stand,Stand (c),Stand (c),Stand (c),Hit,Hit (c),Hit,Hit (c),Hit


In [44]:
#93% accuracy

## Analysis of Error and Tools for Checking Individual Rows/Cells

If you want to look into the specific action values for each row in a Q table, we show how to do this below using reverse_states_dict and the function checkQrow(). Pass in a state as a string of a list of the form: $[Your hand total, Dealer's upcard, # of Aces you have in play]$ and it will return the row in the Q table that corresponds to that state. The first entry in the returned row is the action value for "Hit," the second for "Stay," the third for "Double," and the fourth for "Surrender."

In [119]:
reverse_states_dict = {}

for i in np.arange(len(all_states)):
    reverse_states_dict[str(all_states[i])] = i

def checkQrow(Q, state):
    row_num = reverse_states_dict[state]
    return Q[row_num]

For example, in Q_table19, we can check the action values for the state where the player has soft 17 (A, 6), the dealer is showing a 5, and there is 1 ace in play: $[17, 5, 1]$:

In [149]:
checkQrow(QTables[19], '[17, 5, 1]')

array([ 0.06243137,  0.08      ,  9.78      , -0.4925    ])

We can see that the action value for "Double" (9.78) is the highest by far, which is why the basic strategy table generated by generateBS() using Q_table19 shows a blue "Double" in the cell corresponding to 'A,6' for the player hand and '5' for the dealer upcard. Note that according to correct basic strategy, this is the correct decision for this state.

We can also look at examples where the model was incorrect, for example when the player's hand is a 9, the dealer is showing a 2, and we do not have any aces: $[9, 2, 0]$

In [155]:
checkQrow(QTables[19], '[9, 2, 0]')

array([-0.27011479, -0.815     ,  8.5       , -1.225     ])

Here, we see that the model chose to "Double" (the 8.5 in the third entry). This, however, is the incorrect decision for this state. The correct decision is to just "Hit," and when applying corrections we can see that the model's second best option was indeed to "Hit." Note that the value for "Hit" is -0.2, which while still larger than all the other options, is not within 0.5% of the value for "Double." This is why this correction does not show up in the "within 0.5%" corrected table, but does show up in the 'second action' corrected table above. Also note that this state is just on the cusp of doubling according to correct basic strategy ($[9, 3, 0]$, for example, is a double, see below), so it makes sense that the model might get this wrong and say to double. 

In [151]:
checkQrow(QTables[19], '[9, 3, 0]')

array([-0.022883, -0.65    , 16.09    , -1.2825  ])

Note here that for the state $[9,3,0]$, which is actually supposed to be a double, the action value for "Double" is much higher than it was for $[9,2,0]$ (16.09 vs 8.5), and furthermore the action value for "Hit" is much lower than for $[9,2,0]$ (-0.02 vs -0.2). This shows us that the model was approaching the correct answer and exhibited more hesitancy in doubling $[9,2,0]$ (incorrect) than it did when doubling $[9,3,0]$ (correct). This is encouraging, as it suggests that model has more confidence when making correct decisions than it does when making incorrect decisions.