In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle

In [None]:
# Reset the persisted belief pickle of one bot configuration to a uniform prior.
# Asks for confirmation first, because this overwrites whatever was stored in the file.
if input("Are you sure you want to flush data?") == 'y':
  bot_state_specific = True
  bot_strategy = 3
  # file name = strategy id, plus an "s" suffix when the bot is state specific
  filename = "bj_belief_" + str(bot_strategy) + ("s" * bot_state_specific)
  if bot_state_specific:
    temp_belief = np.ones((26,2)) / 2  # one uniform two-action pair per row (26 rows)
  else:
    temp_belief = np.ones(2) / 2       # a single uniform two-action pair
  # stored payload is (generation counter, belief array);
  # the context manager closes the file even if the dump raises
  with open(filename, 'wb') as tempfile:
    pickle.dump((1,temp_belief), tempfile)
  print("FLUSHED DATA")
else:
  print("no data flushed")

Are you sure you want to flush data?y
FLUSHED DATA


In [None]:
# void player.setup((player card 1, player card 2), dealer card, [(opp card 1, ...)...])
# void player.deal(int card)
# bool player.hit() returns true if player hits and false if player stands
# void player.feedback(bool won?)

In [None]:
class HumanPlayer:
  """Interactive blackjack player: reports its hand on stdout and reads hit/stand from stdin."""

  def __init__(self, my_hand, opp_hands, num_decks=6):
    """Store the two starting cards and count the cards that are still unseen.

    my_hand   -- starting cards; entries 0 and 1 are used
    opp_hands -- the other visible hands (list of tuples of card values)
    num_decks -- number of decks in circulation
    """
    first_card, second_card = my_hand[0], my_hand[1]
    self.hand = [first_card, second_card]  # grows as cards are dealt; repeats allowed
    self.opp_hands = opp_hands
    self.num_decks = num_decks

    # card value -> copies still in the deck or face down (1 = ace, 10 = 10/J/Q/K)
    self.card_totals = {rank: 4*num_decks for rank in range(1,10)}
    self.card_totals[10] = 16*num_decks

    # every card that is already visible leaves the unseen pool
    for visible_hand in [self.hand, *self.opp_hands]:
      for card in visible_hand:
        self.card_totals[card] -= 1
    print("[Human] Your starting hand: ", self.hand, sep="")


  def deal(self, card):
    """Add a newly dealt card to the hand and remove it from the unseen pool."""
    self.card_totals[card] -= 1
    self.hand.append(card)
    print("[Human] You've been dealt a card: ", card, sep="")
    print("[Human] Your current hand: ", self.hand, sep="")

  def hit(self):
    """Prompt on stdin; return True for 'h'/'H' (hit), False for anything else (stand)."""
    print("[Human] Type 'h' to hit, otherwise stand: ", end="")
    answer = input()
    return answer in ('h', 'H')

  def win(self, reward):
    """Announce the units won this round."""
    print("[Human] You won ", reward, " units!", sep="")

  def lose(self, punishment):
    """Announce the (negative) net result of a lost round."""
    print("[Human] You netted ", punishment, " units.", sep="")

In [None]:
class AutoPlayer:
  """Blackjack bot that samples hit/stand decisions from a table of action beliefs.

  Action index 0 means stand and 1 means hit. With state_specific=True the belief is a
  (26, 2) array with one row per hand state (see hit() for the encoding); otherwise it
  is a single length-2 array shared by every hand.
  """

  def __init__(self, my_hand, opp_hands, num_decks=6, state_specific=True, name="Robot", iteration=1, prior_belief=None, strategy=0, verbose=True):
    """Set up the hand, the unseen-card counts and the action probabilities.

    my_hand        -- starting cards; entries 0 and 1 are used (card values 1-10, 1 = ace)
    opp_hands      -- visible hands of the other participants (list of tuples of card values)
    num_decks      -- number of decks in circulation
    state_specific -- keep one belief pair per hand state instead of one global pair
    name           -- label used in log messages
    iteration      -- training round counter t; determines the exploration rate gamma
    prior_belief   -- belief array to start from. It is used as-is (not copied), so
                      feedback() updates the caller's array in place. None means uniform.
    strategy       -- 0, 1 or anything else (treated as the "mix" strategy 2), see below;
                      forced to 0 when state_specific is False
    verbose        -- print a log line for each event when True
    """
    self.hand = [my_hand[0], my_hand[1]] # list of integers (repeats and appends allowed)
    self.opp_hands = opp_hands # list of tuples
    self.num_decks = num_decks # number of decks in circulation
    self.state_specific = state_specific
    self.name = name
    # Keep the flag on the instance: the methods used to read a global `verbose`,
    # which fails on a fresh kernel and ignored the value passed here.
    self.verbose = verbose
    self.last_move = (-1, 0)  # (state, action); action is 0 for stand and 1 for hit
    self.has_decided = False  # becomes True once hit() has actually sampled an action
    self.t = iteration # which training round this bot belongs to
    # gamma = min(1, sqrt(num_actions * 2 * log(num_actions) / t)) with num_actions = 2
    self.gamma = min(1, math.sqrt(2 * 2 * math.log(2) / (self.t) ) )
    if self.state_specific:
      self.strategy = strategy
    else:
      self.strategy = 0  # the p_bust based strategies are only defined per state

    # card value -> copies still in the deck or face down (1 = ace, 10 = 10/J/Q/K)
    self.card_totals = {number_card: 4*num_decks for number_card in range(1,10)}
    self.card_totals[10] = 16*num_decks
    for card in self.hand:
      self.card_totals[card] -= 1
    for opp_hand in self.opp_hands:
      for card in opp_hand:
        self.card_totals[card] -= 1
    self._log("Starting hand: " + str(self.hand))

    # start from the supplied belief, or from a uniform one
    if prior_belief is not None:
      self.belief = prior_belief
    elif self.state_specific:
      self.belief = np.ones((26,2)) / 2
    else:
      self.belief = np.ones(2) / 2
    self.prior_belief = np.copy(self.belief)

    # How beliefs become action probabilities (index 0 = stand, 1 = hit):
    #   strategy 0, bold / epsilon greedy (naive):
    #       (1-gamma)*belief + gamma/2
    #   strategy 1, cautious [only state specific]:
    #       (1-p_bust)*belief
    #   any other value, mix [only state specific]:
    #       (1-gamma)*(1-p_bust)*belief + gamma/2
    # NOTE(review): p_bust() is a single scalar for the current hand, so in strategy 1 it
    # scales every entry equally and cancels in the normalization below - confirm intent.
    if (self.strategy == 0):
      self.action_probability = (1.0 - self.gamma) * self.belief + self.gamma / 2
    elif self.strategy == 1:
      self.action_probability = (1.0-self.p_bust()) * self.belief
    else: # strategy = 2
      self.action_probability = ((1.0 - self.gamma)*(1.0-self.p_bust())) * self.belief + self.gamma / 2

    # normalize probabilities to sum to 1 (per state row when state specific)
    if self.state_specific:
      self.action_probability /= np.sum(self.action_probability, axis=1)[:,None]
    else:
      self.action_probability /= np.sum(self.action_probability)

  def _log(self, message):
    """Print `message` prefixed with the bot's name, but only in verbose mode."""
    if self.verbose:
      print("[" + self.name + "] " + message)

  def _update_factor(self, reward):
    """Return the multiplicative belief update for the last sampled action.

    The reward is divided by the probability with which the action was chosen and then
    exponentiated with rate gamma/2. Returns 1.0 (no change) when hit() never sampled
    an action, e.g. because the starting hand already was 21.
    """
    if not self.has_decided:
      return 1.0
    state, action = self.last_move
    if self.state_specific:
      chosen_probability = self.action_probability[state, action]
    else:
      chosen_probability = self.action_probability[action]
    estimatedReward = reward / chosen_probability
    return math.exp(estimatedReward * self.gamma / 2)

  def p_bust(self):
    """Return the probability that one more card pushes the hand (aces as 1) over 21."""
    total_cards_in_deck = sum(self.card_totals.values())
    threshold = 21 - sum(self.hand) # a card larger than this busts the hand
    if threshold >= 10:
      return 0   # the largest card value is 10, so no card can bust us
    bust_cards_in_deck = sum(count for card_val, count in self.card_totals.items() if card_val > threshold)
    return bust_cards_in_deck/total_cards_in_deck


  def values(self):
    """Return all totals of the hand: aces as 1 first, then +10 for each further ace."""
    possible_values = [np.sum(self.hand)]
    for i in range(self.hand.count(1)):
      possible_values.append(possible_values[0] + ((i+1)*10))  # account for Ace dual value
    return tuple(possible_values)


  def deal(self, card):
    """Add a newly dealt card to the hand and remove it from the unseen pool."""
    self.hand.append(card)
    self.card_totals[card] -= 1
    self._log("Dealt card: " + str(card))
    self._log("Current hand: " + str(self.hand))


  def hit(self):
    """Decide the next action; return True to hit and False to stand.

    Stands without sampling when some total is exactly 21 or when every total is over 21.
    Otherwise samples from action_probability and records the choice in last_move.

    State encoding when state_specific (total = sum of the hand with aces as 1):
      ace in hand and the other cards sum to less than 10 -> row total + 15
      otherwise                                            -> row total - 4
    """
    hand_values = self.values()
    if 21 in hand_values: # already most optimal position
      self._log("Blackjack! I stand.")
      return False  # stand
    if min(hand_values) > 21:
      self._log("Bust! I must stand.")
      return False  # busted

    curr_state = -1  # stays -1 for the single global belief pair

    if self.state_specific:
      hard_total = np.sum(self.hand)
      # Ace present in hand. If the OTHER cards add up to more than 10 the ace can only
      # count as 1; if they add up to exactly 10 we would have auto-stood at 21 above.
      if (1 in self.hand) and (hard_total-1 < 10):
        curr_state = hard_total + 15
      else:  # no ace that can still count as 11
        curr_state = hard_total - 4
      decision = bool(np.random.choice(2, p=self.action_probability[curr_state,:]))
    else:
      decision = bool(np.random.choice(2, p=self.action_probability))

    self.last_move = (curr_state, int(decision))
    self.has_decided = True

    if decision:
      self._log("I choose to hit!")
      return True  # player decides to hit
    self._log("I choose to stand.")
    return False # player decides to stand


  def feedback(self, reward):
    """Advance the round counter and apply the reward to the belief of the last action.

    The belief array is modified in place. Nothing is changed when no action was sampled.
    """
    self.t += 1
    factor = self._update_factor(reward)
    state, action = self.last_move
    if self.state_specific:
      self.belief[state, action] *= factor
      if self.verbose:
        print(self.belief)
    else:
      self.belief[action] *= factor


  def win(self, reward):
    """Log the units won this round (does not update beliefs)."""
    self._log("Yay! I won " + str(reward) + " units!")


  def lose(self, punishment):
    """Log the net result of a lost round (does not update beliefs)."""
    self._log("Aw, I netted " + str(punishment) + " units.")


  def change_belief(self, reward):
    """Return (state, action, factor) describing the belief update for `reward`.

    The caller is expected to multiply belief[state, action] (or belief[action] when not
    state specific) by factor. The factor is 1.0 when no action was sampled this round.
    """
    return (self.last_move[0],self.last_move[1], self._update_factor(reward))

In [None]:
def bust(hand):
  """Tell whether the hand's total, counting every card at face value (ace = 1), is over 21."""
  hard_total = np.sum(hand)
  return hard_total > 21

def possible_values(hand):
  """Return every total the hand can take, as a tuple.

  The first entry counts each ace (card value 1) as 1; every following entry adds
  another 10, with one extra entry per ace in the hand.
  """
  hard_total = np.sum(hand)
  ace_count = hand.count(1)
  soft_totals = [hard_total + 10 * k for k in range(1, ace_count + 1)]  # account for Ace dual value
  return tuple([hard_total] + soft_totals)

In [None]:
# ---- game configuration ----
dealer = True    # a dealer plays after the players; its hand is the last entry of all_hands
human = False    # when True, one randomly chosen seat is played by a HumanPlayer
num_decks = 1
num_players = 5

# which persisted belief table is trained (this also selects the pickle file name below)
bot_state_specific = True
bot_strategy = 3

possible_settings = [(False, 1),] + [(True, x) for x in range(3)]

verbose = False

# ---- training: one round of blackjack per epoch, beliefs persisted after every round ----
num_epochs = 1000
for epoch in range(num_epochs):
  filename = "bj_belief_" + str(bot_strategy) + ("s" * bot_state_specific)
  # NOTE: pickle.load can execute arbitrary code; only load belief files written by this notebook.
  # The context manager closes the file even if loading fails.
  with open(filename, 'rb') as tempfile:
    # generation = how many rounds bot has been trained
    # prior_belief = trained beliefs for picking any given action
    (generation, prior_belief) = pickle.load(tempfile)

  # at least 2 if dealer is not present, otherwise 1
  if (not dealer):
    num_players = max(num_players, 2)
  else:
    num_players = max(num_players, 1)

  human_turn = np.random.choice(num_players)
  names = ["Bot " + str(i) for i in range(num_players)]
  if human:
    names[human_turn] = "Human"
  if dealer:
    names.append("Dealer")

  if verbose : print("Players: " + str(names))

  # create and shuffle deck: per deck four copies of 1-9 and sixteen cards worth 10
  deck = []
  for nd in range(num_decks):
    for i in range(1,10):
      for suit in range(4):
        deck.append(i)
    for face in range(16):
      deck.append(10)
  np.random.shuffle(deck)

  # deal initial cards
  all_hands = []  # length = num_players (+1 if dealer exists)
  for p in range(num_players):
    all_hands.append([deck.pop(), deck.pop()])

  if(dealer):
    # the dealer starts with a single card
    next_card = deck.pop()
    all_hands.append([next_card, ])

  if verbose : print("Starting Hands: " + str(all_hands))

  players = []

  for p in range(num_players):  # players go one at a time, COULD CHANGE THIS LATER?
    if verbose : print()
    curr_hand = tuple(all_hands[p])
    # every other hand as it stands right now (earlier players have already finished theirs)
    curr_opp_hands = []
    for p2 in range(len(all_hands)):
      if p2 != p:
        curr_opp_hands.append(tuple(all_hands[p2]))

    if human and (p == human_turn):
      curr_player = HumanPlayer(curr_hand,curr_opp_hands,num_decks=num_decks)
    else:
      curr_player = AutoPlayer(curr_hand,curr_opp_hands,num_decks=num_decks, state_specific=bot_state_specific, name=names[p], strategy=bot_strategy, iteration=generation, verbose=verbose, prior_belief=prior_belief)
    players.append(curr_player)

    while(curr_player.hit()):
      # deal player a card
      curr_card = deck.pop()
      all_hands[p].append(curr_card)
      curr_player.deal(curr_card)
      if bust(all_hands[p]):
        if verbose : print("[" + names[p] + "] Bust!")
        break

  # dealer's behavior, no variability: draw until the best non-busted total reaches 17
  if dealer:
    if verbose : print()
    while True:
      # Totals above 21 must not count towards the 17 threshold: with [1, 5, 10] the
      # totals are 16 and 26, and the dealer really holds 16, so it has to keep drawing.
      live_totals = [total for total in possible_values(all_hands[-1]) if total <= 21]
      if (not live_totals) or (max(live_totals) >= 17):
        break  # busted, or standing on 17 or more
      curr_card = deck.pop()
      all_hands[-1].append(curr_card)
      if verbose : print("[Dealer] Draws card: " + str(curr_card))
      if verbose : print("[Dealer] Hand: " + str(all_hands[-1]))

  if verbose : print()
  # decide who won: the highest total not above 21; ties share the win
  winners = []
  max_score = -1
  for p in range(len(all_hands)):
    for valuation in possible_values(all_hands[p]):
      if valuation > 21:  # don't want to award win to a valuation that busts
        continue
      if valuation > max_score:
        max_score = valuation
        winners = [p,]
      elif valuation == max_score:
        winners.append(p)

  # each player and dealer adds 1 unit to pool, which is split amongst winners
  # subtract one for initial investment
  if(len(winners) > 0):
    reward = (len(all_hands) * 1.0/len(winners))-1
  else:
    reward = -1  # everyone busted

  # same array object as prior_belief: the updates below modify it in place
  final_belief = prior_belief

  generation += 1

  for p in range(num_players):
    if p in winners:
      players[p].win(reward)  # maybe reward inversely proportional to number of winners?
      curr_reward = reward
    else:
      players[p].lose(-1)
      curr_reward = -1
    # only bots contribute belief updates; change is (state, action, factor)
    if (not human) or (human and (p != human_turn)):
      change = players[p].change_belief(curr_reward)
      if bot_state_specific:
        final_belief[change[0],change[1]] *= change[2]
      else:
        final_belief[change[1]]*=change[2]

  # renormalize so each action pair sums to 1 again
  if bot_state_specific:
    final_belief /= np.sum(final_belief, axis=1)[:,None]
  else:
    final_belief /= np.sum(final_belief)

  # persist the updated beliefs so the next epoch (or a later session) continues from here
  with open(filename, 'wb') as tempfile:
    pickle.dump((generation,final_belief), tempfile)

  # index num_players is the dealer's seat in all_hands
  if(num_players in winners):
    if verbose : print("[Dealer] House wins " + str(reward) + " units.")
  elif dealer:  # dealer in the game but not in winners
    if verbose : print("[Dealer] House netted -1 unit.")

In [None]:
# Load and display the persisted training state of the state-specific, strategy-3 bot.
bot_state_specific = True
bot_strategy = 3
filename = "bj_belief_" + str(bot_strategy) + ("s" * bot_state_specific)
# NOTE: pickle.load can execute arbitrary code; only load belief files written by this notebook.
# The context manager closes the file even if loading fails.
with open(filename, 'rb') as tempfile:
  # generation = how many rounds bot has been trained
  # prior_belief = trained beliefs for picking any given action
  (generation, prior_belief) = pickle.load(tempfile)
(generation, prior_belief)

(1001, array([[3.80002930e-01, 6.19997070e-01],
        [3.56217148e-02, 9.64378285e-01],
        [1.93970519e-02, 9.80602948e-01],
        [3.62329946e-02, 9.63767005e-01],
        [8.97889152e-03, 9.91021108e-01],
        [1.37575365e-05, 9.99986242e-01],
        [6.03246635e-07, 9.99999397e-01],
        [1.17075389e-13, 1.00000000e+00],
        [8.73057200e-08, 9.99999913e-01],
        [3.91958672e-14, 1.00000000e+00],
        [1.64206300e-06, 9.99998358e-01],
        [1.92224825e-08, 9.99999981e-01],
        [2.34110698e-06, 9.99997659e-01],
        [8.87136858e-06, 9.99991129e-01],
        [8.34536678e-02, 9.16546332e-01],
        [9.99999586e-01, 4.14384182e-07],
        [1.00000000e+00, 2.55781094e-28],
        [3.53109121e-01, 6.46890879e-01],
        [3.33286078e-02, 9.66671392e-01],
        [5.30042708e-02, 9.46995729e-01],
        [1.41719440e-03, 9.98582806e-01],
        [1.12841235e-01, 8.87158765e-01],
        [6.90669053e-02, 9.30933095e-01],
        [2.63626238e-02, 9.7

In [None]:
# load pickle file; `filename` is not set in this cell and must come from an earlier cell
# NOTE: pickle.load can execute arbitrary code; only load belief files written by this notebook.
# The context manager closes the file even if loading fails.
with open(filename, 'rb') as tempfile:
  # generation = how many rounds bot has been trained
  # prior_belief = trained beliefs for picking any given action
  (generation, prior_belief) = pickle.load(tempfile)
(generation, prior_belief)