In [1]:
import numpy as np
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv

In [11]:
# Single Blackjack environment instance shared by the evaluation cells below
# (they call env.reset() and env.step()).
env = BlackjackEnv()

17

In [31]:
# Functions
def print_observation(observation):
    """Print a one-line, human-readable summary of an observation.

    Args:
        observation: a 3-element sequence unpacked as
            (player score, dealer score, usable-ace flag).

    Returns:
        None. The summary is written to stdout.
    """
    player_total, dealer_showing, has_usable_ace = observation
    message = "Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
        player_total, has_usable_ace, dealer_showing)
    print(message)

def strategy(observation):
    """Fixed policy being evaluated: stick on 20 or more, hit otherwise.

    Args:
        observation: a 3-element sequence unpacked as
            (player score, dealer score, usable-ace flag). Only the player
            score influences the decision.

    Returns:
        0 (stick) when the player score is >= 20, else 1 (hit).
    """
    player_total, _dealer_showing, _has_usable_ace = observation
    if player_total >= 20:
        return 0
    return 1

def calc_val(state_dict):
    """Fill in the value estimate of every counted state, in place.

    Each entry of ``state_dict`` is a 4-element list; slot 3 is overwritten
    with slot 1 divided by slot 0 (total return / counter). Entries whose
    counter is zero are left untouched, which also avoids dividing by zero.

    Args:
        state_dict: mapping from state key to a mutable 4-element list.

    Returns:
        None. ``state_dict`` entries are mutated.
    """
    for stats in state_dict.values():
        visit_count = stats[0]
        if visit_count == 0:
            continue
        stats[3] = stats[1] / visit_count

In [37]:
# Per-state statistics table, keyed by (player sum, dealer card, usable ace).
# Each entry is a fresh list: [counter, total return, visited flag, value].
state_dict = {}
usable_a = [False, True]

# The loop variable is deliberately not called `sum`: at notebook top level
# that would rebind the builtin sum() for the rest of the kernel session.
for player_sum in range(12, 22):
    for dealer_card in range(1, 11):
        for ace in usable_a:
            state_dict[(player_sum, dealer_card, ace)] = [0, 0, 0, 0]

In [24]:
# First Visit Monte-Carlo Policy Evaluation
#
# Plays 200 episodes with `strategy`. Within an episode, only the first
# occurrence of a state bumps its counter; slot 2 of the state's entry is used
# as a temporary "already seen this episode" flag and is cleared afterwards.
# Statistics accumulate into the shared `state_dict`.

for i_episode in range(200):
    observation = env.reset()
    states_visited = []  # states first seen in this episode, in order

    for t in range(100):
        player_score, dealer_score, useable_ace = observation
        state = (player_score, dealer_score, useable_ace)
        stats = state_dict[state]

        if stats[2] == 0:
            # First time this state shows up in the current episode.
            stats[0] += 1
            states_visited.append(state)
            stats[2] = 1  # mark as visited for the episode

        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))

        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))
            break

    # Credit the episode's last reward to every state counted above and clear
    # the per-episode flag so the next episode starts clean.
    for state in states_visited:
        stats = state_dict[state]
        stats[1] += reward
        stats[2] = 0

Player Score: 17 (Usable Ace: False), Dealer Score: 8
Taking action: Hit
Player Score: 20 (Usable Ace: False), Dealer Score: 8
Taking action: Stick
Player Score: 20 (Usable Ace: False), Dealer Score: 8
Game end. Reward: 1.0

Player Score: 16 (Usable Ace: False), Dealer Score: 9
Taking action: Hit
Player Score: 18 (Usable Ace: False), Dealer Score: 9
Taking action: Hit
Player Score: 23 (Usable Ace: False), Dealer Score: 9
Game end. Reward: -1.0

Player Score: 21 (Usable Ace: True), Dealer Score: 5
Taking action: Stick
Player Score: 21 (Usable Ace: True), Dealer Score: 5
Game end. Reward: 1.0

Player Score: 16 (Usable Ace: False), Dealer Score: 8
Taking action: Hit
Player Score: 19 (Usable Ace: False), Dealer Score: 8
Taking action: Hit
Player Score: 29 (Usable Ace: False), Dealer Score: 8
Game end. Reward: -1.0

Player Score: 20 (Usable Ace: False), Dealer Score: 10
Taking action: Stick
Player Score: 20 (Usable Ace: False), Dealer Score: 10
Game end. Reward: 1.0

Player Score: 14 (Usabl

In [41]:
# Every visit Monte-Carlo Policy Evaluation
#
# Plays 200 episodes with `strategy`. Every occurrence of a state bumps its
# counter (slot 0) AND adds the episode's return to its total (slot 1), so the
# average computed later (total / counter) stays consistent.
# Statistics accumulate into the shared `state_dict`; re-run the cell that
# builds `state_dict` first if a fresh table is wanted.

for i_episode in range(200):
    observation = env.reset()
    states_visited = []  # every state occurrence in this episode, in order

    for time_step in range(200):
        player_score = observation[0]
        dealer_score = observation[1]
        useable_ace = observation[2]
        state = (player_score, dealer_score, useable_ace)

        state_dict[state][0] += 1
        states_visited.append(state)

        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))

        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))
            break

    # Credit the return once per visit, not once per distinct state: the
    # counter above was incremented per visit, and de-duplicating here would
    # make a repeated state's average too small in magnitude.
    # NOTE(review): the last reward is used as the return for every visit;
    # this assumes intermediate rewards are zero and no discounting - confirm
    # against BlackjackEnv.
    for state in states_visited:
        state_dict[state][1] += reward

Player Score: 21 (Usable Ace: True), Dealer Score: 9
Taking action: Stick
Player Score: 21 (Usable Ace: True), Dealer Score: 9
Game end. Reward: 1.0

Player Score: 20 (Usable Ace: False), Dealer Score: 2
Taking action: Stick
Player Score: 20 (Usable Ace: False), Dealer Score: 2
Game end. Reward: 1.0

Player Score: 20 (Usable Ace: False), Dealer Score: 3
Taking action: Stick
Player Score: 20 (Usable Ace: False), Dealer Score: 3
Game end. Reward: 1.0

Player Score: 12 (Usable Ace: False), Dealer Score: 7
Taking action: Hit
Player Score: 17 (Usable Ace: False), Dealer Score: 7
Taking action: Hit
Player Score: 22 (Usable Ace: False), Dealer Score: 7
Game end. Reward: -1.0

Player Score: 17 (Usable Ace: False), Dealer Score: 3
Taking action: Hit
Player Score: 27 (Usable Ace: False), Dealer Score: 3
Game end. Reward: -1.0

Player Score: 19 (Usable Ace: False), Dealer Score: 2
Taking action: Hit
Player Score: 29 (Usable Ace: False), Dealer Score: 2
Game end. Reward: -1.0

Player Score: 12 (Us

In [43]:
# Turn the accumulated counters and total returns into per-state value
# estimates; results are written into slot 3 of each state_dict entry.
calc_val(state_dict)