## GET FRIENDLY WITH BLACKJACK

In [1]:
import numpy as np
import sys
# Add the local ./env directory (relative to the working directory) to the
# module search path; presumably this is where the `blackjack` module imported
# on the next line lives.
sys.path.append('./env')
from blackjack import BlackjackEnv
# Confirmation message printed once every import above has succeeded.
print ("PACKAGES LOADED.")

PACKAGES LOADED.


## INITIALIZE BLACKJACK ENVIRONMENT

In [2]:
# Create the Blackjack environment used by the cells below and report
# the value of its `nA` attribute as the number of actions.
env = BlackjackEnv()
print("BLACKJACK INITIALIZED.")
print("#ACTION: %d" % env.nA)

BLACKJACK INITIALIZED.
#ACTION: 2


## PRINT FUNCTIONS

In [3]:
def print_observation(observation):
    """Print a one-line summary of a Blackjack observation.

    Args:
        observation: A 3-element sequence unpacked as
            (player score, dealer value, usable-ace flag).

    The dealer value is printed last, after the usable-ace flag, even though
    it is the second element of the observation.
    """
    player_sum, dealer_value, has_usable_ace = observation
    template = "Player Score: [%d] (Usable Ace: [%s]), Dealer Score: %d"
    print(template % (player_sum, has_usable_ace, dealer_value))

def print_reward(reward):
    """Print the outcome of a finished episode together with its reward.

    Args:
        reward: Numeric reward of the episode. A value equal to +1 is
            reported as a win, 0 as a draw and -1 as a loss; any other
            value is reported with the label 'WTF'.

    The reward is compared by value (``==``) rather than by identity
    (``is``). Identity comparison against integer literals only works by
    accident of small-integer caching and fails for numerically equal
    values of another type, such as the float ``1.0``.
    """
    if reward == 1:
        rstr = 'Wins'
    elif reward == 0:
        rstr = 'Draws'
    elif reward == -1:
        rstr = 'Loses'
    else:
        rstr = 'WTF'
    print ("Player [%s]. Get [%+d] reward." % (rstr, reward))
    
def strategy(observation):
    """Choose an action with a fixed score-threshold policy.

    Args:
        observation: A 3-element sequence whose first element is the
            player's score; the other two elements are ignored.

    Returns:
        0 (stick) when the player's score is 20 or more, 1 (hit) otherwise.
    """
    player_sum, _dealer_value, _usable_ace = observation
    if player_sum >= 20:
        return 0
    return 1

## RUN BLACKJACK

- Blackjack is a card game where the goal is to obtain cards that sum to as near as possible to 21 without going over.  The player plays against a fixed dealer.
- Face cards (Jack, Queen, King) have point value 10. 
- Aces can count as either 11 or 1; an ace counted as 11 is called 'usable'.
- This game is played with an infinite deck (or with replacement).
- The game starts with each (player and dealer) having one face up and one face down card.
- The player can request additional cards (hit=1) until they decide to stop (stick=0) or exceed 21 (bust).
- After the player sticks, the dealer reveals their facedown card, and draws until their sum is 17 or greater.  If the dealer goes bust the player wins. 
- If neither player nor dealer busts, the outcome (win, lose, draw) is decided by whose sum is closer to 21.  
- The reward for winning is +1, drawing is 0, and losing is -1.
- The observation is a 3-tuple of: the player's current sum, the dealer's one showing card (1-10 where 1 is ace), and whether or not the player holds a usable ace (0 or 1).
- This environment corresponds to the version of the blackjack problem described in Example 5.1 in Reinforcement Learning: An Introduction by Sutton and Barto (1998).
https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html

## ROLLOUT

In [5]:
# Play a fixed number of episodes with the policy in `strategy`,
# printing every observation, the action taken and the final reward.
ntotal_episode = 10
for i_episode in range(ntotal_episode):
    print ('================[%d-ROUND]================' % (i_episode))
    # Start a new episode and show the opening observation.
    observation = env.reset()
    print_observation(observation)
    # At most 100 steps per episode; the loop exits early once `done` is set.
    for t in range(100):
        # Pick the action for the current observation and announce it.
        action = strategy(observation)
        print("Taking action: [%s]" % (("Stick", "Hit")[action]))

        # Advance the environment by one step and show the new observation.
        observation, reward, done, _ = env.step(action)
        print_observation(observation)

        # Keep playing until the environment reports the episode is over.
        if not done:
            continue
        print_reward(reward)
        break

Player Score: [16] (Usable Ace: [False]), Dealer Score: 9
Taking action: [Hit]
Player Score: [22] (Usable Ace: [False]), Dealer Score: 9
Player [Loses]. Get [-1] reward.
Player Score: [21] (Usable Ace: [True]), Dealer Score: 2
Taking action: [Stick]
Player Score: [21] (Usable Ace: [True]), Dealer Score: 2
Player [Wins]. Get [+1] reward.
Player Score: [20] (Usable Ace: [False]), Dealer Score: 5
Taking action: [Stick]
Player Score: [20] (Usable Ace: [False]), Dealer Score: 5
Player [Wins]. Get [+1] reward.
Player Score: [21] (Usable Ace: [True]), Dealer Score: 6
Taking action: [Stick]
Player Score: [21] (Usable Ace: [True]), Dealer Score: 6
Player [Wins]. Get [+1] reward.
Player Score: [21] (Usable Ace: [True]), Dealer Score: 9
Taking action: [Stick]
Player Score: [21] (Usable Ace: [True]), Dealer Score: 9
Player [Wins]. Get [+1] reward.
Player Score: [16] (Usable Ace: [False]), Dealer Score: 8
Taking action: [Hit]
Player Score: [21] (Usable Ace: [False]), Dealer Score: 8
Taking action: 