In [1]:
import numpy as np

In [116]:
# MC with exploring starts or e-soft policy
reward_map = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9, 10:10, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0}

# Exploring starts is bad because it commits to a greedy action at a state once it has given a positive return
def exploring_starts(Q, s):
  a_star = np.argmax(Q[s])
  return np.array([1 if ac == a_star else 0 for ac in np.arange(2)])

# Epsilon soft is much more reliable to converge because it doesn't hard-commit to any action, allowing it
# to explore other possibilities.
# The problem is, as seen in bandits, it never selects the optimal action every time at any state.
def e_soft(Q, s):
  eps = 0.1
  actions = len(Q[s])

  a_star = np.argmax(Q[s])
  return np.array([1 - eps + eps/actions if ac == a_star else eps/actions for ac in np.arange(2)])

# Initialize
Q = np.zeros((11,2))
p = np.full((11,2), 0.5)
returns = [[[] for a in range(2)] for s in range(11)]

policy_update = e_soft

# Loop
T = 1000
for t in range(T):
  # Generate episode
  episode = []
  s = 0
  while s <= 10:
    s_old = s
    a = np.random.choice(np.arange(len(p[s])), p=p[s])
    roll = np.random.choice(np.arange(1,7))
    s += roll if a == 1 else 0
    r = reward_map[s] - reward_map[s_old]
    episode.append((s_old, a, r, s))
    if a == 0:
      break
    
  # Update returns
  i = len(episode) - 1
  G = 0
  while i >= 0:
    si, ai, ri = episode[i][0:3]
    G += ri
    returns[si][ai].append(G)
    Q[si,ai] = np.average(returns[si][ai])
    i -= 1
  
  # Update policy
  for ep in episode:
    p[ep[0]] = policy_update(Q, ep[0])

In [117]:
p

array([[0.05, 0.95],
       [0.05, 0.95],
       [0.05, 0.95],
       [0.05, 0.95],
       [0.05, 0.95],
       [0.05, 0.95],
       [0.95, 0.05],
       [0.95, 0.05],
       [0.95, 0.05],
       [0.95, 0.05],
       [0.95, 0.05]])

In [118]:
Q

array([[  0.        ,   6.3875    ],
       [  0.        ,   5.75      ],
       [  0.        ,   4.58659218],
       [  0.        ,   3.99456522],
       [  0.        ,   3.52534562],
       [  0.        ,   1.63865546],
       [  0.        ,  -0.23076923],
       [  0.        ,  -2.36363636],
       [  0.        ,  -6.71428571],
       [  0.        ,  -5.        ],
       [  0.        , -10.        ]])

In [150]:
# Off-policy MC with weight importance sampling
reward_map = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9, 10:10, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0}

# Initialize
Q = np.zeros((11,2))
C = np.zeros((11,2))
p = np.array([[0.3, 0.7] for i in range(11)])
u = np.full((11,2), 0.5)

# Loop
T = 1000
delta = 1
for t in range(T):
  # Generate episode
  episode = []
  s = 0
  while s <= 10:
    s_old = s
    a = np.random.choice(np.arange(len(u[s])), p=u[s])
    roll = np.random.choice(np.arange(1,7))
    s += roll if a == 1 else 0
    r = reward_map[s] - reward_map[s_old]
    episode.append((s_old, a, r, s))
    if a == 0:
      break
    
  # Update returns
  i = len(episode) - 1
  G = 0
  W = 1
  while i >= 0:
    si, ai, ri = episode[i][0:3]

    G = ri + delta*G
    C[si, ai] += W
    Q[si, ai] += (W/C[si,ai]) * (G - Q[si,ai])
    p[si] = [1 if a == np.argmax(Q[si]) else 0 for a in range(2)]
    W /= u[si, ai]
    if W == 0:
      break
    
    i -= 1

In [151]:
p

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [152]:
Q

array([[  0.        ,   4.64964249],
       [  0.        ,   3.74157303],
       [  0.        ,   3.45      ],
       [  0.        ,   3.06779661],
       [  0.        ,   0.69333333],
       [  0.        ,   0.45864662],
       [  0.        ,  -1.63636364],
       [  0.        ,  -2.69230769],
       [  0.        ,  -4.48648649],
       [  0.        ,  -5.77419355],
       [  0.        , -10.        ]])

In [49]:
# Ignore: This doesn't work
# Trying to implement MC for blackjack but the rules for BJ are so complex that I messed up the design

cards = ["A", "2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K"]
cards_map = {"A":1, "2":2, "3":3, "4":4, "5":5, "6":6, "7":7, "8":8, "9":9, "10":10, "J":10, "Q":10, "K":10}

def draw(n):
  return np.random.choice(cards, size=n)

def make_state(hand, dealer):
  ace = False
  hand_sum = np.sum([cards_map[card] for card in hand])
  if "A" in hand and hand_sum + 10 <= 21:
    ace = True
  if "A" in hand and hand_sum + 10 == 21:
    hand_sum = 21

  dealer_sum = np.sum([cards_map[card] for card in dealer])
  if "A" in dealer and dealer_sum + 10 == 21:
    dealer_sum = 21
  
  return hash(str(hand_sum) + "|" + str(ace*1) + "|" + dealer[0]), hand_sum, dealer_sum

# Initialize
Q = {}
p = {}
returns = {}

# Loop
for t in range(1):
  # Generate episode
  episode = []

  hand = draw(2)
  dealer = draw(2)

  s, hand_sum, dealer_sum = make_state(hand, dealer)
  if s not in p:
    p[s] = [0.5, 0.5]

  a_old, r, stop = 1, 0, False

  while (hand_sum < 21) and (dealer_sum < 21) and not stop:
    s_old = s
    a = np.random.choice(np.arange(len(p[s])), p=p[s]) if a_old == 1 else 0
    dealer_a = 1 if dealer_sum < 17 or ("A" in dealer and dealer_sum + 10 < 17) else 0

    print(hand, a, dealer, dealer_a)

    hand = np.append(hand, draw(1)) if a == 1 else hand
    dealer = np.append(dealer, draw(1)) if dealer_a == 1 else dealer
      
    s, hand_sum, dealer_sum = make_state(hand, dealer)
    if s not in p:
      p[s] = [0.5, 0.5]

    r += 1 if dealer_sum > 21 else 0
    r -= 1 if hand_sum > 21 else 0
    if a == 0 and dealer_a == 0:
      stop = True
      if hand_sum > dealer_sum:
        r += 1
      if hand_sum < dealer_sum:
        r -= 1
    
    episode.append((s_old, a, r, s))
    a_old = a
    if stop:
      break

  print(hand, a, dealer, dealer_a)

  # Average returns
  for s, a, r, sp in episode:
    pass

['8' '9'] 0 ['4' 'J'] 1
['8' '9'] 0 ['4' 'J' '8'] 1


In [17]:
a = np.array([1,2,3])
np.append(a, [1])
draw(1)

array(['Q'], dtype='<U2')

In [51]:
hash("hi")

251455801643063389

In [52]:
state_hash

0

In [1]:
a = {}
a["hi"] = []
a

{'hi': []}

In [4]:
"".join(sorted("agfh"))

'afgh'

In [10]:
if "hi" in a:
  print('true')

TypeError: ignored