In [1]:
import gym

In [2]:
# Create the Blackjack environment shared by the cells below.
# NOTE(review): no random seed is set in the visible cells, so the numbers in the
# recorded outputs will not be reproduced exactly on a re-run.
env = gym.make('Blackjack-v0')

In [4]:
# Start a new episode and display the initial state it returns.
# presumably (player total, dealer's visible card, usable ace) as in gym's Blackjack - confirm
env.reset()

(13, 10, False)

In [5]:
# Inspect the actions the environment accepts (the policy defined later returns 0 or 1).
env.action_space

Discrete(2)

## Every-visit MC prediction

In [8]:
import pandas as pd
from collections import defaultdict

In [9]:
def policy(state):
    """Fixed threshold policy evaluated by the Monte Carlo loops.

    Returns action 0 when ``state[0]`` is greater than 19, otherwise action 1.
    Presumably ``state[0]`` is the player's hand total and 0/1 mean stick/hit in
    gym's Blackjack environment - confirm against the environment docs.
    """
    if state[0] > 19:
        return 0
    return 1

In [10]:
def generate_episode(policy, num_timesteps=200):
    """Play one episode in the module-level ``env`` by following ``policy``.

    Args:
        policy: callable that maps a state to an action.
        num_timesteps: upper bound on the number of steps taken before the
            episode is cut off. Defaults to 200, the value that used to be
            hard-coded in the body.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, where
        ``state`` is the state in which ``action`` was chosen.
    """
    episode = []
    state = env.reset()

    for _ in range(num_timesteps):
        action = policy(state)
        # The fourth element returned by step() is not needed here.
        next_state, reward, done, _info = env.step(action)
        episode.append((state, action, reward))

        if done:
            break

        state = next_state

    return episode

In [11]:
# Per-state accumulators for the Monte Carlo estimate:
#   total_return[s] - running sum of returns recorded for state s (starts at 0.0)
#   N[s]            - number of times state s has been counted (starts at 0)
total_return, N = defaultdict(float), defaultdict(int)

In [12]:
# Number of episodes sampled by each Monte Carlo loop.
num_iterations = 500_000

In [13]:
# Every-visit MC: every occurrence of a state in an episode contributes the
# return that followed it to that state's running total and visit count.
for episode_idx in range(num_iterations):
    trajectory = generate_episode(policy)
    visited, _actions, step_rewards = zip(*trajectory)
    for step, s in enumerate(visited):
        # Undiscounted return: sum of rewards from this step to the episode's end.
        G = sum(step_rewards[step:])
        total_return[s] += G
        N[s] += 1

In [14]:
# Turn each accumulator dict into a two-column table keyed by state.
# NOTE(review): this rebinds `total_return` and `N` from dicts to DataFrames, so
# neither this cell nor the accumulation loop can be re-run without first
# re-creating the dicts.
total_return = pd.DataFrame(list(total_return.items()), columns=['state', 'total_return'])
N = pd.DataFrame(list(N.items()), columns=['state', 'N'])

In [15]:
# Join the summed returns and the visit counts on their shared 'state' column.
df = total_return.merge(N, on='state')

In [16]:
# Preview the first 10 rows of the per-state totals and visit counts.
df.head(10)

Unnamed: 0,state,total_return,N
0,"(12, 2, False)",-2482.0,4574
1,"(17, 2, False)",-3642.0,5269
2,"(21, 8, True)",2216.0,2366
3,"(16, 1, False)",-3753.0,5148
4,"(13, 10, False)",-11442.0,18902
5,"(17, 10, False)",-14852.0,21144
6,"(20, 2, False)",4816.0,7459
7,"(13, 7, True)",-131.0,478
8,"(12, 7, False)",-2289.0,4550
9,"(20, 10, False)",13083.0,30016


In [17]:
# Estimated value of each state: average return = summed return / visit count.
df['value'] = df['total_return'].div(df['N'])

In [18]:
# Preview the first 10 rows again, now including the 'value' column.
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(12, 2, False)",-2482.0,4574,-0.542632
1,"(17, 2, False)",-3642.0,5269,-0.691213
2,"(21, 8, True)",2216.0,2366,0.936602
3,"(16, 1, False)",-3753.0,5148,-0.729021
4,"(13, 10, False)",-11442.0,18902,-0.605333
5,"(17, 10, False)",-14852.0,21144,-0.702421
6,"(20, 2, False)",4816.0,7459,0.645663
7,"(13, 7, True)",-131.0,478,-0.274059
8,"(12, 7, False)",-2289.0,4550,-0.503077
9,"(20, 10, False)",13083.0,30016,0.435868


In [19]:
# Estimated value of the state (21, 9, False).
df.loc[df['state'] == (21, 9, False), 'value'].values

array([0.94746767])

In [20]:
# Estimated value of the state (5, 8, False).
df.loc[df['state'] == (5, 8, False), 'value'].values

array([-0.48491379])

## First-visit MC prediction

In [23]:
# First-visit MC: start again from empty accumulators, then count only the
# first occurrence of each state within an episode.
total_return = defaultdict(float)
N = defaultdict(int)

for episode_idx in range(num_iterations):
    trajectory = generate_episode(policy)
    visited, _actions, step_rewards = zip(*trajectory)
    seen = set()  # states already counted in this episode
    for step, s in enumerate(visited):
        if s in seen:
            continue
        seen.add(s)
        # Undiscounted return: sum of rewards from this step to the episode's end.
        G = sum(step_rewards[step:])
        total_return[s] += G
        N[s] += 1