Let's implement the first-visit MC prediction with the blackjack game step by step:

In [1]:
# Import the necessary libraries

import gym
import pandas as pd
from collections import defaultdict

In [2]:
# Create a blackjack environment:
env = gym.make('Blackjack-v1')

Defining a policy

In [3]:
# We define the input policy whose value function will be predicted in the upcoming steps.
def policy(state):
    """Return action 0 when the first component of ``state`` is above 19, else action 1.

    In the Blackjack environment the first component of the state is the
    player's current sum, so this policy stops drawing on 20 or 21 and keeps
    drawing on anything lower.
    """
    player_sum = state[0]
    if player_sum > 19:
        return 0
    return 1

In [4]:
# Let's generate an initial state by resetting the environment as shown below:
# NOTE(review): this assumes env.reset() returns just the observation (as the printed
# 3-tuple below suggests); newer Gym/Gymnasium releases return (observation, info) --
# confirm against the installed version.

state = env.reset()
print(state)

(17, 3, False)


In [5]:
print(policy(state))

1


Generating an episode

In [6]:
# First, let's set the maximum number of time steps per episode
# (generate_episode stops earlier as soon as the episode is done):
num_timestep = 100

In [8]:
# We generate an episode using the given policy, so we define a function called "generate_episode"
# which takes the policy as an input and generates the episode using the given policy.
def generate_episode(policy):
    """Roll out one episode in the global ``env`` by following ``policy``.

    Args:
        policy: callable that maps a state to an action.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, in
        the order they occurred. The rollout ends when the environment reports
        the episode is over, or after ``num_timestep`` steps, whichever comes
        first.

    Both Gym calling conventions are accepted: the older one, where
    ``reset()`` returns the observation and ``step()`` returns
    ``(obs, reward, done, info)``, and the newer one, where ``reset()`` returns
    ``(obs, info)`` and ``step()`` returns
    ``(obs, reward, terminated, truncated, info)``.
    """

    #let's define a list called episode for storing the episode
    episode = []

    #initialize the state by resetting the environment
    reset_result = env.reset()

    #the newer API returns (observation, info); a 2-tuple whose second item is a
    #dict is treated as that form, anything else is taken to be the observation itself
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        state = reset_result[0]
    else:
        state = reset_result

    #then for each time step
    for i in range(num_timestep):

        #select the action according to the given policy
        action = policy(state)

        #perform the action and store the next state information
        step_result = env.step(action)
        if len(step_result) == 5:
            #newer API: the episode is over when it is terminated or truncated
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, info = step_result

        #store the state, action, reward into our episode list
        episode.append((state, action, reward))

        #If the next state is a final state then break the loop
        if done:
            break
        # else update the next state to the current state
        state = next_state

    return episode

In [9]:
# Let's take a look at what the output of our generate_episode function looks like.
# Note that we generate the episode using the policy we defined earlier:
generate_episode(policy)

[((18, 2, True), 1, 0.0), ((21, 2, True), 0, 1.0)]

Computing the value function

In [10]:
# First, we define the total_return and N as a dictionary 
# for storing the total return and the number of times 
# the state is visited across episodes respectively.
total_return = defaultdict(float)
N = defaultdict(int)


In [11]:
# Set the number of iterations, i.e. how many episodes are generated and averaged over:
num_iterations = 10000

In [13]:
#Repeat for the requested number of episodes
for i in range(num_iterations):

    #sample one episode by following the given policy
    episode = generate_episode(policy)

    #split the episode into parallel sequences of states, actions and rewards
    states, actions, rewards = zip(*episode)

    #walk through the episode one time step at a time
    for t in range(len(states)):
        state = states[t]

        #only the first visit to a state within an episode counts, so skip
        #the state if it already occurred at an earlier step
        if state in states[:t]:
            continue

        #the return R of the state is the sum of the rewards from step t onwards
        R = sum(rewards[t:])

        #add this return to the running total for the state
        total_return[state] += R

        #count one more episode in which the state was visited
        N[state] += 1

In [14]:
# After computing the total_return and N, we can just convert them
# into pandas data frames for a better understanding.

# Turn the total_return dictionary into a two-column data frame (state, total_return).
# Note: this rebinds the name total_return from the dictionary to the data frame.
total_return = pd.DataFrame(
    list(total_return.items()),
    columns=['state', 'total_return'],
)

In [15]:
# Turn the counter dictionary N into a two-column data frame (state, N).
# Note: this rebinds the name N from the dictionary to the data frame.
N = pd.DataFrame(
    list(N.items()),
    columns=['state', 'N'],
)

In [16]:
# Join the two data frames on their shared "state" column:
df = total_return.merge(N, on="state")

In [18]:
# Take a look at the first few rows of the data frame:
df.head(10)

Unnamed: 0,state,total_return,N
0,"(15, 8, False)",-137.0,223
1,"(17, 8, False)",-132.0,206
2,"(20, 10, False)",533.0,1184
3,"(19, 1, True)",-21.0,29
4,"(19, 1, False)",-179.0,233
5,"(17, 5, False)",-142.0,197
6,"(21, 5, False)",125.0,138
7,"(20, 2, False)",212.0,320
8,"(12, 7, False)",-93.0,190
9,"(15, 7, False)",-134.0,206


As we can observe from above, we have the total return and the number of times the state is visited.

In [19]:
# Next, we compute the value of each state as its average return,
# i.e. the total return divided by the number of visits:
df = df.assign(value=df['total_return'] / df['N'])

In [20]:
# Take a look at the first few rows of the data frame:
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(15, 8, False)",-137.0,223,-0.61435
1,"(17, 8, False)",-132.0,206,-0.640777
2,"(20, 10, False)",533.0,1184,0.450169
3,"(19, 1, True)",-21.0,29,-0.724138
4,"(19, 1, False)",-179.0,233,-0.76824
5,"(17, 5, False)",-142.0,197,-0.720812
6,"(21, 5, False)",125.0,138,0.905797
7,"(20, 2, False)",212.0,320,0.6625
8,"(12, 7, False)",-93.0,190,-0.489474
9,"(15, 7, False)",-134.0,206,-0.650485


As we can observe, we now have the value of each state, which is just the average return of the state across several episodes. Thus, we have successfully predicted the value function of the given policy using the first-visit MC method.

In [21]:
# Let's look up the estimated value of the state (19,1,False)
df.loc[df['state'] == (19, 1, False), 'value'].values

array([-0.76824034])