In [1]:
# Implementing every-visit Monte Carlo (MC) prediction on the Blackjack game, step by step:

# Import libraries

In [2]:
# Import libraries
import gym
import pandas as pd
from collections import defaultdict

In [3]:
# Create the Blackjack environment (Gym id 'Blackjack-v1'); the episode
# generator defined later in this notebook samples from this global object.
env = gym.make('Blackjack-v1')

# Defining a policy

In [4]:
# We define the input policy whose value function will be predicted in the upcoming steps.
def policy(state):
    """Fixed policy whose value function is estimated in this notebook.

    Args:
        state: an indexable observation; only ``state[0]`` is inspected.
            In Gym's Blackjack this is the player's current sum.

    Returns:
        0 when ``state[0]`` is greater than 19, otherwise 1
        (in Gym's Blackjack, 0 means stick and 1 means hit).
    """
    player_sum = state[0]
    if player_sum > 19:
        return 0
    return 1

In [5]:
# Generate an initial state by resetting the environment.
# NOTE(review): this assumes env.reset() returns the bare observation, as in
# the output shown below; newer Gym/Gymnasium releases return
# (observation, info) instead — confirm against the installed version.

state = env.reset()
print(state)

(12, 10, False)


In [6]:
# Show which action the policy picks for the initial state generated above.
print(policy(state))

1


# Generating an episode

In [7]:
# Next, we generate an episode using the given policy.
# To do so, we define a function called generate_episode, which takes
# the policy as input and generates an episode by following it.

# First, set the maximum number of time steps allowed in one episode:
num_timestep = 100

In [8]:
def generate_episode(policy):
    """Roll out one episode in the global ``env`` by following ``policy``.

    Works with both Gym API generations: the classic one, where ``reset()``
    returns the observation and ``step()`` returns
    ``(obs, reward, done, info)``, and the newer one (Gym >= 0.26 /
    Gymnasium), where ``reset()`` returns ``(obs, info)`` and ``step()``
    returns ``(obs, reward, terminated, truncated, info)``.

    Args:
        policy: callable mapping a state to an action.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per time step, in
        the order they were experienced. The rollout stops when the
        environment signals the end of the episode or after ``num_timestep``
        steps, whichever comes first.
    """

    #let's define a list called episode for storing the episode
    episode = []

    #initialize the state by resetting the environment
    reset_result = env.reset()

    #newer APIs wrap the observation as (observation, info_dict); unwrap it so
    #the stored state is always the bare observation
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        state = reset_result[0]
    else:
        state = reset_result

    #then for each time step
    for _ in range(num_timestep):

        #select the action according to the given policy
        action = policy(state)

        #perform the action and store the next state information
        step_result = env.step(action)
        if len(step_result) == 5:
            #newer API: the episode ends on either termination or truncation
            next_state, reward, terminated, truncated, _info = step_result
            done = terminated or truncated
        else:
            #classic API: a single done flag
            next_state, reward, done, _info = step_result

        #store the state, action, reward into our episode list
        episode.append((state, action, reward))

        #If the next state is a final state then break the loop else update the next state to the current state
        if done:
            break

        state = next_state

    return episode

In [9]:
# Generate one sample episode. The output is a list of
# (state, action, reward) tuples, one per time step.
generate_episode(policy)

[((18, 6, False), 1, -1.0)]

# Computing the value function

In [10]:
# Define total_return and N as dictionaries keyed by state. They store the
# sum of the returns observed for the state and the number of times the
# state was visited across episodes, respectively. Using defaultdict means
# a state that has not been seen yet starts from 0.0 / 0 automatically.

total_return = defaultdict(float)
N = defaultdict(int)

In [11]:
# Number of episodes to sample for the Monte Carlo estimate
num_iterations = 500_000

In [12]:
# Every-visit Monte Carlo: sample episodes and accumulate per-state returns
for _ in range(num_iterations):

    # roll out one episode under the policy being evaluated
    trajectory = generate_episode(policy)

    # only the visited states and the reward sequence are needed here
    visited = [step[0] for step in trajectory]
    reward_seq = tuple(step[2] for step in trajectory)

    # every occurrence of a state in the episode contributes one sample
    for idx in range(len(visited)):
        s = visited[idx]

        # undiscounted return: sum of the rewards from this step to the end
        G = sum(reward_seq[idx:])

        # accumulate the return and the visit count for this state
        total_return[s] += G
        N[s] += 1

In [13]:
# After computing total_return and N, we can convert them into pandas data frames
# for a better understanding. [Note that this is just to give a clear understanding
# of the algorithm; we don't necessarily have to convert to pandas data frames,
# we can also implement this efficiently using just the dictionaries.]

In [14]:
# Convert the total_return dictionary to a data frame with one row per state.
# NOTE: this rebinds the name total_return from the defaultdict to a DataFrame,
# so re-running the Monte Carlo loop after this cell requires re-creating the
# dictionaries first.

total_return = pd.DataFrame(total_return.items(),columns=['state', 'total_return'])

In [15]:
# Convert the visit-counter dictionary N to a data frame with one row per state.
# NOTE: N is rebound here from the defaultdict to a DataFrame.
N = pd.DataFrame(N.items(),columns=['state', 'N'])

In [16]:
# Merge the two data frames on the shared 'state' column, so that each row
# holds a state's total return together with its visit count.
df = pd.merge(total_return, N, on="state")

In [17]:
# Take a look at the first few rows of the merged data frame:

df.head(10)

Unnamed: 0,state,total_return,N
0,"(19, 2, False)",-3987.0,5464
1,"(16, 7, False)",-3202.0,5065
2,"(17, 7, False)",-3631.0,5314
3,"(20, 10, False)",13316.0,30062
4,"(19, 7, True)",-314.0,861
5,"(15, 7, False)",-3214.0,5148
6,"(8, 3, False)",-679.0,1238
7,"(19, 3, True)",-393.0,844
8,"(19, 3, False)",-4104.0,5628
9,"(20, 8, False)",5836.0,7455


In [18]:
# The value of a state is its average return: total return divided by visit count

df['value'] = df['total_return'].div(df['N'])

In [19]:
# Look at the first few rows again, now including the new 'value' column:
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(19, 2, False)",-3987.0,5464,-0.729685
1,"(16, 7, False)",-3202.0,5065,-0.632182
2,"(17, 7, False)",-3631.0,5314,-0.683289
3,"(20, 10, False)",13316.0,30062,0.442951
4,"(19, 7, True)",-314.0,861,-0.364692
5,"(15, 7, False)",-3214.0,5148,-0.62432
6,"(8, 3, False)",-679.0,1238,-0.548465
7,"(19, 3, True)",-393.0,844,-0.46564
8,"(19, 3, False)",-4104.0,5628,-0.729211
9,"(20, 8, False)",5836.0,7455,0.78283


In [20]:
# Look up the estimated value of the state (21, 9, False)
df.loc[df['state'] == (21, 9, False), 'value'].values

array([0.9412971])

In [21]:
# Look up the estimated value of the state (16, 8, False)
df.loc[df['state'] == (16, 8, False), 'value'].values

array([-0.48636364])