In [1]:
import numpy as np

In [6]:
def monte_carlo_q_value_estimate(env, episodes=1000, discount_factor=1.0, epsilon=0.1):
    """Estimate action values with epsilon-greedy Monte Carlo control.

    Runs ``episodes`` episodes against ``env``; after each one the collected
    trajectory is handed to ``monte_carlo_first_visit_update`` to refresh the
    estimates that drive the next episode's behaviour policy.

    Parameters
    ----------
    env : object
        Environment exposing ``nS``, ``nA``, ``reset()`` (returning the
        initial state), ``step(action)`` (returning a 4-tuple
        ``(next_state, reward, done, info)``) and ``action_space.sample()``.
        States are used directly as row indices into the Q table.
    episodes : int
        Number of episodes to run.
    discount_factor : float
        Discount forwarded to the return computation.
    epsilon : float
        Probability of taking a uniformly sampled action instead of a greedy one.

    Returns
    -------
    (q_value_array, q_return_array)
        ``q_value_array`` has shape ``(nS, nA)``. ``q_return_array`` has shape
        ``(nS, nA, 2)`` and is the bookkeeping array maintained by the update
        routine.
    """
    state_size = env.nS
    action_size = env.nA
    # An episode is cut off once it has taken more than this many steps.
    max_timesteps = 100
    q_value_array = np.zeros((state_size, action_size))
    q_return_array = np.zeros((state_size, action_size, 2))

    for _ in range(episodes):
        current_state = env.reset()
        trajectory_list = []
        timesteps = 0
        done = False

        while not done:
            if np.random.rand() < epsilon:
                # Explore: uniformly random action.
                eg_action = env.action_space.sample()
            else:
                # Exploit: break ties between equally good actions at random
                # so the all-zero initial table does not always pick action 0.
                action_values = q_value_array[current_state]
                greedy_indices = np.flatnonzero(action_values == action_values.max())
                eg_action = int(np.random.choice(greedy_indices))

            next_state, rew, done, info = env.step(eg_action)
            # Stored as (state, reward, action): the order the update routine unpacks.
            trajectory_list.append((current_state, rew, eg_action))

            timesteps += 1
            if timesteps > max_timesteps:
                done = True
            current_state = next_state

        q_value_array, q_return_array = monte_carlo_first_visit_update(
            q_value_array, q_return_array, trajectory_list, discount_factor
        )

    # Return only after every requested episode has been processed.
    return q_value_array, q_return_array

In [4]:
def monte_carlo_first_visit_update(q_values, q_returns, traj, discount=1.):
    """Apply a first-visit Monte Carlo update from one finished episode.

    Parameters
    ----------
    q_values : indexable as ``q_values[state][action]``
        Current action-value estimates; updated in place.
    q_returns : indexable as ``q_returns[state][action][k]``
        Per-pair bookkeeping; slot 0 holds the running mean return and slot 1
        the number of episodes in which the pair was visited. Updated in place.
    traj : sequence of ``(state, reward, action)`` tuples
        The episode in chronological order.
    discount : float
        Discount applied to the return at each step backwards.

    Returns
    -------
    (q_values, q_returns)
        The same two objects that were passed in, after the update.
    """
    # Record the earliest time step of every (state, action) pair so the
    # backward sweep below credits the return from the *first* visit only.
    first_visit_index = {}
    for t, (state, _reward, action) in enumerate(traj):
        first_visit_index.setdefault((state, action), t)

    g_return = 0.0
    # Sweep backwards so the return accumulates in a single pass.
    for t in range(len(traj) - 1, -1, -1):
        state, reward, action = traj[t]
        g_return = discount * g_return + reward
        if first_visit_index[(state, action)] != t:
            continue
        stats = q_returns[state][action]
        stats[1] += 1
        # Incremental mean: new_mean = old_mean + (G - old_mean) / n.
        stats[0] += (g_return - stats[0]) / stats[1]
        q_values[state][action] = stats[0]
    return q_values, q_returns
                                           