In [4]:
import io
import numpy as np
import sys
from gym.envs.toy_text import discrete
import pprint

In [2]:
# Integer codes for the four compass moves, numbered clockwise starting at UP.
# NOTE(review): nothing in the visible cells references these; presumably they
# are action indices for the grid environment -- confirm against the env code.
UP, RIGHT, DOWN, LEFT = range(4)

In [5]:
def choose_action(env, q_table, state, epsilon=0.1):
    """Select an action epsilon-greedily with respect to ``q_table``.

    One uniform number is drawn from NumPy's global RNG. If it is below
    ``epsilon`` the action comes from ``env.action_space.sample()``;
    otherwise the index of the largest entry of ``q_table[state]`` is
    returned (``np.argmax`` yields the first maximal index on ties).

    Args:
        env: Object exposing ``action_space.sample()``; only touched on the
            exploration branch.
        q_table: Indexable by ``state``, giving the action values of that state.
        state: Index of the state to act in.
        epsilon: Exploration threshold compared against the uniform draw.

    Returns:
        The sampled action when exploring, else the greedy action index.
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
        return np.argmax(q_table[state])
    return env.action_space.sample()

In [7]:
def sarsa_q_value_estimate(env, episodes=1000, alpha=0.05, discount_factor=1.0, epsilon=0.1):
    """Estimate action values with tabular SARSA under an epsilon-greedy policy.

    After every environment step the entry for the (state, action) pair just
    taken is moved towards ``reward + discount_factor * Q[next_state,
    next_action]``, where ``next_action`` is chosen by ``choose_action`` in
    the successor state and is then the action actually executed next.

    Args:
        env: Environment exposing ``nS``, ``nA``, ``reset()`` returning the
            start state, and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        episodes: Number of episodes (runs until ``done``) to learn from.
        alpha: Step size applied to the temporal-difference error.
        discount_factor: Weight of the bootstrapped successor value.
        epsilon: Exploration parameter forwarded to ``choose_action``.

    Returns:
        Array of shape ``(env.nS, env.nA)`` holding the learned action values.
    """
    q_value_array = np.zeros((env.nS, env.nA))

    current_state = env.reset()
    eg_action = choose_action(env, q_value_array, current_state, epsilon)

    current_episode = 0
    while current_episode < episodes:
        next_state, rew, done, info = env.step(eg_action)

        # SARSA bootstraps from the action the policy picks in the *successor*
        # state. The previous code omitted ``env`` (shifting every argument by
        # one, which raised on the first step) and passed ``current_state``.
        next_action = choose_action(env, q_value_array, next_state, epsilon)

        td_error = (rew
                    + discount_factor * q_value_array[next_state, next_action]
                    - q_value_array[current_state, eg_action])
        q_value_array[current_state, eg_action] += alpha * td_error

        if done:
            # Start a fresh episode and pick its first action.
            current_state = env.reset()
            eg_action = choose_action(env, q_value_array, current_state, epsilon)
            current_episode += 1
        else:
            # Commit to the action that was used in the update (on-policy).
            current_state = next_state
            eg_action = next_action

    return q_value_array
        
        

In [8]:
def sarse_q_value_estimate(env, episodes=1000, alpha=0.05, discount_factor=1.0, epsilon=0.1):
    """Estimate action values with tabular Expected SARSA.

    Actions are taken epsilon-greedily. Each update bootstraps from the
    expectation of the successor state's action values under that same
    epsilon-greedy distribution (``epsilon / nA`` on every action plus
    ``1 - epsilon`` on the greedy one), not from a single sampled action.

    Args:
        env: Environment exposing ``nS``, ``nA``, ``action_space.sample()``,
            ``reset()`` returning the start state, and ``step(action)``
            returning ``(next_state, reward, done, info)``.
        episodes: Number of episodes (runs until ``done``) to learn from.
        alpha: Step size applied to the temporal-difference error.
        discount_factor: Weight of the expected successor value.
        epsilon: Exploration probability of the epsilon-greedy policy.

    Returns:
        Array of shape ``(env.nS, env.nA)`` holding the learned action values.
    """
    n_states, n_actions = env.nS, env.nA
    q = np.zeros((n_states, n_actions))
    state = env.reset()

    # Probability mass every action gets from exploring, and the extra mass
    # that the greedy action receives on top of it.
    explore_share = epsilon / n_actions
    greedy_bonus = 1. - epsilon

    finished = 0
    while finished < episodes:
        # Behaviour: epsilon-greedy on the current estimates.
        exploring = np.random.rand() < epsilon
        action = env.action_space.sample() if exploring else np.argmax(q[state])

        successor, reward, done, _ = env.step(action)

        # Epsilon-greedy distribution over actions in the successor state.
        policy = np.full(n_actions, explore_share)
        policy[np.argmax(q[successor])] += greedy_bonus

        # Expected successor value, accumulated left to right.
        expected_next = 0.
        for prob, value in zip(policy, q[successor]):
            expected_next += prob * value

        td_error = reward + discount_factor * expected_next - q[state, action]
        q[state, action] = q[state, action] + alpha * td_error

        if not done:
            state = successor
            continue

        # Episode over: restart from the environment's initial state.
        state = env.reset()
        finished += 1

    return q
            