In [4]:
import gym
import math
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
def discretize(buckets, obs):
    '''
        Discretizes the continuous state values to a discrete value
    '''
    upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
    ratios = [(obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i]) for i in range(len(obs))]
    new_obs = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(obs))]
    new_obs = [min(buckets[i] - 1, max(0, new_obs[i])) for i in range(len(obs))]
    return tuple(new_obs)

In [6]:
def epsilon_greedy_policy(action_probabilities, epsilon):
    '''
        Most of the time they choose an action that has maximal estimated action value 
        but with probability epsilon they choose an action at random
        
        Input:
            np.array of action probabilities
            
        Returns:
            Action to choose
    '''
    max_action_index = np.argmax(action_probabilities)
    max_prob = 1 - epsilon + epsilon/len(action_probabilities)
    other_prob = epsilon/len(action_probabilities)
    probs = [max_prob if max_action_index == i else other_prob for i in range(len(action_probabilities))]
    return np.random.choice(np.arange(len(action_probabilities)), p=probs)
    

In [7]:
def get_learning_rate(t):
    min_lr = 0.1
    decay = 25
    return max(min_lr, min(1., 1. - math.log10((t + 1) / decay)))

In [8]:
def get_action(env, action_probabilities, t):
    epsilon = max(0.1, min(1., 1. - math.log10((t + 1) / 25))) # decaying epsilon
    if np.random.random() < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(action_probabilities)

In [30]:
def SARSA(env, buckets, max_episodes, max_iterations, gamma=1.0, epsilon=0.5):
    Q = np.zeros(buckets + (env.action_space.n,))
    
    for i_episode in range(max_episodes):
        state = discretize(buckets, env.reset())
        
        alpha = get_learning_rate(i_episode)
        i = 0
        done=False
        
        while not done and i != max_iterations: 
            action = get_action(env, Q[state], i_episode)
            next_state, reward, done, _ = env.step(action)
            next_state = discretize(buckets, next_state)
            next_action = get_action(env, Q[next_state], i_episode)
            Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action]) 
            state = next_state
            action = next_action
            i += 1
            
        if i_episode % 50 == 0:
            print('Episode {}: Terminated at {} iterations'.format(i_episode, i))

In [29]:
SARSA(env, buckets, max_episodes, max_iterations)

Episode 0: Terminated at 11 iterations
Episode 60: Terminated at 19 iterations
Episode 120: Terminated at 86 iterations
Episode 180: Terminated at 210 iterations
Episode 240: Terminated at 151 iterations
Episode 300: Terminated at 17 iterations
Episode 360: Terminated at 144 iterations
Episode 420: Terminated at 40 iterations
Episode 480: Terminated at 188 iterations
Episode 540: Terminated at 47 iterations
Episode 600: Terminated at 62 iterations
Episode 660: Terminated at 19 iterations
Episode 720: Terminated at 20 iterations
Episode 780: Terminated at 240 iterations
Episode 840: Terminated at 175 iterations
Episode 900: Terminated at 207 iterations
Episode 960: Terminated at 182 iterations
Episode 1020: Terminated at 221 iterations
Episode 1080: Terminated at 39 iterations
Episode 1140: Terminated at 88 iterations
Episode 1200: Terminated at 473 iterations
Episode 1260: Terminated at 269 iterations
Episode 1320: Terminated at 141 iterations
Episode 1380: Terminated at 267 iterations