# COMP579 Assignment 2

Authors:
* Ryan Reszetnik: 260948454
* Mathieu Geoffroy: 260986559

**Coding: Tabular RL [70 points]**

In [87]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [88]:
def softmax(x, temp):
    # write your solution here
    e = np.exp(x / temp)
    return e / e.sum()

In [89]:
class Sarsa:
    def __init__(self, env, alpha, gamma, temp):
        # write your solution here
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.temp = temp
        self.Q = np.zeros((env.observation_space.n, env.action_space.n))

    def select_action(self, s, greedy=False):
        # write your solution here
        if greedy:
            return np.argmax(self.Q[s, :])
        else:
            return self.env.action_space.sample()

    def update(self, s, a, r, s_prime, a_prime, done):
        # write your solution here
        prediction = self.Q[s, a]
        target = r + self.gamma * self.Q[s_prime, a_prime] * (1 - done)
        self.Q[s, a] += self.alpha * (target - prediction)
        return self.Q


class ExpectedSarsa:
    def __init__(self, env, alpha, gamma, temp):
        # write your solution here
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.temp = temp
        self.Q = np.zeros((env.observation_space.n, env.action_space.n))

    def select_action(self, s, greedy=False):
        # write your solution here
        if greedy:
            # if finished training, then choose the optimal policy
            return np.argmax(self.Q[s, :])
        else:
            return np.random.choice(self.env.action_space.n, p=softmax(self.Q[s, :], self.temp))

    def update(self, s, a, r, s_prime, a_prime, done):
        prediction = self.Q[s, a]
        
        


# bonus question, optional
class Hybrid_Sarsa_Q:
    def __init__(self, env, alpha, gamma, temp):
        # write your solution here
        self.env = None
        self.alpha = None
        self.gamma = None
        self.temp = None
        self.Q = None
        return

    def select_action(self, s, greedy=False):
        # write your solution here
        if greedy:
            # if finished training, then choose the optimal policy
            return
        else:
            return

    def update(self, s, a, r, s_prime, a_prime, done):
        # write your solution here
        return

# Write your experiment code below

In [90]:
env_name = 'Taxi-v3'
env = gym.make(env_name)
print("Action space:", env.action_space)
print("State space:", env.observation_space)

Action space: Discrete(6)
State space: Discrete(500)


In [91]:
# function that runs each episode
def run_episode(agent, env, max_steps, train=False):
    s = env.reset()[0]
    done = False
    episode_reward = 0
    a = agent.select_action(s, not train)
    for step in range(max_steps):
        s_prime, r, done, info, mask = env.step(a)
        a_prime = agent.select_action(s_prime, not train)
        if train:
            agent.update(s, a, r, s_prime, a_prime, done)
        s = s_prime
        a = a_prime
        episode_reward += r
        
        if done:
            break
            
    return episode_reward
            
# function that runs each hyperparameter setting
def run_experiment(agent, env, num_segments, max_steps):
    rewards = np.zeros(num_segments)
    for i in range(num_segments):
        if i % 10 == 0 and i > 0:
            rewards[i] = run_episode(agent, env, max_steps, train=False)
        else:
            rewards[i] = run_episode(agent, env, max_steps, train=True)
    return rewards


In [98]:
# define hyperparameter arrays
num_segments = 500
max_steps = 200
alphas = [0.1, 0.5, 0.9, 0.99]
gammas = [0.1, 0.5, 0.9, 0.99]
temps = [0.1, 0.5, 0.9, 0.99]
num_trials = 10

# define agents
sarsa_agents = [Sarsa(env, alpha, gamma, temp) for alpha in alphas for gamma in gammas for temp in temps]
expected_sarsa_agents = [ExpectedSarsa(env, alpha, gamma, temp) for alpha in alphas for gamma in gammas for temp in temps]

# define result arrays with uncertainty
sarsa_rewards = np.zeros((num_trials, len(alphas), len(gammas), len(temps), num_segments))
expected_sarsa_rewards = np.zeros((num_trials, len(alphas), len(gammas), len(temps), num_segments))

# run experiments
for agent in sarsa_agents:
    print(f"Running experiment for sarsa with alpha={agent.alpha}, gamma={agent.gamma}, temp={agent.temp}...")
    for trial in range(num_trials):
        sarsa_rewards[trial, alphas.index(agent.alpha), gammas.index(agent.gamma), temps.index(agent.temp)] = run_experiment(agent, env, num_segments, max_steps)
        print(f"\tTrial {trial} Reward: {sarsa_rewards[trial, alphas.index(agent.alpha), gammas.index(agent.gamma), temps.index(agent.temp), :].max()}")
    
    



Running experiment for sarsa with alpha=0.1, gamma=0.1, temp=0.1...
	Trial 0 Reward: -48.0
	Trial 1 Reward: -157.0
	Trial 2 Reward: -33.0
	Trial 3 Reward: -107.0
	Trial 4 Reward: -64.0
	Trial 5 Reward: -159.0
	Trial 6 Reward: -104.0
	Trial 7 Reward: -90.0
	Trial 8 Reward: -57.0
	Trial 9 Reward: -96.0
Running experiment for sarsa with alpha=0.1, gamma=0.1, temp=0.5...
	Trial 0 Reward: -98.0
	Trial 1 Reward: -102.0
	Trial 2 Reward: -29.0
	Trial 3 Reward: -154.0
	Trial 4 Reward: -200.0
	Trial 5 Reward: -169.0
	Trial 6 Reward: -75.0
	Trial 7 Reward: -73.0
	Trial 8 Reward: -73.0
	Trial 9 Reward: -192.0
Running experiment for sarsa with alpha=0.1, gamma=0.1, temp=0.9...
	Trial 0 Reward: -153.0
	Trial 1 Reward: -60.0
	Trial 2 Reward: -186.0
	Trial 3 Reward: -84.0
	Trial 4 Reward: -151.0
	Trial 5 Reward: -20.0
	Trial 6 Reward: -98.0
	Trial 7 Reward: -40.0
	Trial 8 Reward: -99.0
	Trial 9 Reward: -119.0
Running experiment for sarsa with alpha=0.1, gamma=0.1, temp=0.99...
	Trial 0 Reward: -71.0
	

KeyboardInterrupt: 