# COMP579 Assignment 2

Authors:
* Ryan Reszetnik: 260948454
* Mathieu Geoffroy: 260986559

**Coding: Tabular RL [70 points]**

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def softmax(x, temp):
    """Return the softmax (Boltzmann) distribution over `x` at temperature `temp`.

    Args:
        x: 1-D array-like of preferences (e.g. one row of a Q-table).
        temp: Temperature. ``temp == 0`` is handled as the greedy limit:
            all probability mass is placed on the maximal entries, split
            evenly between ties, instead of dividing by zero.

    Returns:
        np.ndarray of probabilities with the same shape as `x`.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # Nothing to normalise; mirror the empty result of the naive formula.
        return x
    if temp == 0:
        # x / 0 would produce NaN/inf probabilities, which
        # np.random.choice cannot sample from meaningfully.
        best = x == x.max()
        return best / best.sum()
    logits = x / temp
    # Shifting by the max leaves the ratios unchanged but keeps np.exp from
    # overflowing when the logits are large.
    e = np.exp(logits - logits.max())
    return e / e.sum()

In [3]:
class Sarsa:
    """Tabular SARSA agent that explores with a softmax policy over Q-values."""

    def __init__(self, env, alpha, gamma, temp):
        """Store the hyperparameters and allocate a zero-initialised Q-table.

        The table has one row per state (`env.observation_space.n`) and one
        column per action (`env.action_space.n`).
        """
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.env, self.alpha, self.gamma, self.temp = env, alpha, gamma, temp
        self.Q = np.zeros((n_states, n_actions))

    def select_action(self, s, greedy=False):
        """Pick an action in state `s`.

        With `greedy=True` the argmax of the state's Q-values is returned;
        otherwise an action is sampled from softmax(Q[s], temp).
        """
        action_values = self.Q[s, :]
        if not greedy:
            probabilities = softmax(action_values, self.temp)
            return np.random.choice(self.env.action_space.n, p=probabilities)
        return np.argmax(action_values)

    def update(self, s, a, r, s_prime, a_prime, done):
        """Apply one SARSA update to Q[s, a] and return the Q-table.

        The bootstrap term gamma * Q[s_prime, a_prime] is multiplied by
        (1 - done), so it vanishes when `done` is truthy.
        """
        bootstrap = self.gamma * self.Q[s_prime, a_prime] * (1 - done)
        td_error = (r + bootstrap) - self.Q[s, a]
        self.Q[s, a] += self.alpha * td_error
        return self.Q


class ExpectedSarsa:
    """Tabular Expected SARSA agent with a softmax policy over Q-values."""

    def __init__(self, env, alpha, gamma, temp):
        """Keep the hyperparameters and create a (states x actions) Q-table of zeros."""
        self.env = env
        self.alpha, self.gamma, self.temp = alpha, gamma, temp
        table_shape = (env.observation_space.n, env.action_space.n)
        self.Q = np.zeros(table_shape)

    def select_action(self, s, greedy=False):
        """Return the argmax action when `greedy`, else a softmax sample."""
        if greedy:
            # Evaluation: act on the current value estimates only.
            return np.argmax(self.Q[s, :])
        action_probs = softmax(self.Q[s, :], self.temp)
        return np.random.choice(self.env.action_space.n, p=action_probs)

    def update(self, s, a, r, s_prime, a_prime, done):
        """Move Q[s, a] towards the expected-SARSA target and return the Q-table.

        The target is `r` alone when `done` is truthy; otherwise it adds the
        discounted expectation of Q[s_prime, :] under the softmax policy.
        `a_prime` is accepted for interface parity but not used.
        """
        target = r
        if not done:
            next_values = self.Q[s_prime, :]
            policy = softmax(next_values, self.temp)
            target = r + self.gamma * np.sum(policy * next_values)
        self.Q[s, a] += self.alpha * (target - self.Q[s, a])
        return self.Q
        
        


# bonus question, optional
class Hybrid_Sarsa_Q:
    """Unimplemented placeholder for the optional bonus agent.

    Every attribute is None and every method returns None; the constructor
    arguments are currently ignored.
    """

    def __init__(self, env, alpha, gamma, temp):
        # write your solution here
        self.env = self.alpha = self.gamma = self.temp = self.Q = None

    def select_action(self, s, greedy=False):
        # write your solution here
        # Neither the greedy nor the exploratory branch is implemented yet.
        return None

    def update(self, s, a, r, s_prime, a_prime, done):
        # write your solution here
        return None

# Write your experiment code below

In [4]:
# Create the environment and print its action and observation spaces.
env_name = 'Taxi-v3'
env = gym.make(env_name)
for label, space in (
    ("Action space:", env.action_space),
    ("State space:", env.observation_space),
):
    print(label, space)

Action space: Discrete(6)
State space: Discrete(500)


In [5]:
# function that runs each episode
def run_episode(agent, env, train=False, max_steps=1000):
    """Run a single episode and return the sum of its rewards.

    Args:
        agent: Object exposing ``select_action(s, greedy)`` and
            ``update(s, a, r, s_prime, a_prime, done)``.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(a)`` returns ``(state, reward, terminated, truncated, info)``.
        train: If True, actions are chosen non-greedily and the agent is
            updated after every step. If False, actions are greedy and the
            agent is left untouched.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        The undiscounted sum of rewards collected during the episode.
    """
    greedy = not train
    s, _ = env.reset()
    a = agent.select_action(s, greedy)
    episode_reward = 0
    for _step in range(max_steps):
        s_prime, r, terminated, truncated, _ = env.step(a)
        a_prime = agent.select_action(s_prime, greedy)
        if train:
            # Only genuine termination removes the bootstrap term. A
            # truncated episode (time limit) stopped early, so the value of
            # s_prime must still be counted in the target.
            agent.update(s, a, r, s_prime, a_prime, bool(terminated))
        episode_reward += r
        if terminated or truncated:
            break
        s, a = s_prime, a_prime
    return episode_reward
            
# function that runs each hyperparameter setting
def run_experiment(agent, env, num_segments):
    """Train `agent` segment by segment and record one evaluation per segment.

    Each segment runs 10 training episodes followed by a single
    non-training episode, whose reward is stored and printed.

    Returns:
        np.ndarray of length `num_segments` holding the evaluation rewards.
    """
    training_episodes = 10
    rewards = np.zeros(num_segments)
    for i in range(num_segments):
        # Training phase: the agent explores and updates its estimates.
        for _ in range(training_episodes):
            run_episode(agent, env, train=True)

        # Evaluation phase: a single episode without updates.
        evaluation_reward = run_episode(agent, env, train=False)
        rewards[i] = evaluation_reward
        print(f"\tSegment {i} Reward: {rewards[i]}")
    return rewards


In [6]:
# define hyperparameter arrays
# Number of segments per run; run_experiment performs 10 training episodes
# followed by 1 evaluation episode in every segment.
num_segments = 500
# Learning rates (the `alpha` step size of the agents' updates) to sweep over.
alphas = [0.1, 0.5, 0.9, 0.99]
# Discount factor passed to every agent.
gamma = 0.99
# Softmax temperatures to sweep over.
# NOTE(review): a temperature of 0 is only meaningful if softmax
# special-cases it, because softmax divides the action values by temp.
temps = [0, 0.5, 1]
# Number of repeated trials per (alpha, temp) setting.
num_trials = 10

In [7]:
# define sarsa agents: one per (alpha, temp) setting, with alpha as the outer
# (slow) index and temp as the inner (fast) index
sarsa_agents = [Sarsa(env, alpha, gamma, temp) for alpha in alphas for temp in temps]

# define result array, indexed as [trial, alpha index, temp index, segment]
sarsa_rewards = np.zeros((num_trials, len(alphas), len(temps), num_segments))

# run experiments for sarsa
for agent_idx, template in enumerate(sarsa_agents):
    # Recover the grid position from the flat index instead of searching the
    # hyperparameter lists by value (which breaks on duplicate values).
    alpha_idx, temp_idx = divmod(agent_idx, len(temps))
    print(f"Running experiment for sarsa with alpha={template.alpha}, gamma={template.gamma}, temp={template.temp}...")
    for trial in range(num_trials):
        # Every trial starts from a fresh, zero-initialised Q-table. Reusing
        # one agent would let later trials inherit earlier learning, so the
        # mean/std across trials would not describe independent runs.
        agent = Sarsa(env, template.alpha, template.gamma, template.temp)
        sarsa_rewards[trial, alpha_idx, temp_idx] = run_experiment(agent, env, num_segments)
        print(f"\tTrial {trial} Reward: {sarsa_rewards[trial, alpha_idx, temp_idx, -10:].mean()}")
        # Keep the most recently trained agent for this setting.
        sarsa_agents[agent_idx] = agent


# calculate mean and standard deviation for sarsa across the trials;
# both have shape (len(alphas), len(temps), num_segments)
sarsa_mean = sarsa_rewards.mean(axis=0)
sarsa_std = sarsa_rewards.std(axis=0)


# plot results for sarsa training performance per hyperparameter setting

Running experiment for sarsa with alpha=0.1, gamma=0.99, temp=0...
	Segment 0 Reward: -200.0
	Segment 1 Reward: -200.0
	Segment 2 Reward: -200.0
	Segment 3 Reward: -200.0
	Segment 4 Reward: -200.0
	Segment 5 Reward: -1991.0
	Segment 6 Reward: -200.0
	Segment 7 Reward: -200.0
	Segment 8 Reward: -200.0
	Segment 9 Reward: -200.0
	Segment 10 Reward: -200.0


  if not isinstance(terminated, (bool, np.bool8)):


	Segment 11 Reward: -200.0
	Segment 12 Reward: -200.0
	Segment 13 Reward: -200.0
	Segment 14 Reward: -200.0
	Segment 15 Reward: -200.0
	Segment 16 Reward: -1991.0
	Segment 17 Reward: -200.0
	Segment 18 Reward: -200.0
	Segment 19 Reward: -200.0
	Segment 20 Reward: -200.0
	Segment 21 Reward: -200.0
	Segment 22 Reward: -200.0
	Segment 23 Reward: -200.0
	Segment 24 Reward: -200.0
	Segment 25 Reward: -200.0
	Segment 26 Reward: -200.0
	Segment 27 Reward: -200.0
	Segment 28 Reward: -200.0
	Segment 29 Reward: -200.0
	Segment 30 Reward: -200.0
	Segment 31 Reward: -200.0
	Segment 32 Reward: -200.0
	Segment 33 Reward: -200.0
	Segment 34 Reward: -200.0
	Segment 35 Reward: -200.0
	Segment 36 Reward: -200.0
	Segment 37 Reward: -200.0
	Segment 38 Reward: -200.0
	Segment 39 Reward: 10.0
	Segment 40 Reward: -200.0
	Segment 41 Reward: -200.0
	Segment 42 Reward: -200.0
	Segment 43 Reward: -200.0
	Segment 44 Reward: -200.0
	Segment 45 Reward: -200.0
	Segment 46 Reward: -200.0
	Segment 47 Reward: -200.0
	S

KeyboardInterrupt: 