# Introduction
- In this kernel, we will be implementing an example environment.
- We will be deploying SARSA, Q-Learning and Expected SARSA to try and find the optimal agent's policy and the optimal value functions, in order to maximize the rewards.

# Importing Packages & Boilerplate Stuff

1. jdc: Jupyter magic that allows defining classes over multiple jupyter notebook cells.
2. numpy: the fundamental package for scientific computing with Python.
3. matplotlib: the library for plotting graphs in Python.
4. RL-Glue: the library for reinforcement learning experiments.
5. BaseEnvironment, BaseAgent: the base classes from which we will inherit when creating the environment and agent classes in order for them to support the RL-Glue framework.
6. itertools.product: the function that can be used easily to compute permutations.
7. tqdm.tqdm: Provides progress bars for visualizing the status of loops.

In [1]:
import jdc
import copy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from itertools import product
from tqdm import tqdm

In [2]:
### DEBUG CODE
# Setting the seed for reproducible results
# np.random.seed(0)

# 1. Environment
- The below code cell provides the backbone of the `ExampleEnvironment` class.

In [3]:
class ExampleEnvironment():
    def __init__(self, env_info={}):
        # These are the different possible states
        self.grid = [0, 1, 2, 3]
        
        # These are the different possible actions
        # 0 -> LEFT, 1 -> RIGHT, 2 -> STAY 
        # In states 0 and 3, LEFT and RIGHT actions will lead to the same state as STAY
        self.tran_matrix = [
            [1/4, 1/2, 1/4], 
            [1/4, 1/2, 1/4],
            [1/4, 1/2, 1/4],
            [1/4, 1/2, 1/4]
        ]
        
        # These are the rewards
        self.rewards = [
            [0, 0, 2],
            [0, 1, 0],
            [1, 1, 0],
            [2, 1.5, 3]
        ]
        
        # Defines the starting location and the current location
        self.start_loc = 0
        self.cur_loc = None
        
    def start(self):
        self.cur_loc = self.start_loc
        return self.cur_loc
    
    def step(self, action):
        reward = self.rewards[self.cur_loc][action]
        
        if action == 0:
            self.cur_loc = max(0, self.cur_loc - 1)
        elif action == 1:
            self.cur_loc = min(3, self.cur_loc + 1)
        elif action == 2:
            pass
        
        return reward

# 2. Learning Agents

In [4]:
class QLearningAgent():
    def __init__(self, agent_info={}):
        # Defining the #actions and #states 
        self.num_actions = 3
        self.num_states = 4
        
        # Discount factor (gamma) to use in the updates.
        self.discount = agent_info.get("discount", 0.9)

        # The learning rate or step size parameter (alpha) to use in updates.
        self.step_size = agent_info.get("step_size", 0.1)

        # To control the exploration-exploitation trade-off
        self.epsilon = agent_info.get("epsilon", 0.1)
        
        # To determine if the Q-function is converged or not
        self.delta = agent_info.get("delta", 0.01)
        
        # Defining a random generator
        self.rand_generator = np.random.RandomState(agent_info.get("seed", 0))
        
        # Defining the initial action values
        self.q = self.rand_generator.randn(self.num_states, self.num_actions)
        
        # Initializing the variables for the previous state and action
        self.prev_state  = None
        self.prev_action = None
        
    def start(self, state):
        # Choose action using epsilon greedy.
        current_q = self.q[state][:]
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)
            
        self.prev_state = state
        self.prev_action = action
        return action
    
    def step(self, state, reward):
        # Choose action using epsilon greedy.
        current_q = self.q[state][:]
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)
        
        # Determining the new Q-Value
        new_val = -1e8
        cur_val = copy.copy(self.q[self.prev_state, self.prev_action])
        for act in range(self.num_actions):
            val = cur_val + self.step_size * (
                reward + self.discount * self.q[state, act] - cur_val
            )
            new_val = max(new_val, val)
        self.q[self.prev_state, self.prev_action] = new_val
            
        # Determining if the Q-function has converged or not
        if abs(new_val - cur_val) < self.delta:
            return (action, True)
        else:
            return (action, False)
            
            
    def argmax(self, q_values):
        top = float("-inf")
        ties = []

        for i in range(len(q_values)):
            if q_values[i] > top:
                top = q_values[i]
                ties = []

            if q_values[i] == top:
                ties.append(i)

        return self.rand_generator.choice(ties)

# 3. Running Experiments

In [5]:
def run_experiment(env_info = {}, agent_info = {}):
    env = ExampleEnvironment() 
    agent = QLearningAgent(agent_info)
    has_converged = False
    num_steps = 0
    
    next_state  = env.start()                 # STARTING STATE
    next_action = agent.start(next_state)     # STARTING ACTION
    next_reward = env.step(next_action)       # STARTING REWARD
    
    while not has_converged:
        next_action, has_converged = agent.step(next_state, next_reward)
        next_reward = env.step(next_action)
        
        if num_steps % 1000 == 0:
            print(f"Time Steps Elapsed | {num_steps}")
            print("Q-Values:", agent.q)
            print()
        
        num_steps += 1
        
    print("POST CONVERGENCE\n")
    print("Optimal Action Values:")
    print(agent.q)
    
    print("\nOptimal State Values:")
    print(np.max(agent.q, axis = -1))
    
    print("\nOptimal Policy:")
    print(np.argmax(agent.q, axis = -1))

In [6]:
# Defining the characteristics for the agent
agent_info = {
    "discount": 0.9,       
    "step_size": 0.1,
    "epsilon": 0.1,
    "delta": 1e-4,
    "seed": 0
}

run_experiment(agent_info = agent_info)

Time Steps Elapsed | 0
Q-Values: [[ 1.74641182  0.40015721  0.97873798]
 [ 2.2408932   1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721 -0.10321885]
 [ 0.4105985   0.14404357  1.45427351]]

Time Steps Elapsed | 1000
Q-Values: [[ 1.6418421   0.40015721  0.97873798]
 [ 2.2408932   1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721 -0.10321885]
 [ 0.4105985   0.14404357  1.45427351]]

Time Steps Elapsed | 2000
Q-Values: [[ 1.53460202  0.40015721  0.97873798]
 [ 2.2408932   1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721 -0.10321885]
 [ 0.4105985   0.14404357  1.45427351]]

Time Steps Elapsed | 3000
Q-Values: [[ 1.32663728  0.40015721  0.97873798]
 [ 2.2408932   1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721 -0.10321885]
 [ 0.4105985   0.14404357  1.45427351]]

Time Steps Elapsed | 4000
Q-Values: [[ 1.21956914  0.40015721  0.97873798]
 [ 2.2408932   1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721 -0.10321885]
 [ 0.4105985   0.14404357  1.45427351]]

Time Steps Elapsed | 5000
Q-Value