# Value-Difference Based Q-Learning

Preston Whitcomb

In [None]:
import numpy as np
import gymnasium
import matplotlib.pyplot as plt

from agent import Agent
from VDBE_agent import VDBE_agent

### Graphing functions

In [None]:
def calc_q_table_magnitude(q_values : dict):
    """Return the L1 (maximum absolute column sum) norm of a Q-table.

    Each key of `q_values` is treated as one column of the matrix and the
    value stored under that key as the entries of that column.  The matrix
    L1 norm is the largest sum of absolute entries over all columns.

    Args:
        q_values: mapping from a key to a numeric sequence/array of Q-values.

    Returns:
        The largest per-column sum of absolute values, or 0.0 when
        `q_values` is empty.
    """
    # Sum of absolute values is the vector L1 norm; the previous code used
    # np.linalg.norm's default (the 2-norm), which contradicted the intent.
    column_sums = (
        np.abs(np.asarray(column, dtype=float)).sum()
        for column in q_values.values()
    )
    # default=0.0 keeps an empty table from raising ValueError in max().
    return max(column_sums, default=0.0)

def graph_x_over_time(x : list, x_label : str, y_label : str, title : str):
    """Plot the sequence `x` against its index and return the figure.

    Args:
        x: values to plot, in order.
        x_label: text for the horizontal axis.
        y_label: text for the vertical axis.
        title: title shown above the plot.

    Returns:
        The matplotlib figure containing the single line plot.
    """
    fig, axs = plt.subplots()
    axs.plot(x)
    axs.set_xlabel(x_label)
    # Previously this called set_xlabel a second time, so y_label replaced
    # the x-axis label and the y-axis was left unlabelled.
    axs.set_ylabel(y_label)
    axs.set_title(title)
    return fig

# Training Loop

In [None]:
def train(agent : Agent, num_episodes=10_000, render=False):
    """Run `agent` on Taxi-v3 for `num_episodes` episodes, updating it every step.

    Args:
        agent: learner exposing `pi(observation)` to pick an action and
            `update_Q_learning(...)` to apply the per-step update.
        num_episodes: number of episodes to run.
        render: when True the environment is created with
            render_mode="human"; otherwise no rendering is requested.

    Returns:
        The same `agent` object that was passed in.
    """
    env = gymnasium.make("Taxi-v3", render_mode="human" if render else None)

    # try/finally so the environment (and any render window) is released
    # even if the agent or the environment raises part-way through.
    try:
        for _ in range(num_episodes):
            # Reset environment to start a new episode
            observation, info = env.reset()

            episode_over = False

            while not episode_over:
                # Choose an action
                action = agent.pi(observation)

                # Take the action and see what happens
                new_observation, reward, terminated, truncated, info = env.step(action)

                # Per the Gymnasium Taxi documentation:
                # reward: -1 per step, +20 for a successful drop-off,
                #         -10 for an illegal pickup/drop-off
                # terminated: True once the passenger has been dropped off
                # truncated: True if the time limit (200 steps) is hit
                episode_over = terminated or truncated

                # NOTE(review): argument order and the meaning of the last
                # flag are defined by Agent.update_Q_learning, which is not
                # visible here -- confirm that treating a truncated episode
                # as "over" is what the update expects.
                agent.update_Q_learning(new_observation, action, reward, observation, episode_over)

                observation = new_observation
    finally:
        env.close()
    return agent


# Making Agents, Env, and Training Parameters

In [None]:
# NOTE(review): train() builds (and closes) its own environment, so this
# instance is not used by the training cells visible in this notebook.
env = gymnasium.make("Taxi-v3")

# Every agent gets its own fresh list of the action ids 0..5.
flat_epsilon_agent = Agent(list(range(6)))
epsilon_decay_agent = Agent(
    list(range(6)),
    epsilon=1.0,
    do_epsilon_decay=True,
)
vdbe_agent = VDBE_agent(list(range(6)))

# Number of episodes each agent is trained for.
num_turns_to_train = 100

# Train Flat Epsilon Agent (epsilon always stays the same)

In [None]:
# Train the fixed-epsilon agent without rendering.
flat_epsilon_agent = train(
    flat_epsilon_agent,
    num_episodes=num_turns_to_train,
    render=False,
)

#### Visualize progress by rendering one episode (note: `train` still applies Q-updates during it)

In [None]:
# train(flat_epsilon_agent, 1, render=True)

In [None]:
# Show the fixed-epsilon agent's Q attribute as the cell output.
# TODO: find a better way to visualize convergence than dumping Q directly.
flat_epsilon_agent.Q

# Train Epsilon Decay Agent

In [None]:
# Train the epsilon-decay agent without rendering.
epsilon_decay_agent = train(
    epsilon_decay_agent,
    num_episodes=num_turns_to_train,
    render=False,
)

In [None]:
# Show the epsilon-decay agent's Q attribute right after training.
epsilon_decay_agent.Q

#### Visualize progress by rendering one episode (note: `train` still applies Q-updates during it)

In [None]:
# train(epsilon_decay_agent, 1, render=True)

In [None]:
# Show the epsilon-decay agent's Q attribute as the cell output.
# TODO: find a better way to visualize convergence than dumping Q directly.
epsilon_decay_agent.Q

# Train VDBE Epsilon Decay Agent

In [None]:
# Train the VDBE agent without rendering.
vdbe_agent = train(
    vdbe_agent,
    num_episodes=num_turns_to_train,
    render=False,
)

#### Visualize progress by rendering one episode (note: `train` still applies Q-updates during it)

In [None]:
# train(vdbe_agent, 1, render=True)

In [None]:
# Show the VDBE agent's Q attribute as the cell output.
# TODO: find a better way to visualize convergence than dumping Q directly.
vdbe_agent.Q