# Value-Difference Based Q-Learning

Preston Whitcomb

In [1]:
import numpy as np
import gymnasium

from agent import Agent
from VDBE_agent import VDBE_agent

# Training Loop

In [2]:
def train(agent: Agent, num_episodes=10_000, render=False):
    """Train ``agent`` with Q-learning updates on the Taxi-v3 environment.

    Args:
        agent: Object providing ``pi(observation)`` to pick an action and
            ``update_Q_learning(new_observation, action, reward,
            observation, episode_over)`` to learn from one transition.
            It is updated in place.
        num_episodes: Number of episodes to play.
        render: When true, the environment is created with
            ``render_mode="human"``; otherwise no render mode is set.

    Returns:
        The same ``agent`` object that was passed in.
    """
    env = gymnasium.make("Taxi-v3", render_mode="human" if render else None)

    # try/finally so the environment (and any render window) is released
    # even if the agent or the environment raises part-way through training.
    try:
        for _ in range(num_episodes):
            # Reset environment to start a new episode
            observation, _info = env.reset()

            episode_over = False

            while not episode_over:
                # Choose an action
                action = agent.pi(observation)

                # Take the action and see what happens
                new_observation, reward, terminated, truncated, _info = env.step(action)

                # Per the Gymnasium Taxi docs:
                # reward: -1 per step, +20 for delivering the passenger,
                #         -10 for an illegal pickup/drop-off
                # terminated: True once the passenger has been dropped off
                # truncated: True if the time limit is hit (200 steps)
                episode_over = terminated or truncated

                # NOTE(review): episode_over is also True on truncation. If
                # update_Q_learning uses this flag to skip bootstrapping from
                # new_observation, passing `terminated` alone may be more
                # accurate -- confirm against the Agent implementation.
                agent.update_Q_learning(new_observation, action, reward, observation, episode_over)

                observation = new_observation
    finally:
        env.close()
    return agent


# Making Agents, Env, and Training Parameters

In [3]:
# Standalone Taxi-v3 environment. train() creates and closes its own
# environment, so this instance is not used by the training cells shown here.
env = gymnasium.make("Taxi-v3")

# All three agents are given the same action ids, 0 through 5.
# No epsilon arguments are passed, so Agent's defaults apply.
flat_epsilon_agent = Agent([0, 1, 2, 3, 4, 5])
# Starts at epsilon=1.0 with epsilon decay switched on.
epsilon_decay_agent = Agent([0, 1, 2, 3, 4, 5], epsilon=1.0, do_epsilon_decay=True)
# VDBE agent constructed with its default settings.
vdbe_agent = VDBE_agent([0, 1, 2, 3, 4, 5])

# Passed to train() as num_episodes, so despite the name this counts whole
# episodes, not individual environment steps.
num_turns_to_train = 100_000

# Train Flat Epsilon Agent (epsilon always stays the same)

In [4]:
# train() updates the agent in place and returns that same object, so the
# name still refers to the agent created earlier.
flat_epsilon_agent = train(flat_epsilon_agent, num_episodes=num_turns_to_train, render=False)

#### Visualize progress with one rendered episode (the agent still performs Q-updates during it; their effect is ignored)

In [5]:
# train(flat_epsilon_agent, 1, render=True)

In [6]:
# Display the learned Q-table; per the output below it is a nested mapping
# of state -> action -> Q-value.
flat_epsilon_agent.Q # TODO better way to visualize convergence

defaultdict(<function agent.Agent.__init__.<locals>.<lambda>()>,
            {26: defaultdict(<function agent.Agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(4): -5.11850037855227,
                          np.int64(5): -5.084208027056313,
                          np.int64(0): -4.854616704642545,
                          np.int64(1): -4.8542458332472265,
                          np.int64(3): -4.854326127483957,
                          np.int64(2): -4.854534731879752}),
             126: defaultdict(<function agent.Agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(5): -5.017767013984935,
                          np.int64(3): -4.810466743028174,
                          np.int64(0): -4.810587778580195,
                          np.int64(2): -4.810364075423378,
                          np.int64(1): -4.810303243359002,
                          np.int64(4): -5.040215077915436}),
             106: defa

# Train Epsilon Decay Agent

In [7]:
# train() updates the agent in place and returns that same object, so the
# name still refers to the agent created earlier.
epsilon_decay_agent = train(epsilon_decay_agent, num_episodes=num_turns_to_train, render=False)

#### Visualize progress with one rendered episode (the agent still performs Q-updates during it; their effect is ignored)

In [8]:
# train(epsilon_decay_agent, 1, render=True)

In [9]:
# Display the learned Q-table; per the output below it is a nested mapping
# of state -> action -> Q-value.
epsilon_decay_agent.Q # TODO better way to visualize convergence

defaultdict(<function agent.Agent.__init__.<locals>.<lambda>()>,
            {384: defaultdict(<function agent.Agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(2): -4.516351010937349,
                          np.int64(1): -4.516258847509433,
                          np.int64(3): -4.516521810986669,
                          np.int64(5): -4.522322286727491,
                          np.int64(4): -4.520307962368396,
                          np.int64(0): -4.516273079111319}),
             284: defaultdict(<function agent.Agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(3): -4.410791101322801,
                          np.int64(2): -4.410330208133427,
                          np.int64(4): -4.418788179905323,
                          np.int64(1): -4.410428856710836,
                          np.int64(5): -4.4170486310322525,
                          np.int64(0): -4.410920139975463}),
             264: de

# Train VDBE Epsilon Decay Agent

In [4]:
# train() updates the agent in place and returns that same object, so the
# name still refers to the agent created earlier.
vdbe_agent = train(vdbe_agent, num_episodes=num_turns_to_train, render=False)

#### Visualize progress with one rendered episode (the agent still performs Q-updates during it; their effect is ignored)

In [None]:
# train(vdbe_agent, 1, render=True)

<VDBE_agent.VDBE_agent at 0x23533200c80>

In [None]:
# Display the learned Q-table; per the output below it is a nested mapping
# of state -> action -> Q-value.
vdbe_agent.Q # TODO better way to visualize convergence

defaultdict(<function VDBE_agent.VDBE_agent.__init__.<locals>.<lambda>()>,
            {473: defaultdict(<function VDBE_agent.VDBE_agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(4): -0.6179741399531791,
                          np.int64(5): -5.190566441120405,
                          np.int64(2): -0.7193793580676381,
                          np.int64(3): -0.6696271582318689,
                          np.int64(0): -0.6635112815639345,
                          np.int64(1): -0.6269490025464373}),
             477: defaultdict(<function VDBE_agent.VDBE_agent.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {np.int64(3): -0.5165613552131595,
                          np.int64(1): -0.4874856663961471,
                          np.int64(5): -0.5101211272314088,
                          np.int64(4): -4.196010118891071,
                          np.int64(0): -0.5052384443167036,
                          np.int64(2): -0.4