In [36]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Get Taxi-v3 environment
env = gym.make('Taxi-v3')

model = Sequential()
model.add(Dense(50, input_dim=env.observation_space.n, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))
model.compile(loss='mse', optimizer=Adam())

# Parameters
epsilon = 0.9  # Exploration rate
epsilon_decay = 0.5  # Decay rate for exploration
gamma = 0.95  # Discount factor
num_episodes = 5000  # Number of episodes for training

for episode in range(num_episodes):
    state = env.reset()
    state_one_hot = np.zeros([1, env.observation_space.n])
    state_one_hot[0, state[0]] = 1

    done = False
    iters = 1
    while not done:
        print("Episode: {}, Iteration: {}".format(episode, iters))
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(model.predict(state_one_hot))  # Exploit

        next_state, reward, done, _, _ = env.step(action)
        next_state_one_hot = np.zeros([1, env.observation_space.n])
        next_state_one_hot[0, next_state] = 1

        # Update Q-value
        target = model.predict(state_one_hot)
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + gamma * np.max(model.predict(next_state_one_hot))

        # Train the model
        model.fit(state_one_hot, target, epochs=1, verbose=0)

        state_one_hot = next_state_one_hot
        iters += 1

    # Decay the exploration rate
    if epsilon > 0.01:
        epsilon *= epsilon_decay



Episode: 0, Iteration: 1
Episode: 0, Iteration: 2
Episode: 0, Iteration: 3
Episode: 0, Iteration: 4
Episode: 0, Iteration: 5
Episode: 0, Iteration: 6
Episode: 0, Iteration: 7
Episode: 0, Iteration: 8
Episode: 0, Iteration: 9
Episode: 0, Iteration: 10
Episode: 0, Iteration: 11
Episode: 0, Iteration: 12
Episode: 0, Iteration: 13
Episode: 0, Iteration: 14
Episode: 0, Iteration: 15
Episode: 0, Iteration: 16
Episode: 0, Iteration: 17
Episode: 0, Iteration: 18
Episode: 0, Iteration: 19
Episode: 0, Iteration: 20
Episode: 0, Iteration: 21
Episode: 0, Iteration: 22
Episode: 0, Iteration: 23
Episode: 0, Iteration: 24
Episode: 0, Iteration: 25
Episode: 0, Iteration: 26
Episode: 0, Iteration: 27
Episode: 0, Iteration: 28
Episode: 0, Iteration: 29
Episode: 0, Iteration: 30
Episode: 0, Iteration: 31
Episode: 0, Iteration: 32
Episode: 0, Iteration: 33
Episode: 0, Iteration: 34
Episode: 0, Iteration: 35
Episode: 0, Iteration: 36
Episode: 0, Iteration: 37
Episode: 0, Iteration: 38
Episode: 0, Iteration

In [None]:
import matplotlib.pyplot as plt

# Parameters
num_test_episodes = 100  # Number of episodes for testing

# List to store rewards
rewards = []

# Test the trained agent
for episode in range(num_test_episodes):
    state = env.reset()
    state = np.reshape(state, [1, env.observation_space.n])
    episode_reward = 0
    done = False
    while not done:
        action = np.argmax(model.predict(state))  # Exploit learned values
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, env.observation_space.n])
        state = next_state
        episode_reward += reward
    rewards.append(episode_reward)

# Plot the rewards
plt.plot(rewards)
plt.title('Average reward per episode')
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.show()


In [None]:
from keras import backend as K

# Parameters
num_actions = env.action_space.n  # Number of actions
confidence_bound = 2.0  # Confidence bound parameter for UCB

# Define a custom loss function for UCB
def ucb_loss(target, prediction):
    return -K.log(prediction) + confidence_bound * K.sqrt(K.log(target) / prediction)

# Define the neural network model
model_ucb = Sequential()
model_ucb.add(Dense(50, input_dim=env.observation_space.n, activation='relu'))
model_ucb.add(Dense(num_actions, activation='linear'))
model_ucb.compile(loss=ucb_loss, optimizer=Adam())

# Q-learning algorithm with UCB
for episode in range(num_episodes):
    state = env.reset()
    state = np.reshape(state, [1, env.observation_space.n])
    done = False
    while not done:
        # UCB action selection
        q_values = model_ucb.predict(state)
        action = np.argmax(q_values + confidence_bound * np.sqrt(np.log(episode+1) / (q_values+1)))

        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, env.observation_space.n])

        # Update Q-value
        target = model_ucb.predict(state)
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + gamma * np.max(model_ucb.predict(next_state))

        # Train the model
        model_ucb.fit(state, target, epochs=1, verbose=0)

        state = next_state
