In [None]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create the two Gym environments used throughout the notebook:
# game1 is FrozenLake, game2 is Taxi.
game1, game2 = (gym.make(env_id) for env_id in ('FrozenLake-v0', 'Taxi-v2'))

In [None]:
# Read the size (`.n`) of each environment's observation and action space;
# these values give the Q-tables their (states, actions) shape.
game1_state_size, game1_action_size = game1.observation_space.n, game1.action_space.n
game2_state_size, game2_action_size = game2.observation_space.n, game2.action_space.n

In [None]:
# Zero-initialised Q-tables: one row per state, one column per action.
game1_qtable = np.zeros(shape=(game1_state_size, game1_action_size))
game2_qtable = np.zeros(shape=(game2_state_size, game2_action_size))

# Show the table dimensions as a quick sanity check.
for table in (game1_qtable, game2_qtable):
    print(table.shape)

In [None]:
# Default hyperparameters. Every experiment cell below assigns its own values
# again before training, so these only serve as a baseline.

# Training budget and Q-learning update parameters.
total_episodes = [10000, 15000, 20000, 25000]
max_steps = 99
learning_rate = 0.9
gamma = 0.95

# Epsilon-greedy exploration: start at max_epsilon and decay towards min_epsilon.
max_epsilon = 1.0
min_epsilon = 0.01
epsilon = max_epsilon
decay_rate = 0.001

# Comparing different parameters

## Number of episodes

**Game 1**

In [None]:
# Learning Game 1 (FrozenLake): compare different numbers of training episodes.
total_episodes = [10000, 15000, 20000, 25000]
learning_rate = 0.9
max_steps = 99
gamma = 0.95

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of total_episodes.
game1_rewards = []

for game1_total_episodes in total_episodes:
    episode_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game1_qtable = np.zeros((game1_state_size, game1_action_size))
    epsilon = max_epsilon
    for episode in range(game1_total_episodes):
        state = game1.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game1_qtable[state, :])
            else:
                action = game1.action_space.sample()

            new_state, reward, done, info = game1.step(action)

            # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a').
            td_target = reward + gamma * np.max(game1_qtable[new_state, :])
            game1_qtable[state, action] += learning_rate * (td_target - game1_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        episode_total_reward.append(total_rewards)
    game1_rewards.append(episode_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game1_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend([len(game1_rewards[i]) for i in range(4)])
plt.title('GAME: FROZEN LAKE, GRAPH: CHANGE IN SCORE WITH NUMBER OF ITERATIONS')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('frozenLake_iteration.png')
plt.show()

**Game 2**

In [None]:
# Learning Game 2 (Taxi): compare different numbers of training episodes.
total_episodes = [25000, 35000, 45000, 50000]
learning_rate = 0.9
max_steps = 99
gamma = 0.618

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of total_episodes.
game2_rewards = []

for game2_total_episodes in total_episodes:
    episode_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game2_qtable = np.zeros((game2_state_size, game2_action_size))
    epsilon = max_epsilon
    for episode in range(game2_total_episodes):
        state = game2.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game2_qtable[state, :])
            else:
                action = game2.action_space.sample()

            new_state, reward, done, info = game2.step(action)

            # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a').
            td_target = reward + gamma * np.max(game2_qtable[new_state, :])
            game2_qtable[state, action] += learning_rate * (td_target - game2_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        episode_total_reward.append(total_rewards)
    game2_rewards.append(episode_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game2_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend([len(game2_rewards[i]) for i in range(4)])
plt.title('GAME: TAXI, GRAPH: CHANGE IN SCORE WITH NUMBER OF ITERATIONS')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('taxi_iteration.png')
plt.show()

## Learning rate

**Game 1**

In [None]:
# Learning Game 1 (FrozenLake): compare different learning rates.
total_episodes = 25000
learning_rate = [0.6, 0.7, 0.8, 0.9]
max_steps = 99
gamma = 0.95

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of learning_rate.
game1_rewards = []

for game1_learning_rate in learning_rate:
    rate_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game1_qtable = np.zeros((game1_state_size, game1_action_size))
    epsilon = max_epsilon
    for episode in range(total_episodes):
        state = game1.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game1_qtable[state, :])
            else:
                action = game1.action_space.sample()

            new_state, reward, done, info = game1.step(action)

            # Q-learning update using the learning rate under test.
            td_target = reward + gamma * np.max(game1_qtable[new_state, :])
            game1_qtable[state, action] += game1_learning_rate * (td_target - game1_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        rate_total_reward.append(total_rewards)
    game1_rewards.append(rate_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game1_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(learning_rate)
plt.title('GAME: FROZEN LAKE, GRAPH: CHANGE IN SCORE WITH RATE')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('frozenLake_rate.png')
plt.show()

**Game 2**

In [None]:
# Learning Game 2 (Taxi): compare different learning rates.
total_episodes = 40000
learning_rate = [0.6, 0.7, 0.8, 0.9]
max_steps = 99
gamma = 0.618

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of learning_rate.
game2_rewards = []

for game2_learning_rate in learning_rate:
    rate_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game2_qtable = np.zeros((game2_state_size, game2_action_size))
    epsilon = max_epsilon
    for episode in range(total_episodes):
        state = game2.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game2_qtable[state, :])
            else:
                action = game2.action_space.sample()

            new_state, reward, done, info = game2.step(action)

            # Q-learning update using the learning rate under test.
            td_target = reward + gamma * np.max(game2_qtable[new_state, :])
            game2_qtable[state, action] += game2_learning_rate * (td_target - game2_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        rate_total_reward.append(total_rewards)
    game2_rewards.append(rate_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game2_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(learning_rate)
plt.title('GAME: TAXI, GRAPH: CHANGE IN SCORE WITH RATE')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('taxi_rate.png')
plt.show()

## Gamma

**Game 1**

In [None]:
# Learning Game 1 (FrozenLake): compare different discount factors (gamma).
total_episodes = 25000
learning_rate = 0.9
max_steps = 99
gamma = [0.80, 0.85, 0.90, 0.95]

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of gamma.
game1_rewards = []

for game1_gamma in gamma:
    gamma_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game1_qtable = np.zeros((game1_state_size, game1_action_size))
    epsilon = max_epsilon
    for episode in range(total_episodes):
        state = game1.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game1_qtable[state, :])
            else:
                action = game1.action_space.sample()

            new_state, reward, done, info = game1.step(action)

            # Q-learning update using the discount factor under test.
            td_target = reward + game1_gamma * np.max(game1_qtable[new_state, :])
            game1_qtable[state, action] += learning_rate * (td_target - game1_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        gamma_total_reward.append(total_rewards)
    game1_rewards.append(gamma_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game1_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(gamma)
plt.title('GAME: FROZEN LAKE, GRAPH: CHANGE IN SCORE WITH GAMMA')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('frozenLake_gamma.png')
plt.show()

**Game 2**

In [None]:
# Learning Game 2 (Taxi): compare different discount factors (gamma).
total_episodes = 40000
learning_rate = 0.9
max_steps = 99
gamma = [0.6, 0.618, 0.7, 0.8]

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
# One list of per-episode total rewards for each entry of gamma.
game2_rewards = []

for game2_gamma in gamma:
    gamma_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game2_qtable = np.zeros((game2_state_size, game2_action_size))
    epsilon = max_epsilon
    # Use this cell's own episode budget. The previous code iterated over
    # game2_total_episodes, a loop variable leaked from the episode-count
    # experiment, so the run length depended on which cells had been executed.
    for episode in range(total_episodes):
        state = game2.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game2_qtable[state, :])
            else:
                action = game2.action_space.sample()

            new_state, reward, done, info = game2.step(action)

            # Q-learning update using the discount factor under test.
            td_target = reward + game2_gamma * np.max(game2_qtable[new_state, :])
            game2_qtable[state, action] += learning_rate * (td_target - game2_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        gamma_total_reward.append(total_rewards)
    game2_rewards.append(gamma_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game2_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(gamma)
plt.title('GAME: TAXI, GRAPH: CHANGE IN SCORE WITH GAMMA')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('taxi_gamma.png')
plt.show()

## Decay Rate

**Game 1**

In [None]:
# Learning Game 1 (FrozenLake): compare different epsilon decay rates.
total_episodes = 25000
learning_rate = 0.9
max_steps = 99
gamma = 0.95

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = [0.1, 0.05, 0.01, 0.001]
# One list of per-episode total rewards for each entry of decay_rate.
game1_rewards = []

for game1_decay_rate in decay_rate:
    decay_rate_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game1_qtable = np.zeros((game1_state_size, game1_action_size))
    epsilon = max_epsilon
    for episode in range(total_episodes):
        state = game1.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game1_qtable[state, :])
            else:
                action = game1.action_space.sample()

            new_state, reward, done, info = game1.step(action)

            # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a').
            td_target = reward + gamma * np.max(game1_qtable[new_state, :])
            game1_qtable[state, action] += learning_rate * (td_target - game1_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon using the decay rate under test.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-game1_decay_rate*episode)
        decay_rate_total_reward.append(total_rewards)
    game1_rewards.append(decay_rate_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game1_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(decay_rate)
plt.title('GAME: FROZEN LAKE, GRAPH: CHANGE IN SCORE WITH DECAY RATE')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('frozenLake_decay_rate.png')
plt.show()

**Game 2**

In [None]:
# Learning Game 2 (Taxi): compare different epsilon decay rates.
# NOTE(review): total_episodes and gamma below match the FrozenLake cells,
# while the other Taxi cells use 40000 episodes and gamma = 0.618 - confirm
# whether these values are intended for this experiment.
total_episodes = 25000
learning_rate = 0.9
max_steps = 99
gamma = 0.95

# Epsilon-greedy exploration schedule.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = [0.1, 0.05, 0.01, 0.001]
# One list of per-episode total rewards for each entry of decay_rate.
# This must reset game2_rewards (not game1_rewards): otherwise the results are
# appended after those of the previous Taxi experiment and the plot below
# shows the old curves under the decay-rate labels.
game2_rewards = []

for game2_decay_rate in decay_rate:
    decay_rate_total_reward = []
    # Each configuration starts from an untrained table and full exploration.
    # Without resetting epsilon, the first episode of every later run would
    # reuse the almost fully decayed value left over from the previous run.
    game2_qtable = np.zeros((game2_state_size, game2_action_size))
    epsilon = max_epsilon
    for episode in range(total_episodes):
        state = game2.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            exp_exp_tradeoff = random.uniform(0, 1)

            # Exploit the current table when the draw exceeds epsilon, otherwise explore.
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(game2_qtable[state, :])
            else:
                action = game2.action_space.sample()

            new_state, reward, done, info = game2.step(action)

            # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a').
            td_target = reward + gamma * np.max(game2_qtable[new_state, :])
            game2_qtable[state, action] += learning_rate * (td_target - game2_qtable[state, action])

            total_rewards += reward

            state = new_state
            if done:
                break

        # Exponentially decay epsilon using the decay rate under test.
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-game2_decay_rate*episode)
        decay_rate_total_reward.append(total_rewards)
    game2_rewards.append(decay_rate_total_reward)

# plotting
order = [[0, 'r'], [1, 'b'], [2, 'g'], [3, 'y']]
plt.figure(figsize=(10, 10))
for i, color in order:
    rewards = game2_rewards[i]
    # Running mean of the reward over all episodes so far; the first 100 points are skipped.
    running_mean = np.cumsum(rewards)[100:] / np.arange(101, len(rewards) + 1)
    plt.plot(running_mean, color=color)
plt.legend(decay_rate)
plt.title('GAME: TAXI, GRAPH: CHANGE IN SCORE WITH DECAY RATE')
plt.xlabel('ITERATION')
plt.ylabel('SCORE')
plt.savefig('taxi_decay_rate.png')
plt.show()