# Imports

In [1]:
import gym
import random
import numpy as np

# Creating Environment

In [2]:

# DeprecatedEnv: Env Taxi-v2 not found (valid versions include ['Taxi-v3'])
env = gym.make('Taxi-v3')

"""
This command will display a popup window. Since it is written within a loop,
an updated popup window will be rendered for every new action taken in each step.
"""
env.render() 


+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
action_size = env.action_space.n
print("Action size ", action_size)


Action size  6


In [4]:
state_size = env.observation_space.n
print("State size ", state_size)

State size  500


# Q-table

In [5]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


# Hyperparameters

In [14]:
total_episodes = 10000       
total_test_episodes = 100     
max_steps = 99

learning_rate = 0.7     # learning rate
gamma = 0.318           #discount rate

# Exploration parameters
epsilon = 1.0                
max_epsilon = 1.0             
min_epsilon = 0.01            
decay_rate = 0.01            


# Q-learning

In [16]:
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_steps):
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0,1)
        
        # if random > epsilon then exploitation
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:]) 
            
        # else exploration
        else:
            action = env.action_space.sample()
        
        new_state, reward, done, info = env.step(action)
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma *
                                    np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        if done == True:
            break
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
            

In [17]:
env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_steps):
#         env.render()
        action = np.argmax(qtable[state,:])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            rewards.append(total_rewards)
            print (total_rewards)
            break
        state = new_state
env.close()
print("Score: " + str(sum(rewards)/total_test_episodes))

11
8
7
7
14
11
10
11
9
5
9
7
10
9
8
10
8
11
8
11
7
10
6
8
8
8
6
3
6
6
8
10
9
10
10
13
9
12
14
12
9
7
Score: 3.75


#            -------------------------- Thank you ------------------------