In [8]:
import numpy as np
import random
import gym

In [9]:
# Create the Taxi environment (registered in OpenAI Gym under the id "Taxi-v2")
env = gym.make("Taxi-v2")
# Print a text rendering of the environment's current state
env.render()

+---------+
|R: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [10]:
# Tabular Q-learning needs the sizes of the (discrete) action and observation
# spaces: the Q-table built later has one row per state and one column per action.
state_size = env.observation_space.n
action_size = env.action_space.n

print("Action size: ", action_size)
print("State size ", state_size)


Action size:  6
State size  500


In [11]:
# Initialize the Q-table with zeros: one row per state, one column per action,
# so qtable[state, action] holds the current value estimate for that pair.
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [12]:
# Hyperparameters
total_episodes = 50000        # Number of training episodes
total_test_episodes = 100     # Number of evaluation (greedy) episodes
max_steps = 99                # Max steps per episode; longer episodes are cut off

learning_rate = 0.7           # Learning rate: step size of each Q-value update
gamma = 0.618                 # Discounting rate applied to the best next-state Q-value

# Exploration parameters (epsilon-greedy action selection)
epsilon = 1.0                 # Current exploration rate; recomputed after every training episode
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.01             # Exponential decay rate for exploration prob

In [14]:
# Q-learning training loop.
# Runs `total_episodes` episodes of at most `max_steps` steps each, picks
# actions epsilon-greedily and updates `qtable` in place after every step.
# NOTE: this cell mutates `qtable`, so re-running it continues training from
# the current table instead of starting from zeros.
for episode in range(total_episodes):
    # Start every episode from a freshly reset environment
    state = env.reset()

    for step in range(max_steps):
        # Epsilon-greedy choice: draw a number in [0, 1] and compare to epsilon
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: best known action for the current state
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: random action
            action = env.action_space.sample()

        # Take the action and observe the outcome
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        # Q(s,a) := Q(s,a) + learning_rate * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # Move on to the next state
        state = new_state

        if done:
            break

    # Decay epsilon exponentially with the episode index, from max_epsilon
    # towards min_epsilon, so later episodes exploit more and explore less
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

        
        

In [15]:
# Evaluate the learned Q-table on Taxi by always acting greedily (no exploration).
rewards = []

for episode in range(total_test_episodes):
    # Each evaluation episode starts from a freshly reset environment
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        env.render()
        # Greedy policy: take the highest-valued action for the current state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode, including those cut off at max_steps. Recording
    # only finished episodes while dividing by total_test_episodes would count
    # unfinished episodes as a score of 0 and overstate the average.
    rewards.append(total_rewards)
    print("Score: ", total_rewards)

env.close()
print("Score over time: "  + str(sum(rewards)/total_test_episodes))

+---------+
|R: | :[43m [0m:G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

+---------+
|R: |[43m [0m: :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : 