## Q-learning with OpenAI Gym Taxi-v3

In [1]:
# import libraries
import numpy as np
import gym
import random
from IPython.display import clear_output
from IPython.display import Video
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
############### create the environment ##########
# Build the Taxi-v3 environment; `env` is shared by every later cell.
env = gym.make("Taxi-v3")
# Put the environment into a starting state (the returned state is not needed here).
env.reset()
# Draw the current grid as text in the cell output.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[43mB[0m: |
+---------+



In [6]:
# Flip through a series of freshly reset start states, redrawing the grid
# in place so the output behaves like a short animation.
n_previews = 20      # how many start states to show
preview_pause = 0.5  # seconds each frame stays on screen

for _frame in range(n_previews):
    clear_output(wait=True)   # replace the previous frame instead of appending
    env.reset()
    env.render()
    time.sleep(preview_pause)

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [14]:
# Transition table: the possible outcomes of every action from each state (uncomment to inspect)
#env.P 

In [7]:
# Sizes of the discrete state and action spaces reported by the environment
state_size, action_size = env.observation_space.n, env.action_space.n
print("No of actions : ", action_size)
print("No of states : ", state_size)

# Initial Q-table: one row per state, one column per action, all zeros
qtable = np.zeros(shape=(state_size, action_size))
print('\n', qtable)

No of actions :  6
No of states :  500

 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [8]:
######### hyperparameters ############

# --- episode budget ---
tot_episodes = 50000          # number of training episodes
tot_test_episodes = 100       # number of evaluation episodes
max_steps = 99                # step cap per episode

# --- Q-learning update ---
learning_rate, gamma = 0.7, 0.618   # step size, discount rate

# --- epsilon-greedy exploration schedule ---
max_eps = 1.0                 # exploration probability at the start
min_eps = 0.01                # floor for the exploration probability
decay_rate = 0.01             # exponential decay rate of the exploration probability
epsilon = max_eps             # current exploration probability (starts at the maximum)

In [9]:
########### Q-learning algorithm ##########
# Trains `qtable` in place with epsilon-greedy tabular Q-learning.
# Reads:  env, qtable, tot_episodes, learning_rate, gamma,
#         epsilon, max_eps, min_eps, decay_rate
# Writes: qtable (updated entry by entry), epsilon (re-computed after every episode)

for episode in range(tot_episodes):
    state = env.reset()  # start a new episode
    done = False

    # An episode runs until the environment reports `done`.
    # NOTE(review): no explicit step cap is applied here (max_steps is not used);
    # this relies on the environment ending episodes itself - confirm.
    while not done:
        # Choose an action 'a' from the current state 's' (epsilon-greedy).
        if random.uniform(0, 1) > epsilon:
            # Exploitation: best known action for this state
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: random choice
            action = env.action_space.sample()

        # Take the action 'a'.
        new_state, reward, done, info = env.step(action)

        # Update the Q-value:
        # Q(s,a) := Q(s,a) + l_rate * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        # The bootstrap term must be the best VALUE in the next state (np.max),
        # not the INDEX of the best action (np.argmax).
        td_target = reward + gamma * np.max(qtable[new_state])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # Move on to the new state.
        state = new_state

    # Reduce epsilon -> less exploration as training progresses
    # (exponential decay from max_eps towards min_eps).
    epsilon = min_eps + (max_eps - min_eps) * np.exp(-decay_rate * episode)

In [78]:
############# Play Taxi ##############
# Evaluate the greedy policy read off `qtable`.
# An episode's return is stored in `rewards` only when the episode reaches
# `done` within `max_steps` steps.
env.reset()
rewards = []

for test_episode in range(tot_test_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    for step in range(max_steps):
        # Always exploit: take the highest-valued action for the current state.
        action = np.argmax(qtable[state, :])
        state, reward, done, info = env.step(action)
        total_reward += reward

        if done:
            rewards.append(total_reward)
            break

env.close()
#print("score over time: ", str(sum(rewards)/tot_test_episodes))

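***The following cells first run a purely random agent as a baseline, and then train with Q-learning using greedy action selection only, i.e. without generating a random number to choose between exploration and exploitation.***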

In [117]:
######### without using Q learning
# Baseline: act uniformly at random until a step yields a reward of 20,
# counting the steps taken and summing the rewards collected on the way.

state = env.reset()   # a new episode
counter, g = 0, 0     # steps taken, total reward
reward = None

# NOTE(review): `done` is not checked, so the loop may keep stepping after the
# environment has signalled the end of an episode - confirm this is intended.
while True:
    state, reward, done, info = env.step(env.action_space.sample())
    counter += 1
    g += reward
    if reward == 20:   # 20 is the max reward the agent gets, used as the success signal
        break

print("Solved in {} steps with a total reward of {}".format(counter, g))

Solved in 349 steps with a total reward of -1165


In the above case, the agent took many steps and its final total reward is negative. Therefore, we use Q-learning to select the optimal action for a given state.

In [10]:
######### only one episode
# Start from an all-zero table and run one episode of Q-learning in which the
# action is always the current argmax of Q (no exploration draw, no discount).
Q = np.zeros((state_size, action_size))

episodes = 1
G = 0           # total reward collected in the episode
alpha = 0.618   # learning rate

for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    firstState = state = env.reset()   # remember where the episode started
    print("Initial state = {}".format(state))

    # Keep going until a step pays out the reward of 20.
    while True:
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)

        # Move Q(s, a) towards reward + best value of the next state.
        td_error = reward + np.max(Q[new_state]) - Q[state, action]
        Q[state, action] += alpha * td_error

        G += reward
        state = new_state
        if reward == 20:
            break

Initial state = 344


In [11]:
# Show the value of `state` left behind by the most recently executed training loop.
print("final state = {}".format(state))

final state = 0


In [13]:
# Q-values learned for the state the single training episode started in.
# The start state is random, so index with the recorded `firstState`
# rather than a number copied from one particular run.
Q[firstState]

array([-2.12450721, -1.94418103, -1.854     , -1.32618103, -6.18      ,
       -6.18      ])

In [19]:
###############  for multiple episodes
# Q is not re-initialised here, so training continues from the existing table.
# Uncomment the next line to start again from zeros.
#Q = np.zeros([state_size, action_size])

episodes = 3000
alpha = 0.618          # learning rate
reward_tracker = []    # total reward of every episode, in order
G = 0

for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    G, reward = 0, 0

    # Greedy rollout with an online Q update, until the environment says done.
    while not done:
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)
        best_next_value = np.max(Q[new_state])
        Q[state, action] += alpha * (reward + best_next_value - Q[state, action])
        G += reward
        state = new_state

    reward_tracker.append(G)

    # Progress report every 100 episodes.
    if episode % 100 == 0:
        print('Episode {} total reward: {}'.format(episode, G))

print("Total reward agent received = ", sum(reward_tracker))

Episode 100 total reward: 10
Episode 200 total reward: 9
Episode 300 total reward: 6
Episode 400 total reward: 6
Episode 500 total reward: 12
Episode 600 total reward: 7
Episode 700 total reward: 4
Episode 800 total reward: 6
Episode 900 total reward: 9
Episode 1000 total reward: 5
Episode 1100 total reward: 5
Episode 1200 total reward: 10
Episode 1300 total reward: 6
Episode 1400 total reward: 10
Episode 1500 total reward: 5
Episode 1600 total reward: 9
Episode 1700 total reward: 11
Episode 1800 total reward: 6
Episode 1900 total reward: 6
Episode 2000 total reward: 5
Episode 2100 total reward: 10
Episode 2200 total reward: 11
Episode 2300 total reward: 7
Episode 2400 total reward: 11
Episode 2500 total reward: 12
Episode 2600 total reward: 6
Episode 2700 total reward: 8
Episode 2800 total reward: 7
Episode 2900 total reward: 10
Episode 3000 total reward: 9
Total reward agent received =  23472


In [27]:
### testing  - Play Taxi
# Roll out one episode with the greedy policy from Q, redrawing the grid
# after every step so the run can be watched in the cell output.
state = env.reset()
done = False

while not done:
    action = np.argmax(Q[state])                     # best known action
    state, reward, done, info = env.step(action)
    clear_output(wait=True)                          # overwrite the previous frame
    env.render()
    time.sleep(1)                                    # one second per frame

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)


In [36]:
# Embed the video file "taxi-agent.mp4" in the cell output.
# NOTE(review): the path is relative, so the file presumably has to sit in the notebook's working directory - confirm.
Video("taxi-agent.mp4" )