# Q Learning with OpenAI Taxi-V2
<h3>___________________________________________</h3>

## Importing required dependencies

- numpy for our Q-Table
- OpenAI Gym for taxi environment
- Random to generate random numbers

In [12]:
import numpy as np
import gym
import random

## Creating the environment

Now we'll create the Taxi environment with the help of <b>OpenAI Gym</b>, which provides many environments that we can use to train our agents. In this game, the goal is for the taxi to drive from its spawn point to the pick-up point, pick up the passenger, and drop them off at the destination as quickly as possible.

In [17]:
# Identifier of the Gym environment used throughout this notebook.
TAXI_ENV_ID = "Taxi-v2"

# Build the environment and draw its current grid once as a sanity check.
env = gym.make(TAXI_ENV_ID)
env.render()

+---------+
|[35mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



## Creating the Q-Table 

To create a Q-Table we need to know the number of states, which will be our rows, and the number of actions, which will be our columns. In OpenAI Gym, `env.observation_space.n` and `env.action_space.n` give us these two values.

In [3]:
# The Q-table dimensions are read from the environment's spaces:
# states become the rows, actions become the columns.
state_size = env.observation_space.n
action_size = env.action_space.n

print("Action Size: ", action_size)
print("State Size: ", state_size)

Action Size:  6
State Size:  500


We initialize every value in the Q-Table to zero with the help of numpy.

In [4]:
# One row per state and one column per action; every entry starts at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable.shape)
print(qtable)

(500, 6)
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


## Creating Hyperparameters

These are the variables we tune to control the training of our algorithm.

In [6]:
# --- Episode budget ---
total_episodes = 50000       # number of training episodes
total_test_episodes = 100    # number of evaluation episodes
max_steps = 99               # upper bound on steps taken within one episode

# --- Q-learning update ---
learning_rate = 0.5          # alpha: step size of each Q-value update
gamma = 0.618                # discount applied to the next state's best Q-value

# --- Exploration schedule (epsilon-greedy) ---
max_epsilon = 1.0            # exploration rate at the start of training
min_epsilon = 0.01           # floor the exploration rate decays towards
decay_rate = 0.01            # exponential decay rate applied per episode
epsilon = max_epsilon        # current exploration rate; starts at the maximum

## Q learning algorithm

- Initialize Q-values (Q(s,a)) arbitrarily for all state-action pairs
- for (life or until learning is stopped):
- - Choose an action(a) in the current world state(s) based on current Q-value estimates Q(s,a)
- - Take the action(a) and observe the outcome state(s') and reward(r)
- - Update Q(s,a) := Q(s,a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s,a)]

In [8]:
# Q-learning training loop: run `total_episodes` episodes, updating `qtable`
# in place after every environment step and decaying `epsilon` after every
# episode.
for episode in range(total_episodes):
    # Start each episode from a freshly reset environment.
    state = env.reset()

    for step in range(max_steps):
        # Epsilon-greedy action selection: draw a number in [0, 1] and compare
        # it with the current exploration rate.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: take the action with the highest Q-value in this state.
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: take a randomly sampled action.
            action = env.action_space.sample()

        # Take the action a, and observe the outcome state s' and reward r.
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state

        # Stop stepping as soon as the environment reports the episode is over.
        if done:
            break

    # Reduce epsilon (as training goes on we need less and less exploration).
    # The schedule is driven by the number of completed episodes, i.e.
    # `episode + 1`; the loop variable itself is left untouched because
    # reassigning it inside a `for` loop has no effect on the iteration.
    completed_episodes = episode + 1
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * completed_episodes)

## Testing the Q-Table

In [10]:
# Evaluate the learned Q-table: play `total_test_episodes` episodes, always
# taking the highest-valued action, and report the mean total reward.
rewards = []

for episode in range(total_test_episodes):
    # Every test episode starts from a freshly reset environment.
    state = env.reset()
    total_rewards = 0
    print("**" * 10)
    print("Episode {}".format(episode))

    for step in range(max_steps):
        env.render()
        # Choose the action with the max expected future reward in this state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            break
        state = new_state

    # Record the score of EVERY episode, including ones that ran out of steps
    # without finishing; otherwise those episodes would silently count as a
    # score of 0 in the average below.
    rewards.append(total_rewards)
    print("Score: ", total_rewards)

env.close()

# Average over the episodes actually recorded; guard against an empty run so
# that total_test_episodes == 0 does not raise ZeroDivisionError.
if rewards:
    print("Score over time: " + str(sum(rewards) / len(rewards)))
else:
    print("Score over time: no test episodes were run")
    

********************
Episode 0
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | :[42m_[0m:[35mG[

  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
Score:  4
********************
Episode 62
+---------+
|R: | : :[35mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
+------

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : : : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
|[42m_[0m: : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
Score:  9
********************
Episode 99
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|[35mB[0m: |
+---------+

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
|