In [1]:
import gym
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import random

#for text processing
#import spacy
import re
import pandas as pd
env = gym.make("Taxi-v3").env

env.render()

+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



#### There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop him off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions."

### Fetching Origing, Destination, and Time of Pickup from the sms data 

In [2]:
def fetch_pickup_drop(text,loc_dict):
    
    #Write your code here
    match = re.findall(r'dwarka sector 21|dwarka sector 23|hauz khaas|airport', text)
    at_match = re.findall(r'({0}(?:\s\w+\s\w+)?)'.format('at'), text)
    src = ''
    dest = ''
    if(match):
        for val in match:
            at = at_match[0].split()
            at = at[1]+" "+at[2]
            time_of_pickup = at
            word = before(text,str(val))
            if("from" in word):
                src = val
            elif("to" in word):
                dest = val
            elif("for" in word):
                dest = val
        if (src == ''):
            return []
        elif (dest == ''):
            return []
        else:
            return [src, dest, time_of_pickup]
    else:
        return []


In [3]:
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


## Summing up the Q-Learning Process
Breaking it down into steps, we get

Initialize the Q-table by all zeros.

Start exploring actions: 

For each state, select any one among all possible actions for the current state (S).

Travel to the next state (S') as a result of that action (a).

For all possible actions from the state (S') select the one with the highest Q-value.

Update Q-table values using the equation.

Set the next state as the current state.

If goal state is reached, then end and repeat the process.


## Exploiting learned values
After enough random exploration of actions, the Q-values tend to converge serving our agent as an action-value function which it can exploit to pick the most optimal action from a given state.

There's a tradeoff between exploration (choosing a random action) and exploitation (choosing actions based on already learned Q-values). We want to prevent the action from always taking the same route, and possibly overfitting, so we'll be introducing another parameter called ϵ "epsilon" to cater to this during training.

Instead of just selecting the best learned Q-value action, we'll sometimes favor exploring the action space further. Lower epsilon value results in episodes with more penalties (on average) which is obvious because we are exploring and making random decisions.

In [4]:
#Initialize Q_table
import numpy as np
#write your code here

q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [5]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.9
gamma = 0.9
epsilon = 0.01

# For plotting metrics
all_epochs = []
all_penalties = []

##Write your code here
for i in range(1, 10000):
    state = env.reset()
    
    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

np.save("./q_table.npy", q_table)

Episode: 9900
Training finished.

Wall time: 6.71 s


In [6]:
#Load trained q_table for evaluation

q_table = np.load("./q_table.npy")

In [7]:
def create_loc_dict(city_df):
    loc_dict = {}
    ## Create dictionary example, loc_dict['dwarka sector 23] = 0
    loc_dict = city.set_index('location').T.to_dict('list')
    return loc_dict

In [8]:
def check_pick_up_drop_correction(pick_up, drop, org_df):
    #write your code here
    df = []
    df = org_df.loc[(org_df['origin'] == pick_up) & (org_df['dest'] == drop)].values[0]
    if(df == []):
        return False
    else:
        return True

In [9]:
def before(value, a):
    # Find first part and return slice before it.
    pos_a = value.find(a)
    if pos_a == -1: return ""
    return value[pos_a-5:pos_a]

In [10]:
"""Evaluate agent's performance after Q-learning"""

# 1) We need to take text drom "sms.txt" and fetch pickup and drop from it.
# 2) Generate the random state from an enviroment and change the pick up and drop as the fetched one
# 3) Evaluate you q_table performance on all the texts given in sms.txt.
# 4) Have a check if the fetched pickup, drop is not matching with original pickup, drop using orig.csv
# 5) If fetched pickup or/and drop does not match with the original, add penality and reward -10
# 6) Calculate the Total reward, penalities, Wrong pickup/drop predicted and Average time steps per episode.

total_epochs, total_penalties, total_reward, wrong_predictions = 0, 0, 0, 0

count = 0
time_list = []
f = open("./sms.txt", "r")
num_of_lines = 1000
city = pd.read_csv("./city.csv")

loc_dict = create_loc_dict(city)

org_df = pd.read_csv("./org_df.csv")

line_num = 0

taxi_row = random.randrange(0, 5, 1)
taxi_column = random.randrange(0, 5, 1)
frames = [] # for animation

for line in f:
    journey = []
    journey = fetch_pickup_drop(line,loc_dict)
    if(journey != []):
        pick_up = [val for key , val in loc_dict.items() if journey[0] == key][0]
        drop = [val for key , val in loc_dict.items() if journey[1] == key][0]
        state = env.encode(taxi_row, taxi_column, pick_up[0], drop[0])
        env.s = state
        epochs, penalties, reward = 0, 0, 0
        
        done = False
        while not done:
            action = np.argmax(q_table[state])
            state, reward, done, info = env.step(action)

            if reward == -10:
                penalties += 1

            epochs += 1
            
            # Put each rendered frame into dict for animation
            frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
                }
            )
            
        # check if pickup and drop are same as original 
        if (check_pick_up_drop_correction(journey[0], journey[1], org_df) == False):
                penalties += 10
                reward += -10
                wrong_predictions += 1
                
        total_penalties += penalties
        total_epochs += epochs
        total_reward += reward
        num_of_lines += 1

print(f"Results after {num_of_lines} episodes:")
print(f"Average timesteps per episode: {total_epochs / num_of_lines}")
print(f"Average penalties per episode: {total_penalties / num_of_lines}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)

  """


Results after 1825 episodes:
Average timesteps per episode: 6.780821917808219
Average penalties per episode: 0.0
Total number of wrong predictions 0
Total Reward is 16500


In [11]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 12375
State: 475
Action: 5
Reward: 20
