In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import math
import pandas as pd



In [2]:
# Example 1
# Simplest ratio
# There are sequence Matching is a bit slow so we can install Levenshetein library to make this process go a bit faster
print(fuzz.ratio("Hello World!", "Helloworld"))

82


In [3]:
# Example 2
# Token Sort Ratio
simple = fuzz.ratio("This ratio isn't that diffferent", "This ratio isn't ReAlly tHaT DifFerent")
print(simple)
sort = fuzz.token_sort_ratio("This ratio isn't that diffferent", "ratio This isn't tHaT DifFerent ReAlly")
print(sort)

77
89


In [4]:
#Example 3
print(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
print(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))

84
100


In [5]:
choices = ["Pepperoni Pizza, Cheese Pizza, Double Cheeseburger, Italian Hotdogs"]
print(process.extract("pizza", choices, limit=2))

process.extractOne("cheese", choices)

[('Pepperoni Pizza, Cheese Pizza, Double Cheeseburger, Italian Hotdogs', 60)]


('Pepperoni Pizza, Cheese Pizza, Double Cheeseburger, Italian Hotdogs', 60)

Demonstration of the FuzzyWuzzy library in the Online Travel Agency (OTA). OTA like Expedia and Booking.com denote a room in the same hotel with different nomenclature. For example, one room in the same hotel is called 'Studio, 1 King Bed with Sofa bed, Corner' by Expedia while Booking.com may show it as 'Corner King Studio'. This can create confusion for the customers when they are comparing room rate between OTAs or specifically comparing prices of a room at two different OTAs. 

One of the most consistently frustating issues for price comparison websites and app is trying to figure out whether two items (or hotel rooms) are for the same thing, automatically. We are using the dataset of the room types from two Online Travel Agencies Expedia and Booking.com. Using this dataset, we would implement FuzzyWuzzy to match records of the room type between two OTAs. 

In [6]:
# read the room_type data in pandas dataframe
room_type = pd.read_csv('room_type.csv')

In [7]:
# view the data
room_type.head()

Unnamed: 0,Expedia,Booking.com
0,"Deluxe Room, 1 King Bed",Deluxe King Room
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room


In [8]:
# analysing which function gives us the precise results for string matching between room types

# using the ratio function to check the similarity 
fuzz.ratio('Deluxe Room, 1 King Bed','Deluxe King Room')

62

This shows that the 'Deluxe Room, 1 King Bed' and 'Deluxe King Room' are about 62% similar

In [9]:
# lets check another pair
fuzz.ratio('Room, 2 Double Beds (19th to 25th Floors)', 'Two Double Beds - Location Room (19th to 25th Floors)')

74

This shows that 'Room, 2 Double Beds (19th to 25th Floors)'and 'Two Double Beds - Location Room (19th to 25th Floors)' are 74% similar. It seems that this naive approach is far too sensitive to minor differences in word order, missing or extra words and other. So we should choose function that ignores word order, duplicated words and be little bit more flexible. So we choose to use the token_set_ratio to find the optimized string match results

In [10]:
# create a function that takes the dataset and 
# compare the room_type in Expedia to that of Booking.com to find the similarity value

def similarity_ratio (row):
    room_name1 = row['Expedia']
    room_name2 = row['Booking.com']
    similarity = fuzz.token_set_ratio(room_name1, room_name2) # compute the match score between expedia and booking.com room type
    return similarity # return the similarity score

In [11]:
# append column similarity to room_type dataframe
room_type['similarity'] = room_type.apply(similarity_ratio,axis = 1)

In [12]:
# view the dataframe with the similarity ratio
room_type

Unnamed: 0,Expedia,Booking.com,similarity
0,"Deluxe Room, 1 King Bed",Deluxe King Room,100
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible,81
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room,100
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite,100
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room,100
5,"Traditional Double Room, 2 Double Beds",Double Room with Two Double Beds,78
6,"Room, 1 King Bed, Accessible",King Room - Disability Access,72
7,"Deluxe Room, 1 King Bed",Deluxe King Room,100
8,Deluxe Room,Deluxe Room (Non Refundable),100
9,"Room, 2 Double Beds (19th to 25th Floors)",Two Double Beds - Location Room (19th to 25th ...,97


In [13]:
# compute the percentage of the pairs exceeding a match score of 70
similarity_percent = len(room_type[room_type.apply(similarity_ratio, axis = 1) > 70])/len(room_type)*100

In [14]:
similarity_percent

90.29126213592234

In [15]:
print('Over ', math.floor(similarity_percent), '% of the pairs exceed a match score of 70 when setting ratio > 70')

Over  90 % of the pairs exceed a match score of 70 when setting ratio > 70
