In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
import pandas as pd
# Remote CSV read directly over HTTPS by pandas (hotel reviews, judging by the
# file name and the `review` column used below -- TODO confirm the schema).
filepath = "https://raw.githubusercontent.com/hamzafarooq/maven-mlsystem-design-cohort-1/main/data/miami_hotels.csv"

# Raw table; later cells only read from it.
data = pd.read_csv(filepath)


In [None]:
# The corpus searched by every method below: the `review` column as a NumPy array.
# NOTE(review): assumes every entry is a string (no missing values) -- later cells
# call `.lower()` on each element.
documents = data.review.values

In [None]:
def get_results(query, query_vector, word_vectors, docs=None):
    """Rank documents by cosine similarity to a query vector.

    Parameters
    ----------
    query : str
        Query text; only used for the printed header.
    query_vector : array-like or sparse matrix of shape (1, n_features)
        Vectorised query. Only the first row is used.
    word_vectors : array-like or sparse matrix of shape (n_docs, n_features)
        One row per document, in the same order as ``docs``.
    docs : sequence of str, optional
        Documents being ranked. Defaults to the notebook-level ``documents``
        so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``documents`` and ``similarity``, sorted by descending
        similarity. The index keeps each document's original position.
    """
    if docs is None:
        # Backward-compatible default: the corpus loaded at the top of the notebook.
        docs = documents
    # cosine_similarity returns shape (n_queries, n_docs); there is one query row.
    similarities = cosine_similarity(query_vector, word_vectors)[0]
    print(f"Results for query '{query}'")
    ranked = pd.DataFrame(data={"documents": docs, "similarity": similarities})
    return ranked.sort_values("similarity", ascending=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF model with English stop words removed; fitted once on the whole corpus.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Sparse document-term matrix, one row per entry of `documents`.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

def sklearn_search(query):
  """Rank the corpus against `query` using the fitted TF-IDF model.

  The query is projected into the vocabulary learned by `tfidf_vectorizer`
  and compared with every row of `tfidf_matrix` via `get_results`, whose
  sorted DataFrame is returned unchanged.
  """
  vectorised_query = tfidf_vectorizer.transform([query])
  ranking = get_results(query, vectorised_query, tfidf_matrix)
  return ranking

In [None]:
# Single free-text query shared by every retrieval method compared below.
query = 'near the beach'

In [None]:
# Full ranking for the query; reset_index gives rows the positions 0..n-1 in rank
# order, which the side-by-side comparison tables below rely on.
tfidf_result = sklearn_search(query).reset_index(drop=True)
tfidf_result

Results for query 'near the beach'


Unnamed: 0,documents,similarity
0,"This is my second time coming to this hotel, a...",0.435463
1,"The location is unbeatable, so near to the bea...",0.378574
2,The staff team is amazing! We loved our room a...,0.378527
3,"Great location, room was as expected. Big and ...",0.367309
4,Really reccomend it. Very nice and clean hotel...,0.354757
...,...,...
2506,Lautaro was great …he took care of our order a...,0.000000
2507,Perfect getaway with pleasant staff members! ...,0.000000
2508,James at the pool was the bomb! Service was im...,0.000000
2509,The service was amazing ! Ricardo kept checkin...,0.000000


In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi

corpus = documents

# Lower-case each review and split on any run of whitespace. `split()` with no
# argument (unlike `split(" ")`) never yields empty-string tokens and also
# breaks on the newlines/tabs that appear inside some reviews.
tokenized_corpus = [doc.lower().split() for doc in corpus]

# BM25 index over the tokenised reviews; row order matches `corpus`.
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
k = 10  # number of top-ranked reviews kept per method in the comparison tables

# Lower-case the query like the corpus tokens. `split()` with no argument avoids
# the empty-string tokens that `split(' ')` produces for repeated/trailing spaces.
tokenized_query = query.lower().split()
# List of the k best-scoring reviews (raw text), best first.
bm25_results = bm25.get_top_n(tokenized_query, corpus, n=k)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# "Simple" TF-IDF baseline: lower-casing is switched off and no stop-word list
# is passed, in contrast to `tfidf_vectorizer` above.
tfidf_simple= TfidfVectorizer(lowercase =False)
tfidf_simple_matrix = tfidf_simple.fit_transform(documents)
# Note: this rebinds the notebook-level name `query_vector`.
query_vector = tfidf_simple.transform([query])

# Full ranking, re-indexed 0..n-1 in rank order for the comparison tables.
tfidf_simple_result = get_results(query, query_vector, tfidf_simple_matrix).reset_index(drop = True)

Results for query 'near the beach'


In [None]:
# Side-by-side top-k texts per method. The two TF-IDF Series carry index 0..k-1
# (from reset_index when they were built), which lines up with the positional
# k-element BM25 list.
pd.DataFrame(data = {"TF-IDF-S":tfidf_simple_result[:k].documents,"TF-IDF":tfidf_result[:k].documents, "BM25":bm25_results})

Unnamed: 0,TF-IDF-S,TF-IDF,BM25
0,"This is my second time coming to this hotel, a...","This is my second time coming to this hotel, a...",The staff team is amazing! We loved our room a...
1,The staff team is amazing! We loved our room a...,"The location is unbeatable, so near to the bea...","The Urbanica Fifth it's amazing place, the loc..."
2,Really reccomend it. Very nice and clean hotel...,The staff team is amazing! We loved our room a...,Really reccomend it. Very nice and clean hotel...
3,"The Urbanica Fifth it's amazing place, the loc...","Great location, room was as expected. Big and ...",The staff was super friendly and accommodating...
4,Everything is great but could have a little la...,Really reccomend it. Very nice and clean hotel...,"Great location, room was as expected. Big and ..."
5,"The location is unbeatable, so near to the bea...","Quaint, lovely hotel in the center of everythi...",Thanks to Paola and Marcel first of all.\nSimp...
6,My friends and I came for spring break and thi...,"The Urbanica Fifth it's amazing place, the loc...",Everything is great but could have a little la...
7,The staff was super friendly and accommodating...,The staff was super friendly and accommodating...,"This is my second time coming to this hotel, a..."
8,This hotel is in an excellent location near ma...,Everything is great but could have a little la...,Best location in Maimi ocean drive. The rooms ...
9,VERY accommodating. We had an early flight in...,My friends and I came for spring break and thi...,It was an amazing check-in and information on ...


In [None]:
import re

def process_text(text):
  """Normalise text for bag-of-words: lower-case it, then replace every
  character other than a-z or a space with a single space.

  Length is preserved, so punctuation next to a space leaves consecutive spaces.
  """
  return re.sub(r'[^a-z ]', ' ', text.lower())

def vocab_parser(corpus):
  """Build a token -> column-index mapping from an iterable of documents.

  Tokens come from joining the documents with spaces, normalising with
  `process_text`, and splitting on single spaces. Indices follow the sorted
  token order, so the mapping is identical on every run (iterating a set of
  strings directly is not stable across interpreter sessions).

  The empty string, produced by consecutive spaces, is kept as an entry so
  that looking up tokens obtained by splitting processed text on ' ' never
  misses.
  """
  # process_text already strips everything outside [a-z ]; no second pass needed.
  tokens = process_text(' '.join(corpus)).split(' ')
  return {token: index for index, token in enumerate(sorted(set(tokens)))}

In [None]:
# Token -> column index for the bag-of-words vectors, built from the whole corpus.
vocab = vocab_parser(documents)

def compute_bow(text, vocab):
  """Return the bag-of-words count vector of `text` over `vocab`.

  Parameters
  ----------
  text : str
      Raw text; it is normalised with `process_text` before counting.
  vocab : dict[str, int]
      Token -> column index mapping.

  Returns
  -------
  numpy.ndarray
      1-D float array of length ``len(vocab)`` holding per-token counts.

  Tokens missing from `vocab` (e.g. query words never seen in the corpus) are
  ignored instead of raising KeyError. Empty strings are never counted as
  words, because the text is split on runs of whitespace.
  """
  vector = np.zeros(len(vocab))

  # split() with no argument drops the empty strings that consecutive spaces
  # (left behind by stripped punctuation) would otherwise produce.
  for token in process_text(text).split():
    index = vocab.get(token)
    if index is not None:
      vector[index] += 1

  return vector

In [None]:
# Dense document-term count matrix: one row per review, one column per vocabulary
# entry. Memory grows as n_documents x vocabulary size.
vectors = np.array([compute_bow(document, vocab) for document in documents])

In [None]:
def bow_search(query):
  """Rank the corpus against `query` using raw bag-of-words counts.

  The query is turned into a (1, vocabulary-size) count vector and compared
  with every row of `vectors` by cosine similarity. Returns the DataFrame from
  `get_results`, re-indexed 0..n-1 in rank order.

  `compute_bow` normalises the text itself, so the query is passed through
  untouched; the printed header therefore shows the query as the caller wrote it.
  """
  query_vector = compute_bow(query, vocab).reshape(1, -1)
  return get_results(query, query_vector, vectors).reset_index(drop=True)

In [None]:
# Full bag-of-words ranking for the shared query (already re-indexed by bow_search).
bow_results = bow_search(query)

Results for query 'near the beach'


In [None]:
# Same side-by-side comparison as before, now with the bag-of-words ranking added.
# All Series are indexed 0..k-1 in rank order, matching the positional BM25 list.
pd.DataFrame(data = {"TF-IDF-S":tfidf_simple_result[:k].documents,"TF-IDF":tfidf_result[:k].documents, "BM25":bm25_results, "Bow": bow_results[:k].documents})

Unnamed: 0,TF-IDF-S,TF-IDF,BM25,Bow
0,"This is my second time coming to this hotel, a...","This is my second time coming to this hotel, a...",The staff team is amazing! We loved our room a...,Everything is great but could have a little la...
1,The staff team is amazing! We loved our room a...,"The location is unbeatable, so near to the bea...","The Urbanica Fifth it's amazing place, the loc...",Spacious rooms but located in North Beach away...
2,Really reccomend it. Very nice and clean hotel...,The staff team is amazing! We loved our room a...,Really reccomend it. Very nice and clean hotel...,This is the perfect hotel for families or grou...
3,"The Urbanica Fifth it's amazing place, the loc...","Great location, room was as expected. Big and ...",The staff was super friendly and accommodating...,Xavier at the beach was the best! It was a gre...
4,Everything is great but could have a little la...,Really reccomend it. Very nice and clean hotel...,"Great location, room was as expected. Big and ...",We really enjoyed our stay at Axel Beach the s...
5,"The location is unbeatable, so near to the bea...","Quaint, lovely hotel in the center of everythi...",Thanks to Paola and Marcel first of all.\nSimp...,Overall very nice time and enjoyed the hotel a...
6,My friends and I came for spring break and thi...,"The Urbanica Fifth it's amazing place, the loc...",Everything is great but could have a little la...,The Hotel is very near the Beach area with all...
7,The staff was super friendly and accommodating...,The staff was super friendly and accommodating...,"This is my second time coming to this hotel, a...",Great stay at the Gates hotel. Great amenities...
8,This hotel is in an excellent location near ma...,Everything is great but could have a little la...,Best location in Maimi ocean drive. The rooms ...,Justin had an amazing energy! We loved our roo...
9,VERY accommodating. We had an early flight in...,My friends and I came for spring break and thi...,It was an amazing check-in and information on ...,Such a treat to do a Daycation at the Carillon...
