## **Task 2: Recommender System using Embeddings.**
### **Step 1: Create Embeddings.**

In [1]:
#Mount my Google Drive
from google.colab import drive
drive.mount("/content/drive")
import os
directory = '/content/drive/My Drive/CSC575 Final Project'
os.chdir(directory)

#Ensure the files are there (in the folder)
!pwd

Mounted at /content/drive
/content/drive/My Drive/CSC575 Final Project


### **Loading Datasets**.

In [69]:
#Importing Necessary Libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Text preprocessing.
import re
import ast
import nltk
from nltk.corpus import stopwords
import spacy
from spacy.tokens import Span
#Cosine calculation.
from numpy.linalg import norm
nlp = spacy.load("en_core_web_sm")
#Warnings.
import warnings
warnings.filterwarnings("ignore")
#Geo location.
import folium
from geopy.geocoders import Nominatim

**Loading Datasets**

In [3]:
#Training data.
df_train = pd.read_csv("reviews_train.csv", sep='\t', encoding = 'ISO-8859-1')

In [4]:
df_train.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,A-8BBGnBoz5-K1WaM5Cgaw,DUxCdkXnwYEzHZvT8MyvIw,ju4YP8SLdR_BmWr_-Xh83Q,5,2,0,1,Best pho in Santa Barbara County. Staff are g...,2018-07-27 00:09:23
1,k--beJRNBZzFklRoppa2MA,SAgf1IxxuomOWSIDzy07pQ,uE40984_YDgVvPeRpFcCaQ,5,0,0,0,We came for the hot chicken but were won over ...,2014-02-26 13:05:47
2,qmr304jvtYetK5i_Djrx_A,cMkPQZVDOibs2bz8St7Acg,JvawJ9bSr22xn4R9oLvl_w,3,0,0,0,I really should have used my better judgement ...,2014-02-26 22:33:39


In [5]:
#Test data.
df_test = pd.read_csv("reviews_test_all.csv", sep='\t', encoding = 'ISO-8859-1')

In [6]:
df_test.head(3)

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,37039,8QSAs3yVhcNS0y1fWSn41Q,BY_7xEliSP5iEig9bemaKw,e86IBzGCsrnhJbD_wELj7w,3,1,0,0,The main course was actually very tasty and th...,2011-01-16 20:23:33
1,19778,-lNpxdJNrvMtZ_RTvT2NtA,djOl6zKvKdbt4lNnDKUXJg,z22hSRptt_DS0nWjsIka2A,5,0,0,0,Outback is my favorite steakhouse. I've dined ...,2015-05-13 17:03:27
2,80124,7Ylcy1txacpnY76275KqQw,1iokf9rM43YAwxsa8bp1OQ,jRLskcm_icZIKs81mYC4iQ,4,5,3,3,"I absolutely love Ethiopian, I'd eat it every ...",2015-04-22 17:26:46


In [8]:
#Businesses data.
df_businesses = pd.read_csv("businesses.csv", sep='\t', encoding = 'ISO-8859-1')

In [9]:
df_businesses.head(3)

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


### **Text Preprocessing.**
**Tokenize.**

In [10]:
def process_text(text):
  '''Tokenize_text function using Spacy's default stop words.'''

  #Lowering all text.
  text = re.sub(r'[^\w\s]', '', text.lower())

  #Filter out tokens.
  filtered_tokens = []

  for token_text in nlp(text):
    #Get tokens that are alpha, and not stop words.
    if token_text.text.isalpha() and token_text.text not in nlp.Defaults.stop_words:
      #Token length has to be more than 2 and less than 19.
      if 2 < len(token_text) < 19:
        filtered_tokens.append(str(token_text))
  return filtered_tokens

In [11]:
#--Takes a long time to process text: 40 mins!--
#Apply the tokenization function to all 'text' in the review training dataset.
processed_text = df_train.text.apply(process_text)

In [12]:
processed_text.head(5)

0    [best, pho, santa, barbara, county, staff, gre...
1    [came, hot, chicken, won, entire, menu, ill, s...
2    [better, judgement, ruled, eating, joint, bour...
3    [place, relatively, smaller, pho, places, sant...
4    [best, southeast, restaurant, philadelphia, fo...
Name: text, dtype: object

In [13]:
#Convert into a list of tokenized strings.
clean_txt = [str(i) for i in processed_text]

#Copied data frame with cleaned text.
cln_train = df_train
cln_train["cleaned_text"] = clean_txt
cln_train.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,cleaned_text
0,A-8BBGnBoz5-K1WaM5Cgaw,DUxCdkXnwYEzHZvT8MyvIw,ju4YP8SLdR_BmWr_-Xh83Q,5,2,0,1,Best pho in Santa Barbara County. Staff are g...,2018-07-27 00:09:23,"['best', 'pho', 'santa', 'barbara', 'county', ..."
1,k--beJRNBZzFklRoppa2MA,SAgf1IxxuomOWSIDzy07pQ,uE40984_YDgVvPeRpFcCaQ,5,0,0,0,We came for the hot chicken but were won over ...,2014-02-26 13:05:47,"['came', 'hot', 'chicken', 'won', 'entire', 'm..."
2,qmr304jvtYetK5i_Djrx_A,cMkPQZVDOibs2bz8St7Acg,JvawJ9bSr22xn4R9oLvl_w,3,0,0,0,I really should have used my better judgement ...,2014-02-26 22:33:39,"['better', 'judgement', 'ruled', 'eating', 'jo..."
3,KXNdht_of5t-Dh1eoaeYHQ,9m13F_RCcz_r48tQH82I5A,bdfZdB2MTXlT6-RBjSIpQg,3,0,0,0,This place is relatively smaller than other ph...,2013-12-09 08:12:28,"['place', 'relatively', 'smaller', 'pho', 'pla..."
4,asJ8k1sm8jO01bi-s5JW8g,goySBsZ3QJfSaElPIDIzLw,14ZGwnDyydXdSBsLXpSUrA,5,0,0,0,Best Southeast restaurant in Philadelphia. The...,2011-06-03 03:54:54,"['best', 'southeast', 'restaurant', 'philadelp..."


**Applied for user_id data.**

In [14]:
#Create USER dataframe.
user_train = cln_train.groupby("user_id")["cleaned_text"].agg(list).reset_index()
user_train["cleaned_text"] = user_train["cleaned_text"].apply(lambda x: " ".join(map(str, x)))
user_train.head(5)

Unnamed: 0,user_id,cleaned_text
0,---2PmXbF47D870stH1jqA,"['matter', 'night', 'goservice', 'food', 'exce..."
1,--4AjktZiHowEIBCMd4CZA,"['love', 'village', 'whiskey', 'shame', 'crowd..."
2,--E0uVPphTORm_OiZ5KCvA,"['worth', 'wait', 'worked', 'restaurants', 'to..."
3,--KMTwCrhKKUmr7riuS4WQ,"['loved', 'place', 'food', 'atmosphere', 'sant..."
4,--S8M395r8NtOCvS2LRfDw,"['awesome', 'pizza', 'large', 'piewingsand', '..."


**Applied for business_id data.**

In [15]:
#Create BUSINESS dataframe.
business_train = cln_train.groupby("business_id")["cleaned_text"].agg(list).reset_index()
business_train["cleaned_text"] = business_train["cleaned_text"].apply(lambda x: "".join(map(str, x)))
business_train.head(5)

Unnamed: 0,business_id,cleaned_text
0,--ZVrH2X2QXBFdCilbirsw,"['best', 'hoagies', 'prosciutto', 'classic', '..."
1,--_9CAxgfXZmoFdNIRrhHA,"['huge', 'place', 'boxes', 'big', 'store', 're..."
2,-02xFuruu85XmDn2xiynJw,"['new', 'tucson', 'new', 'physician', 'recomme..."
3,-0Ym1Wg3bXd_TDz8JtvOQg,"['neat', 'trendy', 'little', 'ice', 'cream', '..."
4,-0fvhILrC9UsQ6gLNpZlTQ,"['try', 'omg', 'stopped', 'night', 'sampling',..."


### **Create Embeddings.**

In [16]:
def load_glove(vector_len=50):
  '''Function using Glove.'''

  #Vector_len must be 50/100/200/300.
  GloVe = f"glove.6B.{vector_len}d.txt"

  #Dictionary stores embeddings of all words.
  embeddings_dict = {}

  with open(GloVe, 'r', encoding="utf-8") as f:
    for line in f:
      values = line.split()
      word = str(values[0])

      vector = np.asarray(values[1:], "float32")
      embeddings_dict[word] = vector
  return embeddings_dict

#Using vector length = 50 per recommendation.
glove_embeddings = load_glove(50)

In [17]:
def compute_mean_embedding(words, embeddings_dict):
  '''Function to compute mean embedding of a list of words.'''

  #Filter embedding_dict by words and store the embeddings in a list.
  embeddings = [embeddings_dict[word] for word in words if word in embeddings_dict]

  #Check if any embeddings were found.
  if embeddings:
    #Calculate the mean of the embeddings and return the result.
    return np.mean(embeddings, axis=0)
  else:
    #Return a zero vector if no embeddings were found.
    return np.zeros(len(next(iter(embeddings_dict.values()))))

**Generate mean embeddings.**

In [18]:
#Update the user dataframe with mean embeddings.
user_train['mean_embedding'] = user_train['cleaned_text'].apply(lambda x: compute_mean_embedding(x, glove_embeddings))

#Update the business dataframe with mean embeddings.
business_train['mean_embedding'] = business_train['cleaned_text'].apply(lambda x: compute_mean_embedding(x, glove_embeddings))

In [19]:
#Drop the cleaned text column.
user_train.drop("cleaned_text", axis=1, inplace=True)
business_train.drop("cleaned_text", axis=1, inplace=True)

In [20]:
#Check if the length of value is greater than or smaller than 50.
busineslen = business_train["mean_embedding"].apply(lambda x: len(x) == 50).any()

#Check if there are any issues while calculating mean.
print(f"Are the length of embedding equal to 50: {busineslen}")

#Check if the length equals to 50.
userlen = user_train["mean_embedding"].apply(lambda x: len(x) == 50).any()

#Check if there are any issues while calculating mean.
print(f"Are the length of embedding equal to 50: {userlen}")

Are the length of embedding equal to 50: True
Are the length of embedding equal to 50: True


In [21]:
#User data.
user_train.head(5)

Unnamed: 0,user_id,mean_embedding
0,---2PmXbF47D870stH1jqA,"[-0.040199738, 0.86262065, 0.5210789, 0.398323..."
1,--4AjktZiHowEIBCMd4CZA,"[-0.04254708, 0.8526091, 0.5595566, 0.43888918..."
2,--E0uVPphTORm_OiZ5KCvA,"[-0.048984144, 0.83067375, 0.5336113, 0.415816..."
3,--KMTwCrhKKUmr7riuS4WQ,"[-0.051070765, 0.83776945, 0.5122076, 0.394447..."
4,--S8M395r8NtOCvS2LRfDw,"[-0.013747137, 0.8503984, 0.5158654, 0.3981377..."


In [22]:
#Busines data.
business_train.head(5)

Unnamed: 0,business_id,mean_embedding
0,--ZVrH2X2QXBFdCilbirsw,"[-0.054003138, 0.8348256, 0.48453295, 0.372190..."
1,--_9CAxgfXZmoFdNIRrhHA,"[-0.061999418, 0.85494626, 0.5135551, 0.397855..."
2,-02xFuruu85XmDn2xiynJw,"[-0.06751067, 0.84385616, 0.5371855, 0.4114175..."
3,-0Ym1Wg3bXd_TDz8JtvOQg,"[-0.0600089, 0.854752, 0.5121427, 0.39534807, ..."
4,-0fvhILrC9UsQ6gLNpZlTQ,"[-0.06821111, 0.8545435, 0.50191456, 0.3986616..."


### **Step 2: Recommendations (Rating Predictions).**

**Note:** Since there are either business/user IDs value exists in test data, but not in the train data. We agreed to pass on that pairs.

It does not make sense to do data imputation here, since it represents a user-business's unique relationship characteristics.

**There are 5,520 pairs of business/user IDs that satisfies.**

In [23]:
def compute_predicted_score(user_id, business_id=None, user_train = user_train, business_train = business_train):
  '''Function to compute the predicted score.'''

  #Get user ID.
  user = user_train[user_train["user_id"] == user_id]
  #Get business_id.
  business = business_train[business_train["business_id"] == business_id]

  #If both id's exists in training data then.
  if len(user) != 0 and len(business) !=0:
    #Get the embedding for user.
    user_embedding = user["mean_embedding"].values[0]
    #Get the embedding for business.
    business_embedding = business["mean_embedding"].values[0]
    #Get the dot product of the embeddings.
    dot_product = np.dot(user_embedding, business_embedding)
    return dot_product

  #Either one of the ids is not in the training data, so we pass None.
  else:
    return None

In [24]:
def compute_squared_error(prediction, ground_truth):
  '''Computing squared error.'''
  return (prediction - ground_truth) ** 2

In [25]:
def compute_rmse(predictions):
  '''Computing RMSE.'''

  #Get root mean squared error.
  mse = np.mean(predictions)
  return np.sqrt(mse)

**Make prediction on test data.**

In [26]:
#--Takes 3 mins to run the cell!--
#Lists to store prediction (dot product), and the ground true (stars) values.
prediction = []
ground_truths = []

#Iterate through test data frame.
for idx, row in df_test.iterrows():

  #Get user/business IDs
  user_id = row["user_id"]
  business_id = row["business_id"]

  #Compute the score for each case.
  predicted = compute_predicted_score(user_id, business_id)

  #If predicted is calculable / not None.
  if predicted is not None:
    #Get the true value for that row.
    y_true = row["stars"]

    #Append the predicted value.
    prediction.append(predicted)
    #Append the true value.
    ground_truths.append(y_true)

In [27]:
print('Prediction (dot product value) for first 5 pairs:')
prediction[:5]

Prediction (dot product value) for first 5 pairs:


[15.320987, 15.26693, 15.388164, 15.225558, 15.357195]

In [28]:
print('Ground truth (stars value) for first 5 pairs:')
ground_truths[:5]

Ground truth (stars value) for first 5 pairs:


[3, 4, 4, 3, 5]

**Note:** At this point we unsure to use either:

1. The original dot product value to calculate RMSE.
2. Create a 'prediction' stars system based on dot product values to calculate RMSE.

Ultimately, we decided to do both approaches.

**Create stars system prediction based on dot product values.**

In [29]:
#Defining range of dot product values and star ratings.
min_dot_prod = min(prediction)
max_dot_prod = max(prediction)
star_rating = np.arange(1,6)

#Define the num of bins.
num_bin = len(star_rating) - 1

#Generating 5 evenly-spaced bin boundaries between min and max dot product values.
bin_boundaries = np.linspace(min_dot_prod, max_dot_prod, num_bin+1)

In [30]:
def assign_start_rating(dot_product):
  '''Function to assign dot product values to start rating.'''

  #Value is considered to be in the bin if <= the right boundary.
  return np.digitize(dot_product, bin_boundaries, right=True) - 1

In [31]:
#Binning the predictions.
binned_predictions = [assign_start_rating(pred) for pred in prediction]

In [32]:
print('Prediction (stars value) for first 5 pairs:')
binned_predictions[:5]

Prediction (stars value) for first 5 pairs:


[3, 3, 3, 2, 3]

**Calculate Squared-error.**

In [33]:
#Binned data.
squared_error_binned = []

#Iterating through the binned predictions and get the squared error.
for i in range(len(binned_predictions)):
  squared_error_binned.append(compute_squared_error(binned_predictions[i], ground_truths[i]))


#Unbinned data.
squared_error_unbinned = []

#Iterating through the binned predictions and get the squared error.
for i in range(len(prediction)):
  squared_error_unbinned.append(compute_squared_error(prediction[i], ground_truths[i]))

In [None]:
print('Squared error for first 5 pairs - binned:')
squared_error_binned[:5]

Squared error for first 5 pairs - binned:


[0, 1, 1, 1, 4]

In [34]:
print('Squared error for first 5 pairs - unbinned:')
squared_error_unbinned[:5]

Squared error for first 5 pairs - unbinned:


[151.80671443802657,
 126.94370320771122,
 129.6902694193932,
 149.46427528077857,
 107.27148620720618]

In [35]:
#Compute and print RMSE for binned predictions.
print(f"RMSE with binning predictions: {compute_rmse(squared_error_binned):.2f}")

#Compute the rmse score without binned predictions.
print(f"RMSE without binning predictions: {compute_rmse(squared_error_unbinned):.2f}")

RMSE with binning predictions: 1.89
RMSE without binning predictions: 11.45


## **Task 3: Item-based Collaborative Recommendation using Embeddings.**

**Simple process the businesses dataset.**

In [164]:
def splitCategories(category_str):
  '''Function to convert cell in 'categories' column into a list of strings.'''

  #Split the string by commas.
  parts = category_str.split(', ')

  #List to store individual words.
  wordList = []

  for part in parts:
    #wordList.extend(part.split(' '))
    words = part.strip().split(' ')
    wordList.extend(word.strip("()") for word in words)

  #Speical cases like 'Naturopathic/Holistic'.
  wordList = [word.split('/') if '/' in word else word for word in wordList]
  #Special cases. For ex: 'Coffee&Tea'.
  wordList = [word for word in wordList if word not in ('&', '/')]
  return wordList

In [165]:
businessData = df_businesses.copy()
businessData['categories'] = businessData['categories'].astype(str)
businessData['categories'] = businessData['categories'].apply(splitCategories)

In [166]:
businessData.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"[Doctors, Traditional, Chinese, Medicine, [Nat...",
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"[Shipping, Centers, Local, Services, Notaries,...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","[Department, Stores, Shopping, Fashion, Home, ...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble, Tea, Coffee, Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","[Brewpubs, Breweries, Food]","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


**Generate filtered data based on user's query.**

In [167]:
def filteredData(text, data = businessData):
  '''Data filtering based on user's query. Derived from Task 1, without using 'Best' logic.'''

  #Tokenizing query text.
  doc = nlp(text)
  tokenList = process_text(text)
  tokenList = [token.capitalize() for token in tokenList]
  pattern = r"'(.*?)'"
  placeName = re.findall(pattern, text)

  #List to store city names: If user ask for multiple cities.
  cityList = []

  #Extract city.
  for ent in doc.ents:
    #Identify the text as locations.
    if ent.label_ =="GPE":
      cityList.append(ent.text)

  #Extract establishment name from query text.
  if placeName:
    placeName = placeName[0]

  #Extract categories list of the establishment name.
  if placeName:
    categories = businessData[businessData["name"] == placeName]['categories']
    categories = [item for sublist in categories for item in sublist]
  #If user not enter, empty.
  else:
    categories = []

  #---Compile the filtered dataset.---

  #If user did put a benchmark establishment.
  if categories:
    #Must match at least 3 categories of benchmarking establishment.
    filteredData = businessData[businessData['categories'].apply(lambda x: sum(category in x for category in categories) >= 3)]

    #Tokens in user query text, extra layer.
    mask = pd.Series(False, index=filteredData.index)
    for token in tokenList:
      mask = mask | filteredData['categories'].apply(lambda categories: any(token in category for category in categories))

      #Filter the DataFrame based on mask.
      filteredData_final = filteredData[mask]

  #If user did NOT put a benchmark establishment.
  else:
    #Use tokens in user query text for filtering.
    mask = pd.Series(False, index=businessData.index)
    for token in tokenList:
      mask = mask | businessData['categories'].apply(lambda categories: any(token in category for category in categories))
      filteredData_final = businessData[mask]

  #If user also put recommended cities.
  if len(cityList) == 0:
    return filteredData_final[["business_id", "name", "stars", "categories", "city"]]
  else:
    #Filter the DataFrame based on city names.
    filteredData_final = filteredData_final[filteredData_final["city"].isin(cityList)]
    return filteredData_final[["business_id", "name", "stars", "categories", "city"]]

**Test code for filterdData function.**

In [168]:
res = filteredData("Give me hair salons recommendation that like 'The Waxing Queen Of Tucson' in Tucson", businessData)
display(res.head(5))

Unnamed: 0,business_id,name,stars,categories,city
173,E-nhxuu3zbt02oCj_1AFng,Selah,5.0,"[Waxing, Hair, Removal, Skin, Care, Eyelash, S...",Tucson
233,h_3oLlQ_CFohwlzsrSZ6xQ,The Waxing Queen Of Tucson,4.5,"[Beauty, Spas, Hair, Removal, Day, Spas, Skin,...",Tucson
295,YG0OOMLP3MWYPKHpkA_mew,S&K Salon,4.0,"[Men's, Hair, Salons, Waxing, Hair, Salons, Sk...",Tucson
478,fCRGXD-TTLgNhaC4mVFFJQ,El Con Health & Wellness Center,4.5,"[Beauty, Spas, Nutritionists, Medical, Spas, L...",Tucson
763,1gVkgLjul7xSIsJRCZHhmg,Straight Edge Barber Shop,4.0,"[Professional, Services, Men's, Hair, Salons, ...",Tucson


In [169]:
def computeRecommendation(query, df=business_train):
  '''Function to get top recommendations.'''

  #Get the filtered data.
  city_df = filteredData(query)

  #Text process the query.
  clean_query = process_text(query)

  #Get the mean embedding for the query.
  mean_query = compute_mean_embedding(clean_query, glove_embeddings)

  #Get only data that are common in the business data and the filtered data.
  merged_df = pd.merge(df, city_df, on='business_id', how='inner')

  #Computing cosine manually.
  cosine_vals = []

  #Itereate through the merged data set.
  for i in range(len(merged_df)):
    #Get the embeddings.
    mean_business = merged_df["mean_embedding"].values[i]

    #Compute cosine similarity.
    cosine = np.dot(mean_query, mean_business) / (norm(mean_query) * norm(mean_business))

    #Store the similarity score to the list.
    cosine_vals.append(cosine)

  #Merging the new values to the merged df.
  merged_df["cosine_similarity"] = cosine_vals

  #Recommend restaurants based on the query and highest similarity score.
  sorted_df = merged_df.sort_values(by="cosine_similarity", ascending=False).reset_index()
  final_df = sorted_df[["name", "stars", "categories", "city"]]

  #If no cities then we return a print statement.
  if len(final_df) == 0:
    return f"There is no recommendations within those cities :/"
  else:
    #If we have a dataset then we return the recommendations.
    return final_df.head(5)

In [179]:
def geoRec(queryRes):
  '''Show recommendation as map.'''

  city = queryRes['city'].iloc[0]
  locData = queryRes.merge(businessData[['name', 'city', 'latitude', 'longitude']], on=['name', 'city'], how='left')

  #Tackle cases where same business name in differnt cities.
  filtered_locData = locData[locData['city'] == city]
  geolocator = Nominatim(user_agent="my_geocoder")

  #Get the latitude and longitude of the city.
  cityLocation = geolocator.geocode(city, timeout=None)
  cityLatitude, cityLongitude = cityLocation.latitude, cityLocation.longitude

  #Folium map using city as the center.
  mapCenter = [cityLatitude, cityLongitude]
  mapCity = folium.Map(location=mapCenter, zoom_start=12, width=700, height=500)

  #Add a marker for the city.
  folium.Marker(location=mapCenter, popup=f"City: {city}").add_to(mapCity)

  #Map each establishment in queryRes.
  for index, row in filtered_locData.iterrows():
    #Use respective latitude and longitude from businesses data.
    establishmentLocation = [row['latitude'], row['longitude']]
    folium.Marker(location=establishmentLocation, popup=row['name']).add_to(mapCity)

  display(mapCity)

### **Query 1: "List sushi Japanese restaurants that are similar to 'Tuna Bar' in Santa Barbara."**

In [180]:
query1 = f"List sushi Japanese restaurants that are similar to 'Tuna Bar' in Santa Barbara"
query1res = computeRecommendation(query1)
print(f"Here are the recommendataions for the 1st query:")
display(query1res)
print('\n')
geoRec(query1res)

Here are the recommendataions for the 1st query:


Unnamed: 0,name,stars,categories,city
0,Sakana Sushi Bar & Japanese,4.0,"[Sushi, Bars, Japanese, Restaurants]",Santa Barbara
1,Tsukemono,4.0,"[Sushi, Bars, Japanese, Restaurants]",Santa Barbara
2,Hibachi Steak House & Sushi Bar,3.5,"[Steakhouses, Sushi, Bars, Restaurants, Japanese]",Santa Barbara
3,Ichiban,3.5,"[Restaurants, Japanese, Sushi, Bars]",Santa Barbara
4,Yoichi's,4.5,"[Restaurants, Nightlife, Japanese, Bars]",Santa Barbara






### **Query 2: "Give me IV Hydration spas recommendation in Nashville that similar to '61Five Health & Wellness."**

In [181]:
query2 = f"Give me IV Hydration spas recommendation in Nashville that similar to '61Five Health & Wellness'"
query2res = computeRecommendation(query2)
print(f"Here is the recommendations for the 2nd query:")
display(query2res)
print('\n')
geoRec(query2res)

Here is the recommendations for the 2nd query:


Unnamed: 0,name,stars,categories,city
0,Supercuts,3.0,"[Men's, Hair, Salons, Cosmetics, Beauty, Suppl...",Nashville
1,Green Hills Barber Shop,4.0,"[Hair, Salons, Dry, Cleaning, Laundry, Hair, S...",Nashville
2,Irene's Spa & Wellness,5.0,"[Beauty, Spas, Skin, Care, Waxing, Day, Spas, ...",Nashville
3,Tina's Salon For Nails and Hair,4.0,"[Nail, Salons, Beauty, Spas, Hair, Salons]",Nashville
4,Studio 1 Nails,3.0,"[Nail, Salons, Beauty, Spas]",Nashville






### **Query 3: "Show me apartments complex like 'Ridge Carlton Apartments' in Indianapolis."**

In [182]:
#Only 3 locations satisify/
query3 = f"Show me apartments complex like 'Ridge Carlton Apartments' in Indianapolis"
query3res = computeRecommendation(query3)
print(f"Here is the recommendations for the 3rd query:")
display(query3res)
print('\n')
geoRec(query3res)

Here is the recommendations for the 3rd query:


Unnamed: 0,name,stars,categories,city
0,Scandia Apartments,2.0,"[Home, Services, Real, Estate, Apartments]",Indianapolis
1,Oak Lake At Crooked Creek Apartments by J.C. Hart,1.5,"[Apartments, Home, Services, Real, Estate]",Indianapolis
2,Carlyle Court,2.5,"[Real, Estate, Services, Home, Services, Real,...",Indianapolis






### **Query 4: "Give me recommendation like 'Vintage Bar and Grill' in Tampa."**

In [183]:
query4 = f"Give me recommendation like 'Vintage Bar and Grill' in Tampa"
query4res = computeRecommendation(query4)
print(f"Here is the recommendations for the 4th query:")
display(query4res)
print('\n')
geoRec(query4res)

Here is the recommendations for the 4th query:


Unnamed: 0,name,stars,categories,city
0,Scores Gentlemens Club & Steakhouse,2.5,"[Lounges, Adult, Entertainment, Bars, Champagn...",Tampa
1,The Gasparilla Bar,3.5,"[Bars, Nightlife]",Tampa
2,Twisted Olive,3.5,"[Piano, Bars, Bars, Nightlife]",Tampa
3,Four Green Fields,4.5,"[Irish, Pub, Bars, Restaurants, Irish, Nightlife]",Tampa
4,Chili's,2.0,"[Chicken, Wings, Bars, Restaurants, Nightlife,...",Tampa






### **Query 5: "Give me hair salons recommendation that like 'The Waxing Queen Of Tucson' in New Orleans."**

In [185]:
query5 = f"Give me hair salons recommendation that like 'The Waxing Queen Of Tucson' in New Orleans"
query5res = computeRecommendation(query5)
print(f"Here is the recommendations for the 5th query:")
display(query5res)
print('\n')
geoRec(query5res)

Here is the recommendations for the 5th query:


Unnamed: 0,name,stars,categories,city
0,Lee Nails,4.0,"[Nail, Salons, Massage, Beauty, Spas]",New Orleans
1,WAX,4.5,"[Hair, Removal, Skin, Care, Waxing, Beauty, Spas]",New Orleans
2,John Jay Salons,3.5,"[Beauty, Spas, Hair, Salons]",New Orleans
3,"The Ritz-Carlton Spa, New Orleans",4.0,"[Beauty, Spas, Restaurants, Hotels, Event, Pla...",New Orleans
4,Pamper & Polish Nails,2.5,"[Eyebrow, Services, Beauty, Spas, Nail, Salons...",New Orleans






In [189]:
!jupyter nbconvert --to html CSC575-Final-Project-Task2_3.ipynb

[NbConvertApp] Converting notebook CSC575-Final-Project-Task2_3.ipynb to html
[NbConvertApp] Writing 871512 bytes to CSC575-Final-Project-Task2_3.html
