In [1]:
import nltk
nltk.download('stopwords')

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from nltk.corpus import stopwords
from modules.dataImporter import yelp_import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

[nltk_data] Downloading package stopwords to /Users/yhkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the "small" Yelp sample; the returned object is indexed by table name below
datasets = yelp_import("small")

# Pull each table out into its own variable in a single unpacking step
(subset_business,
 subset_review,
 subset_checkin,
 subset_tip,
 subset_user) = (datasets[table_name]
                 for table_name in ("businesses", "reviews", "checkins", "tips", "users"))

In [3]:
# Peek at the first two rows of every table, in the order they were loaded
for table in (subset_business, subset_review, subset_checkin, subset_tip, subset_user):
    display(table.head(2))

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18


Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."


Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0


Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946


In [4]:
# content-based
def recommend_restaurants(input_restaurant_name, my_city, dest_city, businesses,
                          reviews, vectorizer = None, top_n = 10):
    """
        Recommend businesses in a destination city whose review text is most similar to the
        review text of a given business in the user's current city (content-based filtering).

        All reviews of the target business are joined into one document, and the reviews of
        every candidate business in the destination city are joined into one document each.
        The documents are vectorized together and every candidate is scored by the cosine
        similarity between its document and the target's document.

        Every business in the destination city that has at least one review with text is a
        candidate; no category filter is applied. The 'Restaurants' tag is only stripped from
        the returned category lists.

        Name and city matching is case-insensitive. The input DataFrames are not modified.

        Parameters:
            - input_restaurant_name (str): Name of the business in the user's current city to
                                           use as the reference.
            - my_city (str): The city where the input business is located.
            - dest_city (str): The city in which to look for similar businesses.
            - businesses (pandas.DataFrame): Needs the columns 'business_id', 'name', 'city'
                                             and 'categories'.
            - reviews (pandas.DataFrame): Needs the columns 'business_id', 'text' and 'stars'.
            - vectorizer (optional): Object with a scikit-learn style `fit_transform` method.
                                     It is (re)fitted on every call, so a previous fit is
                                     overwritten. Default is None, which creates a fresh
                                     TfidfVectorizer() for the call.
            - top_n (int, optional): Maximum number of rows to return. Default is 10.

        Returns:
            - pandas.DataFrame or str: A DataFrame with the columns 'name', 'business_id',
                                       'categories', 'similarity_score', 'avg_rating' and 'city',
                                       sorted by similarity score and then average rating
                                       (both descending), limited to top_n rows.
                                       'name' and 'city' are lower-cased; 'categories' is a list
                                       of category names without 'Restaurants'.
                Returns a string if:
                - The input business is not found in the source city.
                - No reviews (with text) are found for the target business.
                - No business with reviews is found in the destination city.
    """

    def _categories_without_restaurants(raw):
        # Missing categories (NaN/None) become an empty list instead of raising
        if not isinstance(raw, str):
            return []
        return [part.strip() for part in raw.split(",")
                if part.strip() and part.strip() != 'Restaurants']

    # Lower-cased working copy; `assign` returns a new frame, so the caller's data is untouched
    catalog = businesses.assign(name=businesses['name'].str.lower(),
                                city=businesses['city'].str.lower())

    # Locate the reference business in the source city
    target_business = catalog[(catalog['city'] == my_city.lower())
                              & (catalog['name'] == input_restaurant_name.lower())]
    if target_business.empty:
        return 'Business not found in source city'

    target_business_id = target_business.iloc[0]['business_id']

    # One document for the target: all of its review texts joined together
    target_texts = reviews.loc[reviews['business_id'] == target_business_id, 'text'].dropna()
    if target_texts.empty:
        return "No reviews found for the target business"
    target_document = " ".join(target_texts.astype(str))

    # Candidates: every other business located in the destination city
    candidates = catalog[(catalog['city'] == dest_city.lower())
                         & (catalog['business_id'] != target_business_id)]
    candidate_reviews = reviews[reviews['business_id'].isin(candidates['business_id'])]
    candidate_texts = candidate_reviews.dropna(subset=['text'])
    if candidate_texts.empty:
        return 'No similar business found in destination city'

    # One document per candidate business, indexed by business_id
    candidate_documents = (candidate_texts.groupby('business_id')['text']
                           .agg(lambda texts: " ".join(texts.astype(str))))

    # Create the default vectorizer per call rather than once at definition time
    if vectorizer is None:
        vectorizer = TfidfVectorizer()

    # Row 0 is the target document, the remaining rows are the candidates (same order as
    # candidate_documents), so the vocabulary and weights are shared by all documents
    matrix = vectorizer.fit_transform([target_document] + candidate_documents.tolist())
    scores = cosine_similarity(matrix[0:1], matrix[1:]).ravel()
    similarity_by_id = pd.Series(scores, index=candidate_documents.index)

    # Average star rating per candidate business
    rating_by_id = candidate_reviews.groupby('business_id')['stars'].mean()

    # Only candidates that actually received a score appear in the result
    scored = candidates[candidates['business_id'].isin(similarity_by_id.index)]
    result = pd.DataFrame({'name': scored['name'],
                           'business_id': scored['business_id'],
                           'categories': scored['categories'].map(_categories_without_restaurants),
                           'similarity_score': scored['business_id'].map(similarity_by_id),
                           'avg_rating': scored['business_id'].map(rating_by_id),
                           'city': scored['city']})

    # Most similar first; ties are broken by the higher average rating
    result = result.sort_values(by=['similarity_score', 'avg_rating'], ascending=[False, False])
    result = result.reset_index(drop=True)

    return result.head(top_n)


# Example usage: look for places similar to "tuna bar" within Philadelphia itself
display(
    recommend_restaurants(
        input_restaurant_name="tuna bar",
        my_city='philadelphia',
        dest_city='philadelphia',
        businesses=subset_business,
        reviews=subset_review,
    )
)

'No reviews found for the target business'

In [6]:
# item-based

def recommend_restaurants_item_based(input_restaurant_name, my_city, dest_city,
                                     businesses, reviews, top_n = 10):
    """
        Recommend businesses in a destination city whose categories overlap most with those of
        a given business in the user's city.

        Every business in the destination city (except the input business itself) is scored
        with the Jaccard similarity between its category set and the input business's category
        set. Results are ordered by similarity score and then by average review rating, both
        descending.

        Name and city matching is case-insensitive. The input DataFrames are not modified.
        Businesses with missing categories are treated as having no categories (score 0.0).

        Parameters:
            - input_restaurant_name (str): The name of the input business.
            - my_city (str): The city of the input business.
            - dest_city (str): The destination city in which to recommend similar businesses.
            - businesses (pandas.DataFrame): Needs the columns 'business_id', 'name', 'city'
                                             and 'categories' (comma-separated string).
            - reviews (pandas.DataFrame): Needs the columns 'business_id' and 'stars'.
            - top_n (int): Maximum number of rows to return (default is 10).

        Returns:
            - pandas.DataFrame or str: A DataFrame with the columns 'name', 'business_id', 'city',
                                       'categories', 'similarity_score' and 'avg_rating', limited
                                       to top_n rows. 'name' and 'city' are lower-cased;
                                       'categories' is a list of category names without
                                       'Restaurants'; 'avg_rating' is NaN for businesses that
                                       have no reviews.
                Returns a string if:
                - The input business is not found in the user's city.
                - The destination city contains no other business.
    """

    def _category_list(raw):
        # Missing categories (NaN/None) become an empty list instead of raising on .split
        if not isinstance(raw, str):
            return []
        return [part.strip() for part in raw.split(",") if part.strip()]

    # Lower-cased working copy; `assign` returns a new frame, so the caller's data is untouched
    catalog = businesses.assign(name=businesses['name'].str.lower(),
                                city=businesses['city'].str.lower())

    # Find the input business in the user's city
    target_business = catalog[(catalog['city'] == my_city.lower())
                              & (catalog['name'] == input_restaurant_name.lower())]
    if target_business.empty:
        return 'Business not found in your city'

    target_row = target_business.iloc[0]
    target_business_id = target_row['business_id']
    # Computed once: the target's categories do not change between candidates
    target_categories = set(_category_list(target_row['categories']))

    # Candidates: every business in the destination city except the target itself
    candidates = catalog[(catalog['city'] == dest_city.lower())
                         & (catalog['business_id'] != target_business_id)]
    if candidates.empty:
        return 'No similar businesses found in the destination city'

    def _jaccard_with_target(raw):
        current_categories = set(_category_list(raw))
        union = target_categories | current_categories
        # Two empty category sets have nothing in common; avoid dividing by zero
        if not union:
            return 0.0
        return len(target_categories & current_categories) / len(union)

    # Average rating per business in one pass instead of filtering `reviews` per candidate
    rating_by_id = reviews.groupby('business_id')['stars'].mean()

    df = pd.DataFrame({
        'name': candidates['name'],
        'business_id': candidates['business_id'],
        'city': candidates['city'],
        'categories': candidates['categories'].map(
            lambda raw: [category for category in _category_list(raw) if category != 'Restaurants']),
        'similarity_score': candidates['categories'].map(_jaccard_with_target),
        # Businesses without any review get NaN here
        'avg_rating': candidates['business_id'].map(rating_by_id),
    })

    # Most similar first; ties are broken by the higher average rating (NaN ratings sort last)
    df = df.sort_values(by=['similarity_score', 'avg_rating'], ascending=[False, False])
    df = df.reset_index(drop=True)

    return df.head(top_n)
    

# Example usage: places in Tucson whose categories resemble "bar one" in Philadelphia
display(
    recommend_restaurants_item_based(
        input_restaurant_name="bar one",
        my_city='philadelphia',
        dest_city='tucson',
        businesses=subset_business,
        reviews=subset_review,
    )
)

Unnamed: 0,name,business_id,city,categories,similarity_score,avg_rating
0,rosati's pizza,Y6heWJJ9AmEL58fZwgi9YQ,tucson,"[Sports Bars, Bars, Nightlife, Italian, Pizza]",0.571429,
1,home plate sports pub,9C2rpb56aQvW0ViZHK9sPw,tucson,"[Bars, Sports Bars, Nightlife]",0.5,
2,good oak bar,bCIZeggW02uPdz2lobSjUA,tucson,"[Pubs, Bars, Wine Bars, Nightlife, Cocktail Bars]",0.428571,
3,gavi italian restaurant,x9K0RfZaT_zlw6DklBDzjw,tucson,[Italian],0.4,
4,sher-e-punjab,f82dhKNiUXsDVPMLqKYiIQ,tucson,"[Salad, Pakistani, Indian, Cocktail Bars, Food...",0.333333,4.0
5,rockabilly grill,3swM60bulnut1d4AZti80Q,tucson,"[Nightlife, American (New), Music Venues, Brea...",0.230769,
6,blackrock brewers,EEc3X2r94uNhCz_jOTSzHg,tucson,"[Food, Breweries, Pubs, Beer Bar, Nightlife, B...",0.222222,
7,china dragon restaurant,wghnIlMb_i5U46HMBGx9ig,tucson,[Chinese],0.166667,
8,papa murphy's,anLQj9AM8vjbcLSIE0iUgg,tucson,[Pizza],0.166667,
9,pita pit,KUx_q3nzqUINYgUTrZc9OA,tucson,[Sandwiches],0.166667,
