In [2]:
import pandas as pd
import numpy as np

import json
import os
import re

import matplotlib.pyplot as plt
import seaborn

import gensim
from gensim.utils import simple_preprocess

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Add some new features

In [6]:
# restaurants = pd.read_json('/Users/liqingran/Desktop/Yelp System/nyc_restaurants.json')
# reviews = pd.read_csv('cleaned_reviews.csv')

In [3]:
df = pd.read_csv('restaurants_features.csv')

In [83]:
df.head()

Unnamed: 0,name,price,rating,city,category,transactions,review_count,cleaned_review,length
0,Thursday Kitchen,$$,4.5,New York,"['Korean', 'American (New)', 'Tapas/Small Plat...","['delivery', 'pickup']",1379,start saying outdoor seating true accomplishme...,5679
1,Amélie,$$,4.5,New York,"['French', 'Wine Bars']","['delivery', 'pickup']",2719,love cute little restaurant server nice food e...,4394
2,The Cabin NYC,$$,4.0,New York,"['American (New)', 'Cocktail Bars', 'Breakfast...","['restaurant_reservation', 'delivery', 'pickup']",269,honest place star wrong food amazing cute spot...,5236
3,The Osprey,$$,4.0,Brooklyn,['American (New)'],['delivery'],230,restaurant stunning incredible outdoor seating...,5143
4,Kong Sihk Tong 港食堂,$,4.0,New York,"['Chinese', 'Hong Kong Style Cafe']","['delivery', 'pickup']",300,kong sihk tong marvelous cantonese spot choice...,5427


In [4]:
def to_str(category):
    category = category.lower().replace('(new)', '').replace('/', ' ')
    category = re.sub(r'[^A-Za-z]+', ' ', category)
    category =  category.strip()
    return category

In [5]:
df['category'] = df['category'].apply(to_str)
df['features'] = df['category'] + df['cleaned_review']
df['features'] = df['features'].apply(lambda x: ' '.join(x.split()[:5000]))
df.head()

Unnamed: 0,name,price,rating,city,category,transactions,review_count,cleaned_review,length,features
0,Thursday Kitchen,$$,4.5,New York,korean american tapas small plates,"['delivery', 'pickup']",1379,start saying outdoor seating true accomplishme...,5679,korean american tapas small platesstart saying...
1,Amélie,$$,4.5,New York,french wine bars,"['delivery', 'pickup']",2719,love cute little restaurant server nice food e...,4394,french wine barslove cute little restaurant se...
2,The Cabin NYC,$$,4.0,New York,american cocktail bars breakfast brunch,"['restaurant_reservation', 'delivery', 'pickup']",269,honest place star wrong food amazing cute spot...,5236,american cocktail bars breakfast brunchhonest ...
3,The Osprey,$$,4.0,Brooklyn,american,['delivery'],230,restaurant stunning incredible outdoor seating...,5143,americanrestaurant stunning incredible outdoor...
4,Kong Sihk Tong 港食堂,$,4.0,New York,chinese hong kong style cafe,"['delivery', 'pickup']",300,kong sihk tong marvelous cantonese spot choice...,5427,chinese hong kong style cafekong sihk tong mar...


In [28]:
df = df[['name', 'price', 'rating', 'city', 'features']]
df.to_csv('train_features.csv', index='False')

### Model 

参数调整：
- BoW
- TF-IDF


模型：
- 根据输入餐厅名字进行推荐
- 根据输入一些关键词进行推荐，这个有点难

#### Restaurants + TFIDF

In [29]:
# Get the index of users input restaurant
def get_index_from_name(restaurant):
    return df[df.name == restaurant].index.values[0]

In [30]:
def similarity_model(df):
    tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), stop_words='english')
    tfidf_matrix = tf.fit_transform(df['features'])
    sg = cosine_similarity(tfidf_matrix)
    return sg

#### Some improvements
1. 在之前的模型中，我们输入的餐厅名字必须所有标点大小写空格都要跟数据中的餐厅名字匹配，这样的功能会让推荐系统的效率很低，因为很多时候我们只希望输入部分内容。所以需要改进一下餐厅名字的模糊匹配机制，即输入一个大概的餐厅名字(忽略大小写和一些数字标点)，返回匹配度最高的那个作为用户输入的餐厅。

In [33]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [41]:
def get_recommendations(df, num_recommendation=10):
    user_likes = input('Input you favorarite restaurant: ')
    
    restaurants = df.name.tolist()
    input_restaurant = process.extract(user_likes, restaurants, limit=1)[0][0]
    
    input_business_idx = get_index_from_name(input_restaurant)
    sg =  similarity_model(df)
    similar_restaurants = list(enumerate(sg[input_business_idx]))
    sorted_similar_restaurants = sorted(similar_restaurants, key=lambda x: x[1], reverse=True)
    recommend_list = sorted_similar_restaurants[:num_recommendation]
    recommend_idx = [idx for idx, score in recommend_list] 
    recommend_restaurant = df.iloc[recommend_idx]['name']
    user_input = df.iloc[recommend_idx[0]]['name']
    print('Finding your input restaurant: ', user_input)
    return recommend_restaurant

In [42]:
get_recommendations(df, 10)

Input you favorarite restaurant: Kong Sihk
Finding your input restaurant:  Kong Sihk Tong 港食堂


4              Kong Sihk Tong 港食堂
247                S Wan Cafe 洋紫荆
379            Fan Fried Rice Bar
55     Yin Ji Chang Fen Rice Roll
450      E Noodle 粥麵館 - Chinatown
680        Kam Boat Bakery & Cafe
130        Taiwan Pork Chop House
212         Double Chicken Please
520              Rice Noodle Cart
88      Tonii’s Fresh Rice Noodle
Name: name, dtype: object

#### 在这一步之前停下！需要你们去本地的数据集里对照餐厅情况。。。。。。

2. 在返回推荐的时候，除了推荐的餐厅名字，同时还能返回推荐餐厅的其他信息，如部分用户评论（3-5条），价格，rating等...可能需要连接我的数据库来做

In [12]:
import pymongo
from pymongo import MongoClient

In [23]:
connect = 'mongodb+srv://m001-student:m001-mongodb-basics@sandbox.jqgjp.mongodb.net/restaurant_info?retryWrites=true&w=majority'
cluster = MongoClient(connect)
db = cluster['yelp_dataset']
restaurant_collection = db['restaurants']  
review_collection = db['reviews']      

In [24]:
def extrct_restaurant_info(recommend_restaurants):
    recs = recommend_restaurants.tolist()
    return_info = restaurant_collection.find({'name': {'$in': recs}})
    result = []
    for res in return_info:
        result.append(res)
    rec_df = pd.DataFrame(result)
    return rec_df

In [25]:
def extract_restaurant_review(recommend_restaurants):
    recs = recommend_restaurants.tolist()
    all_reviews = []
    for rec in recs:
        ret = review_collection.find({'name': rec})
        for r in ret:
            all_reviews.append(r)
    temp_df = pd.DataFrame(all_reviews)[['business_id', 'name', 'review']]
    df = pd.DataFrame()
    for rec in recs:
        df = pd.concat([df, temp_df[temp_df['name'] == rec].reset_index().iloc[:3]], ignore_index=True)
    review_df = pd.DataFrame(df.groupby(['name'])['review'].apply(list))
    review_df = review_df['review'].apply(pd.Series, index=['review1', 'review2', 'review3']).reset_index()
    return review_df

In [44]:
def recommendations_info(recommend_restaurants):
    rec_info = extrct_restaurant_info(recommend_restaurants)
    rec_review = extract_restaurant_review(recommend_restaurants)
    recommendation_df = pd.merge(rec_info, rec_review, on='name')[['name', 'price', 'rating', 'address', 'category', 'review1', 'review2', 'review3']]
    print('Restaurants recommended for you:')
    return recommendation_df

In [45]:
recommend_restaurants = get_recommendations(df, 10)
recommendations = recommendations_info(recommend_restaurants)
recommendations

Input you favorarite restaurant: kong sihk
Finding your input restaurant:  Kong Sihk Tong 港食堂
Restaurants recommended for you:


Unnamed: 0,name,price,rating,address,category,review1,review2,review3
0,Kong Sihk Tong 港食堂,$,4.0,"65 Bayard St New York, NY 10013","[Chinese, Hong Kong Style Cafe]",I was so excited to try this place out for ove...,"This is a classic HK cha chan tang, and I'm so...",HK style cafe that's gaining a lot of traffic!...
1,Yin Ji Chang Fen Rice Roll,$$,4.0,"91 Bayard St New York, NY 10013","[Breakfast & Brunch, Dim Sum, Cantonese]",The rice roll noodles are to die for here. In ...,Ordered takeout from here and the food was rea...,Joe's Steam Rice Rolls take the spot as Number...
2,Tonii’s Fresh Rice Noodle,$,4.5,"83 Bayard St New York, NY 10013",[Noodles],If you are like me and look forward to cheung ...,One of the most affordable meals in all of New...,"Ordered takeout - a fresh shrimp rice roll, a ..."
3,Taiwan Pork Chop House,$,4.0,"3 Doyers St New York, NY 10013","[Taiwanese, Chinese, Noodles]",Was craving for some Taiwanese food as I haven...,Service was quick. We called and placed order...,Stopped by the hole in the wall for lunch on m...
4,Double Chicken Please,,4.5,"115 Allen St New York, NY 10002","[Cocktail Bars, Burgers, Sandwiches]","Absolutely love the sandwiches here, they're d...",The story behind DCP is just as beautiful as i...,The new spot during the pandemic in the Lower ...
5,S Wan Cafe 洋紫荆,$,4.5,"85 Eldridge St Lower E New York, NY 10002","[Chinese, Hong Kong Style Cafe]",If I could give this place 0 star I would. Foo...,I've been to Chinatown since I was a little ki...,This is the most authentic hk style cafe that ...
6,Fan Fried Rice Bar,$,4.5,"525 Dekalb Ave Brooklyn, NY 11205",[Taiwanese],My husband and I dropped by here in this small...,"This is a Uber eats delivery review, we will n...",OMG SO GOOOOOD. some days you just crave reall...
7,E Noodle 粥麵館 - Chinatown,$,4.0,"5 Catherine St New York, NY 10038","[Hong Kong Style Cafe, Noodles, Chinese]",Adding amazing dishes that was ordered and not...,Updated review for delivery only. \n\nGood but...,Can't wait to go back to this place to try mor...
8,Rice Noodle Cart,$,4.5,"153 Centre St New York, NY 10013","[Cantonese, Street Vendors]",You will know this Rice Noodle Cart because th...,"So delicious, I grabbed a plain rice noodle in...",For those who have never been exposed to the c...
9,Kam Boat Bakery & Cafe,$,4.5,"111 Bowery St New York, NY 10002","[Bakeries, Cafes, Cupcakes]",Kam Boat is a quintessential no-frills Chinese...,Really good shumai and shrimp rice crepes. Egg...,I love the HK milk tea here. They have real d...
