In [1]:
# PROBLEMS:
# chefs club alias changed
#  - store yelp_alias_list; have a way to find original

In [2]:
#  - input data; load soup; get latest ratings history; get latest rating... (store history)
#  - need to store point-in-time history for: rating, review count, price_range
#  - only store changes (i.e. no new entry if business_name/rating/price_range does not change)
#  - to access data: max(insert_date) as last_updated, [data]
#  - for yelp_alias: [insert_date, yelp_biz_id (hopefully this is unique & doesn't change),
#                     metric_name (yelp_alias), metric_value (new_alias)]
#  - to retrieve all aliases: groupby(yelp_biz_id).make_list(yelp_alias)?
#  - first: build latest version of data, in order to determine if a data point has changed

In [3]:
# TODO:
# date_added, date_visited
# rating_better & rating_worse over time (are the restaurants better/worse after I went?)
# test PSQL locally; store as JSON
# check data before writing, if different throw error
#   - (i.e. new categories, historical ratings_history changes, address changes)
# gspread
# unittest

In [4]:
# TESTS:
# test that ratings_distribution_text is a list of length 10
# test that ratings_distribution data averages to rating

In [5]:
# date_visited_rating = ratings_history[date_visited.year][date_visited.month-1][1]
# most_recent_rating = ratings_history[max(ratings_history.keys())][-1][1]

In [6]:
# Imports
import ast
import collections
import copy
import datetime
import json
import re
import sqlite3
import urllib
import urllib.parse

import bs4
import pandas
import requests

### Define

In [7]:
def get_yelp_alias(soup):
    """Return the business alias taken from the page's ``og:url`` meta tag.

    The alias is the last segment of the URL *path*, so a query string,
    fragment or trailing slash on the canonical URL cannot leak into it.
    Percent-encoding is left untouched (non-ASCII aliases stay encoded).

    Parameters:
        soup: parsed page exposing ``find(name, attrs)``.

    Returns:
        str: the alias, e.g. ``'ruchi-new-york-2'``.

    Raises:
        TypeError: if the meta tag is missing (``find`` returned ``None``).
    """
    # TODO: test that input_yelp_alias == scraped_yelp_alias; url may get re-routed
    og_url = soup.find('meta', {'property': 'og:url'})
    # urlsplit isolates the path so '?query' and '#fragment' are dropped
    path = urllib.parse.urlsplit(og_url['content']).path
    return path.rstrip('/').split('/')[-1]

In [8]:
def get_yelp_biz_id(soup):
    """Return the ``content`` attribute of the ``<meta name="yelp-biz-id">`` tag."""
    biz_id_tag = soup.find('meta', {'name': 'yelp-biz-id'})
    return biz_id_tag['content']

In [9]:
def get_is_closed(soup):
    """Return True when the page ``<title>`` text contains the marker ``CLOSED``."""
    # Case-sensitive substring check on the title text
    return 'CLOSED' in soup.title.text

In [10]:
def get_business_name(soup):
    """Return the whitespace-trimmed text of the ``h1.biz-page-title`` heading."""
    title_heading = soup.find('h1', {'class': 'biz-page-title'})
    return title_heading.text.strip()

In [11]:
def get_rating(soup):
    """Return the star rating as a float.

    The value is the first whitespace-separated token of the ``title``
    attribute on the ``div.i-stars`` element.
    """
    stars_div = soup.find('div', {'class': 'i-stars'})
    leading_token = stars_div['title'].split()[0]
    return float(leading_token)

In [12]:
def get_review_count(soup):
    """Return the review count as an int.

    The value is the first whitespace-separated token of the text inside
    the ``span.review-count.rating-qualifier`` element.
    """
    count_span = soup.find('span', {'class': 'review-count rating-qualifier'})
    leading_token = count_span.text.strip().split()[0]
    return int(leading_token)

In [13]:
def get_ratings_history(soup):
    """Return the monthly ratings history as a JSON string.

    The ``data-monthly-ratings`` attribute of ``div#rating-details-modal-content``
    is parsed as a Python literal, its keys are passed through ``int`` and the
    result is serialised with ``json.dumps`` so it can be stored in sqlite.
    Note that JSON serialisation turns the integer keys back into strings.
    """
    modal = soup.find('div', {'id': 'rating-details-modal-content'})
    parsed = ast.literal_eval(modal['data-monthly-ratings'])
    by_year = {}
    for year, monthly in parsed.items():
        by_year[int(year)] = monthly
    return json.dumps(by_year)

In [14]:
def get_ratings_distribution(soup):
    """Return the ratings histogram as a JSON string.

    The text of ``table.histogram`` is flattened into a list of cells that
    alternate between a star label (e.g. ``'5 stars'``) and a count. The
    result is ``{"rating": [...], "count": [...]}`` serialised for sqlite.
    """
    histogram = soup.find('table', {'class': 'histogram'})

    # Flatten the table text: newlines -> spaces, runs of 2+ spaces -> one comma
    flattened = histogram.text.replace('\n', ' ')
    cells = re.sub(' {2,}', ',', flattened).strip(',').split(',')

    # Even positions hold the star labels, odd positions the counts
    star_tokens = [label.split()[0] for label in cells[0::2]]
    distribution = {
        'rating' : [int(token) for token in star_tokens],
        'count' : [int(cell) for cell in cells[1::2]],
    }
    return json.dumps(distribution)    # stringify for sqlite

In [15]:
# ratings_distribution = pandas.DataFrame(ratings_distribution)
# ratings_distribution['contrib'] = ratings_distribution['rating'] * ratings_distribution['count']
# ratings_distribution['contrib'].sum()/ratings_distribution['count'].sum()

In [16]:
def get_price_range(soup):
    """Return the price range as the number of ``$`` characters shown on the page.

    Parameters:
        soup: parsed page exposing ``find(name, attrs)``.

    Returns:
        int | None: count of ``$`` in the text of
        ``span.business-attribute.price-range``, or ``None`` when the page has
        no such element (mirrors the ``None`` handling in ``get_neighborhood_list``).
    """
    price_span = soup.find('span', {'class': 'business-attribute price-range'})
    if price_span is None:
        # Missing element: report "unknown" instead of raising AttributeError
        return None
    return price_span.text.count('$')

In [17]:
def get_category_list(soup):
    """Return the business categories as a JSON-encoded list of strings.

    Walks the direct children of ``span.category-str-list`` and keeps the
    ``.string`` of each child, skipping any that contain a newline.
    """
    container = soup.find('span', {'class': 'category-str-list'})
    labels = [child.string for child in container.contents]
    categories = []
    for label in labels:
        # Strings containing a newline are dropped
        if '\n' not in label:
            categories.append(label)
    return json.dumps(categories)    # stringify for sqlite

In [18]:
def get_address(soup):
    """Return the street address as a single comma-separated line.

    Works on a copy of the ``<address>`` element so the parsed document is
    left untouched: every ``<br>`` in the copy becomes a newline, then the
    text is stripped, split on newlines and joined with ``', '``.
    """
    # Copy first so replacing the <br> tags does not modify the original document
    address_copy = copy.copy(soup.find('address'))
    for line_break in address_copy.find_all('br'):
        line_break.replace_with('\n')
    address_lines = address_copy.text.strip().split('\n')
    return ', '.join(address_lines)

In [19]:
def get_neighborhood_list(soup):
    """Return the neighborhoods as a JSON string.

    The text of ``span.neighborhood-str-list`` is stripped and split on
    ``', '``. When the element is absent the result is the JSON ``null``.
    """
    span = soup.find('span', {'class': 'neighborhood-str-list'})
    if span is None:
        neighborhoods = None
    else:
        neighborhoods = span.text.strip().split(', ')
    return json.dumps(neighborhoods)    # stringify for sqlite

### Testing

In [20]:
# Sample alias containing non-ASCII characters, used to try out the scraping steps below
yelp_alias = 'momofuku-má-pêche-new-york-2'

In [21]:
# Show the percent-encoded form of the sample alias
urllib.parse.quote_plus(yelp_alias)

'momofuku-m%C3%A1-p%C3%AAche-new-york-2'

In [22]:
# Fetch the business page for the sample alias (5 second timeout) and parse it with lxml
url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
r = requests.get(url, timeout=5)
soup = bs4.BeautifulSoup(r.text, 'lxml')

In [23]:
# Exercise the helper itself rather than a copy of its body, so this check
# cannot drift from the function used in the scraping loop
yelp_alias = get_yelp_alias(soup)
yelp_alias

'momofuku-m%C3%A1-p%C3%AAche-new-york-2'

In [24]:
# Exercise the helper itself rather than a copy of its body
price_range = get_price_range(soup)
price_range

3

In [25]:
# Exercise the helper itself rather than a copy of its body
rating = get_rating(soup)
rating

3.5

In [26]:
# Exercise the helper itself rather than a copy of its body
address = get_address(soup)
address

'Chambers Hotel, 15 W 56th St, New York, NY 10019'

### Run

In [27]:
# Load the alias list; later cells read its yelp_alias, date_added and date_visited columns
yelp_aliases = pandas.read_csv('yelp_aliases.csv')

In [28]:
# A '-' in date_added is treated as missing: swap it for None before parsing
yelp_aliases['date_added'] = pandas.to_datetime(
    yelp_aliases['date_added'].apply(lambda value: value if value != '-' else None)
)
# date_visited is parsed as-is
yelp_aliases['date_visited'] = pandas.to_datetime(yelp_aliases['date_visited'])

In [29]:
# Some food carts & bakeries have no alias yet; leave those rows out for now
yelp_aliases = yelp_aliases.dropna(subset=['yelp_alias'])

In [30]:
# # PROBLEM: chefs club alias changed
# connection = sqlite3.connect('restaurants.db')
# already_done = pandas.read_sql_query('select * from restaurants', connection)
# already_done = already_done['yelp_alias'].tolist()
# not_done = yelp_aliases['yelp_alias'].apply(
#     lambda x: urllib.parse.quote_plus(x) not in already_done)
# yelp_aliases = yelp_aliases[not_done]

In [31]:
# Open the SQLite database and list the tables it currently contains
connection = sqlite3.connect('restaurants.db')
tables = pandas.read_sql_query("""
select * from sqlite_master
where type = 'table'""", connection)

In [32]:
# Display the table listing
tables

Unnamed: 0,type,name,tbl_name,rootpage,sql


In [33]:
# Report whether yelp_raw is already present in the database (informational only)
if 'yelp_raw' not in tables['tbl_name'].values:
    print('(yelp_raw does not exist, create table)')
else:
    print('(yelp_raw exists, incremental update)')

(yelp_raw does not exist, create table)


In [34]:
%%time
# Iterate
# Scrape each alias and build one long-format frame per business with columns
# (insert_datetime, yelp_biz_id, metric_name, metric_value).
# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
frames = []
for yelp_alias in yelp_aliases['yelp_alias']:
    # Set up soup
    url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
    r = requests.get(url, timeout=5)
    r.raise_for_status()    # fail on HTTP errors instead of parsing an error page
    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Get data
    data = collections.OrderedDict()
    data['yelp_alias'] = get_yelp_alias(soup)
    data['is_closed'] = get_is_closed(soup)
    data['business_name'] = get_business_name(soup)
    data['rating'] = get_rating(soup)
    data['review_count'] = get_review_count(soup)
    data['ratings_history'] = get_ratings_history(soup)
    data['price_range'] = get_price_range(soup)
    data['category_list'] = get_category_list(soup)
    data['address'] = get_address(soup)
    data['neighborhood_list'] = get_neighborhood_list(soup)
    print([data['yelp_alias'], data['business_name']])

    # Melt: one row per metric, tagged with the business id and scrape time
    data = pandas.Series(data).to_frame().T
    data = data.melt()
    data.columns = ['metric_name', 'metric_value']
    data.insert(0, 'yelp_biz_id', get_yelp_biz_id(soup))
    data.insert(0, 'insert_datetime', datetime.datetime.now())

    # Collect
    frames.append(data)

# concat raises on an empty list, so fall back to an empty frame when nothing was scraped
all_data = pandas.concat(frames, ignore_index=True) if frames else pandas.DataFrame()

['muk-eun-ji-new-york', 'Muk Eun Ji']
['ramenco-new-york-2', 'RamenCo']
['blt-bar-and-grill-new-york', 'BLT Bar & Grill']
['bills-bar-and-burger-downtown-new-york', 'Bill’s Bar & Burger Downtown']
['ruchi-new-york-2', 'Ruchi']
['komegashi-too-jersey-city', 'Komegashi Too']
['smashburger-new-york-6', 'Smashburger']
['harrys-italian-new-york-2', 'Harry’s Italian']
['goa-taco-new-york', 'goa taco']
['morgensterns-finest-ice-cream-new-york-2', 'Morgenstern’s Finest Ice Cream']
['vanessas-dumpling-house-new-york-2', 'Vanessa’s Dumpling House']
['philip-marie-new-york', 'Philip Marie']
['joes-shanghai-new-york-2', 'Joe’s Shanghai']
['oka-sushi-new-york', 'Oka Sushi']
['pisillo-italian-panini-new-york', 'Pisillo Italian Panini']
['friedmans-new-york-59', 'Friedman’s']
['prince-street-pizza-new-york-2', 'Prince Street Pizza']
['the-crooked-knife-new-york-3', 'The Crooked Knife']
['ramen-ya-new-york-5', 'Ramen-Ya']
['hamilton-pork-jersey-city', 'Hamilton Pork']
['the-malt-house-new-york-3', 'Th

### Write to DB

In [35]:
# Append the scraped rows to the yelp_raw table.
# if_exists='append' adds to an existing table instead of replacing it; the DataFrame index is not written.
connection = sqlite3.connect('restaurants.db')
all_data.to_sql('yelp_raw', connection, if_exists='append', index=False)

### Read from DB

In [70]:
# Load every stored row of yelp_raw back into a DataFrame (long format: one row per metric)
data = pandas.read_sql_query('select * from yelp_raw', connection)

In [71]:
# Preview the first rows of the long-format data
data.head()

Unnamed: 0,insert_datetime,yelp_biz_id,metric_name,metric_value
0,2018-08-29 23:35:20.399598,g_F9WJJpRFB40oPJdoD2uA,yelp_alias,muk-eun-ji-new-york
1,2018-08-29 23:35:20.399598,g_F9WJJpRFB40oPJdoD2uA,is_closed,1
2,2018-08-29 23:35:20.399598,g_F9WJJpRFB40oPJdoD2uA,business_name,Muk Eun Ji
3,2018-08-29 23:35:20.399598,g_F9WJJpRFB40oPJdoD2uA,rating,3.5
4,2018-08-29 23:35:20.399598,g_F9WJJpRFB40oPJdoD2uA,review_count,365


In [72]:
# yelp_raw is append-only, so once a business has been scraped more than once it
# has several rows per metric and a plain pivot raises on the duplicate
# (yelp_biz_id, metric_name) pairs. Keep only the most recent row per pair first;
# insert_datetime is stored as 'YYYY-MM-DD HH:MM:SS.ffffff', which sorts chronologically.
data = (
    data
    .sort_values('insert_datetime')
    .drop_duplicates(subset=['yelp_biz_id', 'metric_name'], keep='last')
    .pivot(index='yelp_biz_id', columns='metric_name', values='metric_value')
)

In [73]:
# Preview the wide frame: one row per yelp_biz_id, one column per metric
data.head()

metric_name,address,business_name,category_list,is_closed,neighborhood_list,price_range,rating,ratings_history,review_count,yelp_alias
yelp_biz_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-7b17IgC8MNVWDM-JV0-Tw,"9 E 53rd St, New York, NY 10022",Burger Heaven,"[""Burgers"", ""Salad"", ""Sandwiches""]",0,"[""Midtown East""]",2,3.0,"{""2016"": [[0, 2.0], [1, 2.0], [2, 2.0], [3, 2....",204,burger-heaven-new-york
-lgNzAEyFr8LCTuBTWnAMg,"47 W 55th St, New York, NY 10019",Dim Sum Palace,"[""Dim Sum"", ""Seafood"", ""Noodles""]",0,"[""Midtown West""]",2,4.0,"{""2017"": [[6, 4.5], [7, 3.5], [8, 4.0], [9, 3....",156,dim-sum-palace-new-york-7
0087AC_NnFRtC0c9FsB9pw,"17 Waverly Pl, New York, NY 10003",The Boil,"[""Cajun/Creole"", ""Seafood""]",0,"[""Greenwich Village""]",2,4.0,"{""2016"": [[4, 4.5], [5, 4.5], [6, 4.5], [7, 4....",561,the-boil-new-york-5
0CjK3esfpFcxIopebzjFxA,"9 Pell St, New York, NY 10013",Joe’s Shanghai,"[""Shanghainese"", ""Seafood"", ""Venues & Event Sp...",0,"[""Chinatown"", ""Civic Center""]",2,4.0,"{""2016"": [[0, 4.0], [1, 3.5], [2, 4.0], [3, 3....",5322,joes-shanghai-new-york-2
0LpWRWQx8Agm-mZtsOQ5gg,"40 Wall St, New York, NY 10005",Neapolitan Express,"[""Pizza"", ""Italian""]",0,"[""Financial District""]",2,3.5,"{""2016"": [[0, 3.0], [1, 3.0], [2, 3.0], [3, 4....",159,neapolitan-express-new-york-4


In [74]:
# Decode the stored JSON strings back into dicts (year keys come back as strings).
# NOTE(review): not re-runnable as-is; json.loads rejects values that are already dicts.
data['ratings_history'] = data['ratings_history'].apply(json.loads)

In [75]:
# The frame is indexed by yelp_biz_id strings, so take the first row by position
# with .iloc; an integer key in [] relies on a deprecated positional fallback.
data['ratings_history'].iloc[0].keys()

dict_keys(['2016', '2017', '2018', '2014', '2015'])