In [1]:
# TODO:
# gspread
# unittest
# PSQL; JSON datatype
# Move ratings_history to separate storage (for PSQL)
# rating_better & rating_worse over time (are the restaurants better/worse after I went?)

In [2]:
# date_visited_rating = ratings_history[date_visited.year][date_visited.month-1][1]
# most_recent_rating = ratings_history[max(ratings_history.keys())][-1][1]

In [3]:
# Imports
import ast
import bs4
import collections
import copy
import datetime
import json
import pandas
import random
import re
import requests
import sqlalchemy
import sqlite3
import time
import urllib
import IPython

### Define

In [4]:
def get_yelp_biz_id(soup):
    """Return the value of the page's ``yelp-biz-id`` <meta> tag.

    The id is returned as-is: it is an identifier, not a metric_name value,
    so it does not need to be serialized.
    """
    meta_tag = soup.find('meta', {'name': 'yelp-biz-id'})
    return meta_tag['content']

In [5]:
def get_yelp_alias(soup):
    """Return the alias, i.e. the last path segment of the page's ``og:url`` meta tag."""
    # TODO: test that input_yelp_alias == scraped_yelp_alias; url may get re-routed
    canonical_url = soup.find('meta', {'property': 'og:url'})['content']
    return canonical_url.rsplit('/', 1)[-1]

In [6]:
def get_is_closed(soup):
    """Return True when the page <title> text contains the marker 'CLOSED'."""
    page_title = soup.title.text
    return 'CLOSED' in page_title

In [7]:
def get_business_name(soup):
    """Return the whitespace-stripped text of the ``h1.biz-page-title`` heading."""
    heading = soup.find('h1', {'class': 'biz-page-title'})
    return heading.text.strip()

In [8]:
def get_rating(soup):
    """Return the rating as a float, parsed from the first token of the ``i-stars`` div title."""
    stars = soup.find('div', {'class': 'i-stars'})
    first_token = stars['title'].split()[0]
    return float(first_token)

In [9]:
def get_review_count(soup):
    """Return the review count as an int, parsed from the first token of the review-count span."""
    count_span = soup.find('span', {'class': 'review-count rating-qualifier'})
    leading_token = count_span.text.strip().split()[0]
    return int(leading_token)

In [10]:
def get_ratings_history(soup):
    """Return the monthly ratings history as a dict keyed by int.

    The data is read from the JSON in the ``data-monthly-ratings`` attribute of
    the ``rating-details-modal-content`` div; its string keys are cast to int.
    """
    modal = soup.find('div', {'id': 'rating-details-modal-content'})
    parsed = json.loads(modal['data-monthly-ratings'])
    ratings_history = {}
    for key, monthly_values in parsed.items():
        ratings_history[int(key)] = monthly_values
    return ratings_history

In [11]:
def get_ratings_distribution(soup):
    """Parse the ``histogram`` table into ``{'rating': [...], 'count': [...]}``.

    The table text is flattened into a comma-separated list of cells in which
    even positions hold a rating label (its first token is the number) and odd
    positions hold the matching count.
    """
    # Flatten the HTML table text: newlines -> spaces, runs of 2+ spaces -> one comma
    flattened = soup.find('table', {'class': 'histogram'}).text.replace('\n', ' ')
    flattened = re.sub(' {2,}', ',', flattened)
    cells = flattened.strip(',').split(',')

    # Even cells are rating labels, odd cells are counts
    ratings = [int(label.split()[0]) for label in cells[0::2]]
    counts = [int(cell) for cell in cells[1::2]]
    return {'rating' : ratings, 'count' : counts}

In [12]:
def get_price_range(soup):
    """Return the price range as the number of '$' characters shown on the page.

    Returns None when the page has no price-range element, mirroring how
    get_neighborhood_list treats a missing element, instead of failing with
    an AttributeError on None.
    """
    price_span = soup.find('span', {'class': 'business-attribute price-range'})
    if price_span is None:
        return None
    return price_span.text.count('$')

In [13]:
def get_category_list(soup):
    """Return the category names found in the ``category-str-list`` span.

    Each child node contributes its ``.string``; entries containing a newline
    (the separators between links) are dropped.
    """
    category_span = soup.find('span', {'class': 'category-str-list'})
    node_strings = [node.string for node in category_span.contents]
    categories = []
    for text in node_strings:
        if '\n' not in text:
            categories.append(text)
    return categories

In [14]:
def get_address(soup):
    """Return the <address> element as a single string with its lines joined by ', '."""
    # Work on a copy so that replacing the <br> tags does not modify the original document
    address_tag = copy.copy(soup.find('address'))
    for line_break in address_tag.find_all('br'):
        line_break.replace_with('\n')
    address_lines = address_tag.text.strip().split('\n')
    return ', '.join(address_lines)

In [15]:
def get_neighborhood_list(soup):
    """Return the neighborhoods listed on the page, or None if the page has none.

    The ``neighborhood-str-list`` span text is stripped and split on ', '.
    """
    neighborhood_span = soup.find('span', {'class': 'neighborhood-str-list'})
    if neighborhood_span is None:
        return None
    return neighborhood_span.text.strip().split(', ')

### Testing

In [16]:
# Sample alias used by the manual checks in this section
yelp_alias = 'ample-hills-creamery-jersey-city'

In [17]:
# Show the URL-quoted form of the alias (for this sample it comes back unchanged)
urllib.parse.quote_plus(yelp_alias)

'ample-hills-creamery-jersey-city'

In [18]:
# Fetch the sample business page once and parse it; the checks below reuse `soup`
url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
r = requests.get(url, timeout=5)
soup = bs4.BeautifulSoup(r.text, 'lxml')

In [19]:
# Exercise the helper defined above rather than repeating its parsing logic here
yelp_alias = get_yelp_alias(soup)
yelp_alias

'ample-hills-creamery-jersey-city'

In [20]:
# Exercise the helper defined above rather than repeating its parsing logic here
is_closed = get_is_closed(soup)
is_closed

False

In [21]:
# Exercise the helper defined above rather than repeating its parsing logic here
price_range = get_price_range(soup)
price_range

2

In [22]:
# Exercise the helper defined above rather than repeating its parsing logic here
rating = get_rating(soup)
rating

4.5

In [23]:
# Exercise the helper defined above rather than repeating its parsing logic here
# (get_address works on a copy, so `soup` itself is left untouched)
address = get_address(soup)
address

'200 Greene St, Jersey City, NJ 07302'

### Check

In [39]:
# engine = sqlalchemy.create_engine('postgresql://localhost/postgres')
# tables = pandas.read_sql_query("""
# select * from pg_catalog.pg_tables
# where schemaname = 'public'
# and tablename = 'yelp_raw'
# """, engine)
# tables

In [40]:
# Open (or create) the local SQLite database and list the tables it contains.
# Later cells look up names in `tables` to decide whether a table already exists.
connection = sqlite3.connect('restaurants.db')
tables = pandas.read_sql_query("""
select * from sqlite_master
where type = 'table'
""", connection)
tables

Unnamed: 0,type,name,tbl_name,rootpage,sql


In [41]:
# If yelp_raw already exists, show how many rows were inserted on each date.
# `display` is taken from IPython.display; the IPython.core.display path is deprecated.
from IPython.display import display

if 'yelp_raw' in tables['tbl_name'].values:
    yelp_raw = pandas.read_sql_query('select * from yelp_raw', connection)
    yelp_raw['insert_datetime'] = pandas.to_datetime(yelp_raw['insert_datetime'])
    yelp_raw['insert_date'] = yelp_raw['insert_datetime'].dt.date
    # Select the column before counting so only the needed column is aggregated
    count_by_insert_date = yelp_raw.groupby('insert_date')['yelp_biz_id'].count()
    count_by_insert_date = count_by_insert_date.rename('count').reset_index()
    display(count_by_insert_date)

### Run

In [27]:
# Load the hand-maintained list of businesses (one row per business, including its yelp_alias)
input_data = pandas.read_csv('input_data.csv')

In [28]:
# Preview the raw input rows before any cleaning
input_data.head()

Unnamed: 0,date_added,date_visited,business_name,yelp_alias,neighborhood
0,-,6/21/2014,Muk Eun Ji,muk-eun-ji-new-york,Koreatown
1,-,7/20/2015,RamenCo,ramenco-new-york-2,Financial District
2,-,7/24/2015,BLT Bar & Grill,blt-bar-and-grill-new-york,Financial District
3,-,7/24/2015,Bill’s Bar & Burger Downtown,bills-bar-and-burger-downtown-new-york,Financial District
4,-,8/21/2015,Ruchi,ruchi-new-york-2,Financial District


In [29]:
# Some food carts & bakeries have no alias yet; keep only the rows that have one
input_data = input_data.loc[input_data['yelp_alias'].notna()].reset_index(drop=True)

# '-' stands for N/A in the date_added column; turn it into None, then parse both date columns
input_data['date_added'] = pandas.to_datetime(
    input_data['date_added'].map(lambda value: None if value == '-' else value)
)
input_data['date_visited'] = pandas.to_datetime(input_data['date_visited'])

In [30]:
# TODO: proxies & IP pool?
class Connector:
    """ This class handles how to make a connection to a URL.

        Attributes:
            attempt_limit: maximum number of connection attempts per URL.
            sleep: seconds to wait after the first failed attempt; the wait
                doubles after each further failure within one make_soup call.
    """
    def __init__(self):
        self.attempt_limit = 3    # Attempt to connect N times
        self.sleep = 5            # Sleep N seconds after failing to connect

    def make_soup(self, url):
        """ Given a URL, attempt N times to connect.
            If successful, return soup object.

            Raises:
                Exception: if every attempt timed out. """
        # Keep the back-off local: doubling self.sleep would carry the longer
        # wait over to every later URL fetched with this Connector.
        backoff = self.sleep
        last_error = None
        for attempt in range(1, self.attempt_limit + 1):
            try:
                r = requests.get(url, timeout=5)
            # Timeout covers both connect and read timeouts
            except requests.exceptions.Timeout as error:
                last_error = error
                # No point in sleeping when there is no attempt left
                if attempt < self.attempt_limit:
                    print('\nCould not connect, sleeping {} seconds...'.format(backoff))
                    time.sleep(backoff)
                    backoff *= 2
            else:
                # Connection succeeded: pause for a random interval, then make soup
                rand_sleep = random.uniform(1, 3)
                time.sleep(rand_sleep)
                return bs4.BeautifulSoup(r.text, 'lxml')

        raise Exception('Failed to connect after {} attempts'.format(self.attempt_limit)) from last_error

In [31]:
def dict_to_df(data_i, yelp_biz_id=None):
    """ Convert dictionary to melted DataFrame format.
        data_i represents a single row in pivoted data.

        Args:
            data_i: mapping of metric_name -> metric value. It is not modified.
            yelp_biz_id: id written to every output row. When omitted, the id is
                read from the notebook-global `soup` (the previous behavior).

        Returns:
            DataFrame with one row per metric and the columns
            insert_datetime, yelp_biz_id, metric_name, metric_value,
            where metric_value is the JSON-serialized value. """
    if yelp_biz_id is None:
        # Backward-compatible fallback on global state; prefer passing the id explicitly
        yelp_biz_id = get_yelp_biz_id(soup)

    # Serialize metric_values (TODO: remove when migrating to PSQL?)
    # Built into new lists so the caller's dictionary is left untouched.
    metric_names = list(data_i.keys())
    metric_values = [json.dumps(value) for value in data_i.values()]

    # Melted layout: one row per metric
    melted = pandas.DataFrame({'metric_name': metric_names,
                               'metric_value': metric_values})
    melted.insert(0, 'yelp_biz_id', yelp_biz_id)
    melted.insert(0, 'insert_datetime', datetime.datetime.now())
    return melted

In [32]:
%%time
# Iterate
ybid_list = []    # Build list of yelp_biz_id, for input data
new_data = pandas.DataFrame()
connector = Connector()
for index, row in input_data.iterrows():
    # Set up
    print(index, end=" ")
    business_name = row['business_name']
    yelp_alias = row['yelp_alias']
    
    # Attempt to make soup
    url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
    soup = connector.make_soup(url)
    
    # Get data
    ybid_list.append(get_yelp_biz_id(soup))
    new_data_i = collections.OrderedDict()
    new_data_i['yelp_alias'] = get_yelp_alias(soup)
    new_data_i['is_closed'] = get_is_closed(soup)
    new_data_i['business_name'] = get_business_name(soup)
    new_data_i['rating'] = get_rating(soup)
    new_data_i['review_count'] = get_review_count(soup)
    new_data_i['ratings_history'] = get_ratings_history(soup)
    new_data_i['price_range'] = get_price_range(soup)
    new_data_i['category_list'] = get_category_list(soup)
    new_data_i['address'] = get_address(soup)
    new_data_i['neighborhood_list'] = get_neighborhood_list(soup)
    
    # Format & append
    new_data_i = dict_to_df(new_data_i)
    new_data = new_data.append(new_data_i, ignore_index=True)
print()

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 CPU times: user 38 s, sys: 1.03 s, total: 39 s
Wall time: 11min 24s


### input_data

In [33]:
# TODO: new_data vs. latest_data process should occur for both yelp_raw and input_data
# TODO: move this process to dvs

In [34]:
# Attach the scraped ids. ybid_list was filled in input_data row order and the
# assignment aligns on the 0..n-1 index. Then fix the column order.
input_data['yelp_biz_id'] = pandas.Series(ybid_list)
input_data = input_data[['date_added', 'date_visited',
                         'business_name', 'yelp_alias', 'yelp_biz_id',
                         'neighborhood']]

In [35]:
if 'input_data' in tables['tbl_name'].values:
    # Check whether any row stored in the database is missing from, or differs
    # from, the newest input data.
    # The date columns must be parsed on read: SQLite hands them back as strings,
    # which never compare equal to the datetime values in input_data, so without
    # parse_dates every stored row would look different and the check would always fail.
    latest_data = pandas.read_sql_query("""
    select * from input_data
    """, connection, parse_dates=['date_added', 'date_visited'])
    # input_data is concatenated twice so that all of its rows are dropped as
    # duplicates; what remains are database rows with no exact match in input_data.
    latest_data = pandas.concat([latest_data, input_data, input_data])
    latest_data = latest_data.drop_duplicates(keep=False)
    if latest_data.shape[0] != 0:
        raise Exception('There appears to be a problem.')
else:
    print('(input_data does not exist, creating the table...)')
    input_data.to_sql('input_data', connection, if_exists='replace', index=False)

(input_data does not exist, creating the table...)


### yelp_raw

In [42]:
# def incremental(new_data, latest_data)? 

In [43]:
if 'yelp_raw' in tables['tbl_name'].values:
    # If the table exists, reduce new data to only the incremental data
    print('(yelp_raw exists, creating incremental data)')
    # Latest stored row for each (yelp_biz_id, metric_name) pair
    latest_data = pandas.read_sql_query("""
    with last_updated as (
        select
        yelp_biz_id, metric_name,
        max(insert_datetime) as last_update_datetime
        from yelp_raw
        group by 1,2 order by 1,2
    )
    select a.*
    from yelp_raw a
    join last_updated b
    on a.yelp_biz_id = b.yelp_biz_id
    and a.metric_name = b.metric_name
    and a.insert_datetime = b.last_update_datetime
    order by 1,2,3
    """, connection)
    
    # Find which metric values changed from the latest data to the newest data
    # (Only metric values that changed get written to the database)
    # A left merge keeps exactly the freshly scraped rows; an outer merge would also
    # bring in rows that exist only in the database, whose "new" columns are all NaN.
    incr_data = new_data.merge(latest_data, how='left',
                               on=['yelp_biz_id', 'metric_name'],
                               suffixes=('_new', '_latest'))
    # Metrics with no stored value have a NaN "latest" value and therefore count as changed
    incr_data = incr_data[incr_data['metric_value_new'] != incr_data['metric_value_latest']]
    incr_data = incr_data[['insert_datetime_new', 'yelp_biz_id',
                           'metric_name', 'metric_value_new']]
    incr_data.columns = new_data.columns
    
    # Persist the changed metrics; previously they were computed but never written
    incr_data.to_sql('yelp_raw', connection, if_exists='append', index=False)
else:
    print('(yelp_raw does not exist, writing full data...)')
    new_data.to_sql('yelp_raw', connection, if_exists='append', index=False)

(yelp_raw does not exist, writing full data...)


### Read from DB

In [45]:
# Read the most recent row for every (yelp_biz_id, metric_name) pair:
# the CTE finds each pair's latest insert_datetime and the join keeps the matching rows.
data = pandas.read_sql_query("""
with last_updated as (
    select
    yelp_biz_id, metric_name,
    max(insert_datetime) as last_update_datetime
    from yelp_raw
    group by 1,2 order by 1,2
)
select a.*
from yelp_raw a
join last_updated b
on a.yelp_biz_id = b.yelp_biz_id
and a.metric_name = b.metric_name
and a.insert_datetime = b.last_update_datetime
order by 1,2,3
""", connection)

In [46]:
# Preview the melted rows; metric_value is still a JSON-encoded string here
data.head(10)

Unnamed: 0,insert_datetime,yelp_biz_id,metric_name,metric_value
0,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,address,"""34 W 32nd St, Fl 1, New York, NY 10001"""
1,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,business_name,"""Muk Eun Ji"""
2,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,category_list,"[""Korean"", ""Barbeque""]"
3,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,is_closed,true
4,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,neighborhood_list,"[""Koreatown"", ""Midtown West""]"
5,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,price_range,2
6,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,rating,3.5
7,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,ratings_history,"{""2016"": [[0, 4.0], [1, 3.5], [2, 3.0], [3, 3...."
8,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,review_count,365
9,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,yelp_alias,"""muk-eun-ji-new-york"""


In [47]:
# Decode every JSON-encoded metric_value string back into a Python object
data['metric_value'] = data['metric_value'].map(json.loads)

In [48]:
# Preview the same rows after decoding metric_value
data.head(10)

Unnamed: 0,insert_datetime,yelp_biz_id,metric_name,metric_value
0,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,address,"34 W 32nd St, Fl 1, New York, NY 10001"
1,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,business_name,Muk Eun Ji
2,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,category_list,"[Korean, Barbeque]"
3,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,is_closed,True
4,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,neighborhood_list,"[Koreatown, Midtown West]"
5,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,price_range,2
6,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,rating,3.5
7,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,ratings_history,"{'2016': [[0, 4.0], [1, 3.5], [2, 3.0], [3, 3...."
8,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,review_count,365
9,2018-09-04 21:11:45.402122,g_F9WJJpRFB40oPJdoD2uA,yelp_alias,muk-eun-ji-new-york


In [49]:
# Reshape from melted to wide: one row per yelp_biz_id, one column per metric_name
data = data.pivot(index='yelp_biz_id', columns='metric_name', values='metric_value')

In [50]:
# Preview the pivoted table
data.head()

metric_name,address,business_name,category_list,is_closed,neighborhood_list,price_range,rating,ratings_history,review_count,yelp_alias
yelp_biz_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-7b17IgC8MNVWDM-JV0-Tw,"9 E 53rd St, New York, NY 10022",Burger Heaven,"[Burgers, Salad, Sandwiches]",False,[Midtown East],2,3.0,"{'2016': [[0, 2.0], [1, 2.0], [2, 2.0], [3, 2....",206,burger-heaven-new-york
-lgNzAEyFr8LCTuBTWnAMg,"47 W 55th St, New York, NY 10019",Dim Sum Palace,"[Dim Sum, Seafood, Noodles]",False,[Midtown West],2,4.0,"{'2017': [[6, 4.5], [7, 3.5], [8, 4.0], [9, 3....",157,dim-sum-palace-new-york-7
0087AC_NnFRtC0c9FsB9pw,"17 Waverly Pl, New York, NY 10003",The Boil,"[Cajun/Creole, Seafood]",False,[Greenwich Village],2,4.0,"{'2016': [[4, 4.5], [5, 4.5], [6, 4.5], [7, 4....",561,the-boil-new-york-5
0CjK3esfpFcxIopebzjFxA,"9 Pell St, New York, NY 10013",Joe’s Shanghai,"[Shanghainese, Seafood, Venues & Event Spaces]",False,"[Chinatown, Civic Center]",2,4.0,"{'2016': [[0, 4.0], [1, 3.5], [2, 4.0], [3, 3....",5333,joes-shanghai-new-york-2
0LpWRWQx8Agm-mZtsOQ5gg,"40 Wall St, New York, NY 10005",Neapolitan Express,"[Pizza, Italian]",False,[Financial District],2,3.5,"{'2016': [[0, 3.0], [1, 3.0], [2, 3.0], [3, 4....",159,neapolitan-express-new-york-4


In [51]:
# Inspect the keys of the first business's ratings_history.
# .iloc[0] selects by position explicitly; the index holds yelp_biz_id strings, and
# plain [0] on such a Series relies on a positional fallback that pandas has deprecated.
data['ratings_history'].iloc[0].keys()

dict_keys(['2016', '2017', '2018', '2014', '2015'])

In [52]:
# List the distinct is_closed values present after decoding
data['is_closed'].unique()

array([False, True], dtype=object)

In [53]:
# Confirm the decoded is_closed value is a Python bool.
# .iloc[0] selects by position explicitly; plain [0] on this string-indexed Series
# relies on a positional fallback that pandas has deprecated.
type(data['is_closed'].iloc[0])

bool

### Data Tests

In [54]:
# TESTS:
# test that ratings_distribution_text is a list of length 10
# test that ratings_distribution data averages to rating
# check data before writing, if different throw error
#   - if yelp_biz_id changes in input_data, there is a problem
#   - (new categories, historical ratings_history changes, address changes)

In [55]:
# ratings_distribution = pandas.DataFrame(ratings_distribution)
# ratings_distribution['contrib'] = ratings_distribution['rating'] * ratings_distribution['count']
# ratings_distribution['contrib'].sum()/ratings_distribution['count'].sum()

In [56]:
# # TODO: unique_values handling for collections-based columns (i.e. ratings_history)
# unique_values = pandas.read_sql_query("""
#     select * from yelp_raw
#     where metric_name in ('yelp_alias', 'business_name')
#     """, connection) \
#       .groupby(['yelp_biz_id', 'metric_name']) \
#       .agg({'metric_value': ['unique']})
# unique_values.columns = ['metric_value']