In [1]:
# TODO:
# unittest
# rating_better & rating_worse over time
# date_added, date_visited
# ratings_distribution

In [2]:
# Imports
import ast
import bs4
import collections
import copy
import json
import pandas
import re
import requests
import sqlite3

In [3]:
# Yelp business aliases to scrape, one per line (each is appended to
# https://www.yelp.com/biz/ when the page is requested).
yelp_alias_list = """
fuku-new-york-11
contra-new-york
razza-pizza-artigianale-jersey-city-2
""".split()

In [4]:
# date_visited = pandas.to_datetime(date_visited)

### Define

In [5]:
def get_yelp_alias(soup):
    """Return the business alias taken from the page's ``og:url`` meta tag.

    The alias is the last path segment of the URL stored in the tag's
    ``content`` attribute. A query string, fragment or trailing slash is
    removed first so that it cannot end up in (or replace) the alias.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``; the notebook passes
            a ``bs4.BeautifulSoup`` document.

    Returns:
        str: The alias, e.g. ``'contra-new-york'``.

    Raises:
        TypeError: If the page has no ``og:url`` meta tag (``find`` returned
            ``None``).
    """
    # TODO: test that input yelp_alias == yelp_alias
    og_url = soup.find('meta', {'property': 'og:url'})['content']
    # Drop "#fragment" and "?query" so only scheme://host/path remains
    path = og_url.split('#', 1)[0].split('?', 1)[0]
    # A trailing "/" would otherwise make the last segment an empty string
    return path.rstrip('/').split('/')[-1]

In [6]:
def get_yelp_biz_id(soup):
    """Return the ``content`` of the page's ``<meta name="yelp-biz-id">`` tag.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        The value stored under the tag's ``content`` attribute.
    """
    meta_tag = soup.find('meta', {'name': 'yelp-biz-id'})
    return meta_tag['content']

In [7]:
def get_is_closed(soup):
    """Tell whether the page title contains the marker ``CLOSED``.

    Args:
        soup: Parsed page exposing ``title.text``.

    Returns:
        bool: ``True`` if the upper-case text ``CLOSED`` occurs anywhere in
        the title text, otherwise ``False``.
    """
    page_title = soup.title.text
    return 'CLOSED' in page_title

In [8]:
def get_business_name(soup):
    """Return the business name shown in the ``<h1 class="biz-page-title">``.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        str: The heading text with surrounding whitespace removed.
    """
    heading = soup.find('h1', {'class': 'biz-page-title'})
    return heading.text.strip()

In [9]:
def get_rating(soup):
    """Return the rating read from the ``title`` of the ``i-stars`` div.

    The first whitespace-separated token of the title is converted to a
    float.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        float: The numeric rating.
    """
    stars = soup.find('div', {'class': 'i-stars'})
    title_words = stars['title'].split()
    return float(title_words[0])

In [10]:
def get_review_count(soup):
    """Return the number of reviews shown in the review-count span.

    The first whitespace-separated token of the span text is parsed as an
    integer. Thousands separators (``"1,234 reviews"``) are removed first,
    because ``int()`` rejects them.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        int: The review count.

    Raises:
        AttributeError: If the span is missing (``find`` returned ``None``).
        IndexError: If the span contains no text.
        ValueError: If the leading token is not a number.
    """
    element = soup.find('span', {'class': 'review-count rating-qualifier'})
    # split() with no argument already ignores leading/trailing whitespace
    leading_token = element.text.split()[0]
    return int(leading_token.replace(',', ''))

In [11]:
def get_ratings_history(soup):
    """Return the page's monthly ratings data as a JSON string.

    The raw value comes from the ``data-monthly-ratings`` attribute of the
    ``rating-details-modal-content`` div. It is parsed as JSON first, so
    ``null``/``true``/``false`` are accepted. If that fails, it is parsed
    as a Python literal, which is what this function originally did.
    Top-level keys are validated as integers before the result is
    serialised again.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        str: JSON text of the mapping (stringified so it can be stored in
        SQLite).

    Raises:
        TypeError: If the div is missing (``find`` returned ``None``).
        ValueError, SyntaxError: If the attribute is neither valid JSON nor
            a valid Python literal, or a top-level key is not an integer.
    """
    # TODO: store as JSON in PSQL?
    modal = soup.find('div', {'id': 'rating-details-modal-content'})
    raw = modal['data-monthly-ratings']
    try:
        history = json.loads(raw)
    except ValueError:
        # Not strict JSON (e.g. single-quoted); ast.literal_eval only
        # evaluates literals, so it is safe on scraped text.
        history = ast.literal_eval(raw)
    # int() rejects non-numeric keys; json.dumps writes the keys back as strings
    history = {int(key): value for key, value in history.items()}
    return json.dumps(history)    # stringify for sqlite

In [12]:
# def get_ratings_distribution(soup):
# ratings_distribution

In [13]:
def get_price_range(soup):
    """Return the price level as the number of ``$`` signs shown on the page.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        int or None: Count of ``$`` characters in the price-range span, or
        ``None`` when the page has no such span (the same convention
        ``get_neighborhood_list`` uses for a missing element).
    """
    element = soup.find('span', {'class': 'business-attribute price-range'})
    if element is None:
        # No price information on the page: report "unknown" instead of
        # failing with AttributeError
        return None
    return element.text.count('$')

In [14]:
def get_category_list(soup):
    """Return the business categories as a JSON-encoded list of strings.

    Walks the direct children of the ``category-str-list`` span, takes each
    child's ``.string`` and keeps only those without a newline. The
    remaining children containing a newline are treated as separators.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        str: JSON array text, e.g. ``'["Pizza", "Italian"]'``.
    """
    container = soup.find('span', {'class': 'category-str-list'})
    labels = []
    for child in container.contents:
        label = child.string
        if '\n' not in label:
            labels.append(label)
    return json.dumps(labels)

In [15]:
def get_address(soup):
    """Return the street address as one comma-separated line.

    Every ``<br>`` inside the ``<address>`` element is turned into a line
    break, so each address line becomes one part. Previously only the first
    ``<br>`` was replaced, and a missing ``<br>`` raised. The parts are
    stripped, empty ones are dropped, and the rest are joined with ``', '``.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        str or None: The formatted address, or ``None`` when the page has no
        street-address/address element (the same convention
        ``get_neighborhood_list`` uses for a missing element).
    """
    street = soup.find('strong', {'class': 'street-address'})
    address = street.find('address') if street is not None else None
    if address is None:
        return None
    address = copy.copy(address)    # Make a copy to prevent modifying the original document
    for line_break in address.find_all('br'):
        line_break.replace_with('\n')
    parts = (part.strip() for part in address.text.split('\n'))
    return ', '.join(part for part in parts if part)

In [16]:
def get_neighborhood_list(soup):
    """Return the neighborhoods as a JSON-encoded list, if the page has any.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        str or None: JSON array text built by splitting the stripped text of
        the ``neighborhood-str-list`` span on ``', '``. ``None`` when the
        span is absent.
    """
    span = soup.find('span', {'class': 'neighborhood-str-list'})
    if span is None:
        return None
    names = span.text.strip().split(', ')
    return json.dumps(names)

In [17]:
# date_visited_rating = ratings_history[date_visited.year][date_visited.month-1][1]
# most_recent_rating = ratings_history[max(ratings_history.keys())][-1][1]

### Testing

In [18]:
# Single business page used to try out the parsing steps in this section
yelp_alias = "the-kitchen-step-jersey-city"

In [19]:
# Set up soup: download the business page and parse it with the lxml parser
url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
r = requests.get(url, timeout=5)
r.raise_for_status()    # Fail here on a 4xx/5xx reply instead of parsing an error page
soup = bs4.BeautifulSoup(r.text, 'lxml')

In [20]:
# price_range: call the helper from the Define section rather than repeating its body
price_range = get_price_range(soup)
price_range

2

In [21]:
# TODO: test that ratings_distribution is a list of length 10
# TODO: test that ratings_distribution data averages to rating
# Flatten the histogram table's text into a list of comma-separated tokens
histogram = soup.find('table', {'class': 'histogram'})
flattened = histogram.text.replace('\n', ' ')
flattened = re.sub(' {2,}', ',', flattened)    # Runs of 2+ spaces become one comma
text = flattened.strip(',').split(',')

In [22]:
# Even positions of `text` are used as keys, odd positions as their values
keys = text[0::2]
vals = text[1::2]
ratings_distribution = {key: vals[position] for position, key in enumerate(keys)}

In [23]:
# Display the mapping built in the previous cell
ratings_distribution

{'1 star': '10',
 '2 stars': '17',
 '3 stars': '25',
 '4 stars': '75',
 '5 stars': '131'}

### Run

In [None]:
# Iterate: scrape every alias, collect one record per business, then build
# the frame once. DataFrame.append was removed in pandas 2.0 and re-copied
# the whole frame on every iteration.
records = []
for yelp_alias in yelp_alias_list:
    # Set up soup
    url = 'https://www.yelp.com/biz/{}'.format(yelp_alias)
    r = requests.get(url, timeout=5)
    r.raise_for_status()    # Stop on a 4xx/5xx reply instead of parsing an error page
    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Get data (insertion order here is the column order of all_data)
    record = collections.OrderedDict()
    record['yelp_alias'] = get_yelp_alias(soup)
    record['yelp_biz_id'] = get_yelp_biz_id(soup)
    record['is_closed'] = get_is_closed(soup)
    record['business_name'] = get_business_name(soup)
    record['rating'] = get_rating(soup)
    record['review_count'] = get_review_count(soup)
    record['ratings_history'] = get_ratings_history(soup)
    record['price_range'] = get_price_range(soup)
    record['category_list'] = get_category_list(soup)
    record['address'] = get_address(soup)
    record['neighborhood_list'] = get_neighborhood_list(soup)

    # Collect
    print([record['yelp_alias'], record['business_name']])
    records.append(record)

# dtype=object keeps every column as plain Python objects, matching what the
# previous row-by-row construction produced.
all_data = pandas.DataFrame(records, dtype=object)

### Test DB

In [None]:
# TODO: check data is valid before writing to DB
# Connect to the SQLite database file and get a cursor for the statements below
connection = sqlite3.connect(database='restaurants.db')
cursor = connection.cursor()

In [None]:
# Create table (dropped first, so every run starts from an empty table)
cursor.execute('drop table if exists restaurants')
cursor.execute("""
create table if not exists restaurants (
    yelp_alias text,
    yelp_biz_id text,
    is_closed boolean,
    business_name text,
    rating numeric,
    review_count integer,
    ratings_history json,
    price_range smallint,
    category_list text[],
    address text,
    neighborhood_list text[]
)""")

# Insert data
# One "?" placeholder per column, built once instead of on every row. The
# values are bound by sqlite3, never formatted into the SQL text.
placeholders = ', '.join(['?'] * all_data.shape[1])
query = 'insert into restaurants values ({})'.format(placeholders)
rows = all_data.values.tolist()
if rows:    # Nothing to insert (and no columns to bind) when the frame is empty
    cursor.executemany(query, rows)
connection.commit()

In [None]:
# Read the whole table back into a DataFrame
data = pandas.read_sql_query(sql='select * from restaurants', con=connection)

In [None]:
# Display the rows read back from the restaurants table
data