In [1]:
import re
import time
import pdb
import sys
from collections import namedtuple
from operator import itemgetter

import psycopg2
from psycopg2.extras import NamedTupleConnection

import requests
from urlparse import urljoin
from bs4 import BeautifulSoup

## _This Notebook: Scrape reviews from complete entries in the "restaurants" table, clean them, build the "reviews" table, and update the restaurants table (number of reviews, pricing info, review score)_

#--------------------------------------------------------------------------------------------

#### Scrape For:
1. Number of Reviews
2. Up to 50 reviews
3. Rating (distribution?)
4. Price Range

#### TODO
1. Test beautifulsoup grabs for (1-5)
2. Design db_interfacer, yelp_website_interfacer, yelp_page_parser

### Proof of Concept

In [66]:
# Fetch one known business page and parse it, so the selector experiments in
# the following cells have a `soup` to work against.
base_yelp_biz_url = 'http://www.yelp.com/biz/'
sample_biz_id = "prova-new-york"
restaurant_url = urljoin(base_yelp_biz_url, sample_biz_id)

# Request the URL built above rather than re-concatenating the two pieces, so
# there is a single source of truth for the address being fetched.
response = requests.get(url = restaurant_url)
soup = BeautifulSoup(response.content)

#### Review Score

In [69]:
# The first <meta> tag inside the header's left column holds the average
# rating in its `content` attribute.
selection = soup.select('div.biz-page-header-left meta')[0]
float(selection['content'])

4.5

#### Review Count

Total

In [70]:
# Total number of reviews, read from the span marked itemprop=reviewCount.
selection = soup.select('div.biz-page-header-left span[itemprop=reviewCount]')[0]
int(selection.get_text())

59

English

In [71]:
# Count shown in the feed sort/filter bar; the text is padded with
# whitespace, so trim it before converting.
selection = soup.select('div.feed-sorts-filters.clearfix span.count')[0]
int(selection.get_text().strip())

58

#### Price Info

Dollar Signs

In [72]:
# The price tier is the number of characters in the price-range span.
selection = soup.select('div.iconed-list-avatar > span.business-attribute.price-range')[0]
len(selection.get_text())

2

Price Range

In [73]:
# Drop the leading character of the trimmed description, split on '-' and
# convert both halves to integers.
selection = soup.select('dd.nowrap.price-description')[0]
low, high = map(int, selection.get_text().strip()[1:].split('-'))
print("{0} {1}".format(low, high))

11 30


#### Reviews

In [74]:
# Walk every review container on the page and echo its star rating,
# publication date and body text.
reviews = soup.select('div.review-list li div.review-content')
for review_info_soup in reviews:

    score = float(review_info_soup.select('meta[itemprop=ratingValue]')[0]['content'])
    date = review_info_soup.select('meta[itemprop=datePublished]')[0]['content']
    review_text = review_info_soup.select('p[itemprop=description]')[0].get_text()

    print("Date: {0}".format(date))
    print("Score: {0}".format(score))
    print("Review: ")
    print(review_text)
    print("***********************************")

Date: 2015-06-14
Score: 5.0
Review: 
Low key, fantastic drinks, quality ingredients, and handsome Italian bartenders? Sign me up please! This review is for the bar only, although I'll definitely be back to try the pizzas which looked and smelled amazing. Stopped in here for cocktails with a friend before her birthday dinner and could have stayed all night. The specialty cocktails were delicious, specifically the "just got fired" - mezcal, lemon, jalapeños, and magic were the key ingredients I believe. The staff was warm and friendly. The bartender even gave our friend a drink recipe to try on their own at home. Looking forward to going back
***********************************
Date: 2015-06-09
Score: 5.0
Review: 
Easy to get a last minute reservation for a fairly big party. The pizzas were really good. Also, $6 wines during happy hour.  I'll just leave the review at that. I mean, $6 wines. HELLO.
***********************************
Date: 2015-05-28
Score: 4.0
Review: 
I came here with 5

Implementation Goals:
Parallel Gets
Live Parsing
Clean Separation:
1. Pulling Restaurant ids from DB
2. Pagination:
 - Pull first page, record number of english reviews
 - for i = 1 to i s.t. 40*i > # of english reviews
 - (pull page)

2a. Pulling Webpage
3. Parsing Response
4. Recording Data

Standard for format of data passed back and forth.

**YelpScrapeCoordinator**

init
(pull all (camis, yelp_id) - sorted by camis)
(set called pulled camis?)
(YelpPaginator)
cur
_open_conn
_close_conn
_get_ids
((camis, yelp_id) for init)

pull_all()
(pull all yelp_ids)
(record)
pull_n(from, to)
(record)
_record


**YelpPaginator**
init(number of reviews)
(YelpWebsiteInterfacer)
pull(base_url)
[from first page:
(get (reviews: (date, rating, review_text), overall_rating, price info, english reviews count)
for i = 1 to max i s.t. 40*i <= # of english reviews
pull page ?start=40*i
return list of all page contents
]

**YelpWebsiteInterfacer**
init
(YelpPageParser)
pull(url, params)
response = requests.get(url, params)
parser = ((reviews: (date, ratings, review_text)), overall_rating, dollar_signs, price_lower, price_upper, english_review_count

**YelpPageParser**
parse(page content)



errors to be aware of: parse_errors, connection_error, db_errors, encoding errors

In [48]:
class YelpPageParser():
    """Pull rating, review-count, price and review data out of one Yelp page.

    `parse` takes the raw HTML of a business page and returns a
    `Single_Page_Extract` namedtuple with the fields `avg_rating`,
    `total_count`, `english_count`, `dollar_signs`, `price_range` and
    `reviews`.

    Every `_get_*` helper indexes the first match of a CSS selector, so a
    page that lacks the expected element raises IndexError.
    """

    def __init__(self):

        # namedtuple class used to package the result of every parse() call.
        self._PageExtract = self._build_packager()

    def _build_packager(self):
        """Return the namedtuple class that parse() results are packed into."""

        fields = ['avg_rating', 'total_count', 'english_count',
                 'dollar_signs', 'price_range', 'reviews']

        return namedtuple('Single_Page_Extract', fields)

    def parse(self, raw_html):
        """Parse the HTML of one business page.

        Args:
            raw_html: page markup, as returned by `response.content`.

        Returns:
            A `Single_Page_Extract` namedtuple (see the class docstring).

        Raises:
            IndexError: an expected element is missing from the page.
            ValueError: a number on the page could not be converted.
        """

        # NOTE(review): no parser is named here, so BeautifulSoup picks the
        # best one installed; the parse tree can differ between machines.
        soup = BeautifulSoup(raw_html)
        avg_rating = self._get_avg_rating(soup)
        total_count, english_count = self._get_review_counts(soup)
        dollar_signs, price_range = self._get_price_info(soup)
        reviews = self._get_reviews(soup)

        return self._package_results(avg_rating = avg_rating,
                                    total_count = total_count,
                                    english_count = english_count,
                                    dollar_signs = dollar_signs,
                                    price_range = price_range,
                                    reviews = reviews
                                    )

    def _get_avg_rating(self, page_soup):
        """Return the average rating as a float.

        The value is read from the `content` attribute of the first <meta>
        tag inside `div.biz-page-header-left`.
        """

        selection = page_soup.select('div.biz-page-header-left meta')[0]
        return float(selection.attrs['content'])

    def _get_review_counts(self, page_soup):
        """Return `(total_count, english_count)` as integers."""

        selection = page_soup.select('div.biz-page-header-left span[itemprop=reviewCount]')[0]
        total_count = int(selection.text)

        # The count in the sort/filter bar is padded with whitespace.
        selection = page_soup.select('div.feed-sorts-filters.clearfix span.count')[0]
        english_count = int(selection.text.strip())

        return (total_count, english_count)

    def _get_price_info(self, page_soup):
        """Return `(dollar_signs, price_range)` for the business.

        `dollar_signs` is the length of the price-range span's text.
        `price_range` is the tuple produced by `_parse_price_range`.
        """

        selection = page_soup.select('div.iconed-list-avatar > span.business-attribute.price-range')[0]
        dollar_signs = len(selection.text)

        selection = page_soup.select('dd.nowrap.price-description')[0]
        price_range = self._parse_price_range(selection.text)

        return dollar_signs, price_range

    def _parse_price_range(self, description):
        """Turn a price description into a tuple of integer bounds.

        A description with two amounts, such as "$11-30", yields
        `(11, 30)`. A description with a single amount is treated as
        open-ended: the missing bound is None.

        Raises:
            ValueError: the description contains no digits at all.
        """

        amounts = [int(digits) for digits in re.findall(r'\d+', description)]

        if len(amounts) >= 2:
            return tuple(amounts)

        if len(amounts) == 1:
            # NOTE(review): assumes open-ended tiers read like "Under $10"
            # and "Above $61" -- confirm against live pages.
            if re.search(r'under|below|less', description, re.IGNORECASE):
                return (None, amounts[0])
            return (amounts[0], None)

        raise ValueError(
            "No price found in description: {0!r}".format(description))

    def _get_reviews(self, page_soup):
        """Return a list of `(date, rating, review_text)` tuples.

        One tuple is produced per `div.review-content` found in the
        review list, in page order.
        """

        reviews = []

        review_soup_list = page_soup.select('div.review-list li div.review-content')
        for review_soup in review_soup_list:

            rating_soup = review_soup.select('meta[itemprop=ratingValue]')
            rating = float(rating_soup[0].attrs['content'])

            date_soup = review_soup.select('meta[itemprop=datePublished]')
            date = date_soup[0].attrs['content']

            # Kept under its own name so the loop variable is not rebound.
            text_soup = review_soup.select('p[itemprop=description]')
            review_text = text_soup[0].text

            reviews.append((date, rating, review_text))

        return reviews

    def _package_results(self, **kwargs):
        """Build a `Single_Page_Extract` from keyword arguments."""

        return self._PageExtract(**kwargs)

    def _report_parse_error(self):
        # To implement once I get an idea of what errors beautiful soup throws.
        pass
        
    
    

In [20]:
# Fetch the sample business page that the parser is exercised against below.
base_yelp_biz_url = 'http://www.yelp.com/biz/'
sample_biz_id = "prova-new-york"
sample_restaurant_url = urljoin(base_yelp_biz_url, sample_biz_id)

sample_response = requests.get(url = sample_restaurant_url)
# Read the body of the response fetched in this cell. The earlier code used
# `response`, a leftover from the proof-of-concept cell, which only worked
# through hidden kernel state.
sample_html = sample_response.content

In [32]:
# Parser instance reused by the cells below.
ypp = YelpPageParser()

In [36]:
# Run the parser over the sample page; the result is a namedtuple whose
# fields are printed in the next cell.
page_extract = ypp.parse(sample_html)

In [42]:
# Spot-check every field of the extract; only the first review is shown,
# together with how many reviews were collected from the page.
print(page_extract.avg_rating)
print(page_extract.dollar_signs)
print(page_extract.english_count)
print(page_extract.price_range)
print(page_extract.reviews[0])
print(len(page_extract.reviews))
print(page_extract.total_count)

4.5
2
58
(11, 30)
('2015-06-14', 5.0, u'Low key, fantastic drinks, quality ingredients, and handsome Italian bartenders? Sign me up please! This review is for the bar only, although I\'ll definitely be back to try the pizzas which looked and smelled amazing. Stopped in here for cocktails with a friend before her birthday dinner and could have stayed all night. The specialty cocktails were delicious, specifically the "just got fired" - mezcal, lemon, jalape\xf1os, and magic were the key ingredients I believe. The staff was warm and friendly. The bartender even gave our friend a drink recipe to try on their own at home. Looking forward to going back')
40
59


In [43]:
class YelpWebsiteInterfacer():
    """Download pages over HTTP, retrying on connection problems.

    Args:
        max_get_attempts: how many times get_page tries a URL before
            giving up.
        timeout: seconds to wait on a request before counting the attempt
            as failed. Without a timeout a stalled connection blocks
            forever and the retry loop never gets a chance to run.
    """

    def __init__(self, max_get_attempts=3, timeout=30):

        self.MAX_GET_ATTEMPTS = max_get_attempts
        self.TIMEOUT = timeout

    def get_page(self, url):
        """Return the body of `url`, or None if every attempt failed.

        Each failed attempt is reported through
        `_report_connection_error`. The HTTP status code is not checked,
        so the body of an error page is returned like any other.
        """

        for attempt in range(1, self.MAX_GET_ATTEMPTS + 1):

            try:
                response = requests.get(url, timeout=self.TIMEOUT)
                return response.content

            # A read timeout is not a ConnectionError in requests, so it
            # has to be caught explicitly.
            except (requests.ConnectionError, requests.Timeout):
                self._report_connection_error(url, attempt)

        return None

    def _report_connection_error(self, url, k):
        """Print a notice that attempt number `k` at fetching `url` failed."""

        print('************************')
        print("Connection Error encountered on attempt number {0}.".format(k))
        print("URL:")
        print(url)

        if k < self.MAX_GET_ATTEMPTS:
            print("Attempting to get page again...")
        else:
            print("Will not attempt to get this page anymore.")

        print('************************')
        

In [46]:
# Smoke-test the downloader against the sample business URL built earlier.
ywi = YelpWebsiteInterfacer()
page_html = ywi.get_page(sample_restaurant_url)

In [None]:
# **YelpPaginator**
# init(number of reviews)
# (YelpWebsiteInterfacer)
# pull(base_url)
# [from first page:
# (get (reviews: (date, rating, review_text), overall_rating, price info, english reviews count)
# for i = 1 to max i s.t. 40*i <= # of english reviews
# pull page ?start=40*i
# return list of all page contents
# ]

In [None]:
class YelpPaginator():
    """Collect the reviews of one business across its paginated pages.

    The first page supplies the business-level fields (average rating,
    counts, price info). Later pages are requested with `?start=<offset>`
    and only contribute their reviews.

    Args:
        max_reviews: upper bound on the number of reviews returned per
            business.
    """

    # Reviews listed per page; also the step of the ?start= offset.
    # The sample page in this notebook yields 40 reviews.
    REVIEWS_PER_PAGE = 40

    def __init__(self, max_reviews):

        self.MAX_REVIEWS = max_reviews
        self.BASE_YELP_BIZ_URL = 'http://www.yelp.com/biz/'
        self.web_interfacer = YelpWebsiteInterfacer()
        self.page_parser = YelpPageParser()

    def pull_business(self, yelp_biz_id):
        """Return the combined extract for the business `yelp_biz_id`.

        Returns None when the first page cannot be downloaded.
        """

        base_page_url = urljoin(self.BASE_YELP_BIZ_URL, yelp_biz_id)
        # _paginate_pull is a method, so it must be reached through self.
        full_extract = self._paginate_pull(base_page_url)
        return full_extract

    def _paginate_pull(self, base_page_url):
        """Download and parse pages until enough reviews are collected.

        Returns the first page's extract with its `reviews` field replaced
        by the reviews gathered from all pages, truncated to MAX_REVIEWS.
        Returns None if the first page cannot be downloaded.
        """

        page_html = self.web_interfacer.get_page(base_page_url)
        if page_html is None:
            return None

        # The parser object is not callable; its entry point is parse().
        restaurant_extract = self.page_parser.parse(page_html)

        english_count = restaurant_extract.english_count
        reviews = list(restaurant_extract.reviews)
        reviews_wanted = min(self.MAX_REVIEWS, english_count)

        # NOTE(review): stops at offsets strictly below the English review
        # count, so a count that is an exact multiple of the page size does
        # not trigger a request for an empty page -- confirm this matches
        # the intended pagination.
        start = self.REVIEWS_PER_PAGE
        while len(reviews) < reviews_wanted and start < english_count:

            page_url = '{0}?start={1}'.format(base_page_url, start)
            page_html = self.web_interfacer.get_page(page_url)
            if page_html is None:
                # Keep what has been collected rather than losing it all.
                break

            reviews.extend(self.page_parser.parse(page_html).reviews)
            start += self.REVIEWS_PER_PAGE

        return restaurant_extract._replace(reviews=reviews[:self.MAX_REVIEWS])
        
        




