In [None]:
import pdb
import re
import sys
import time
from collections import namedtuple
from itertools import takewhile, count, islice
from operator import itemgetter
from pprint import pprint
from urlparse import urljoin

import numpy as np
import psycopg2
import requests
from bs4 import BeautifulSoup
from psycopg2.extras import NamedTupleConnection

import yelpUtils

## _This Notebook: Scrape reviews from complete entries in "restaurants" table, Clean, Build "reviews" table, update restaurant table (number of reviews, pricing info, review score)_

#--------------------------------------------------------------------------------------------

#### Scrape For:
1. Number of Reviews
2. Up to 50 reviews
3. Rating (distribution?)
4. Price Range

#### TODO
1. Test BeautifulSoup grabs for the scrape targets above (1-4)
2. Design db_interfacer, yelp_website_interfacer, yelp_page_parser

### Proof of Concept

In [None]:
# Proof of concept: fetch a single business page by hand and parse it.
base_yelp_biz_url = 'http://www.yelp.com/biz/'
sample_biz_id = "prova-new-york"
restaurant_url = urljoin(base_yelp_biz_url, sample_biz_id)

# Request the URL built above instead of re-assembling it by concatenation,
# so the variable that names the page is the one actually fetched.
response = requests.get(url = restaurant_url)
soup = BeautifulSoup(response.content)

In [213]:
class YelpPageParser():
    """Pull rating, review counts, price level and reviews out of one Yelp page.

    ``parse`` is the entry point. Each ``_get_*`` helper extracts one piece of
    information from the parsed page with CSS selectors and indexes the first
    match, so a page that lacks an expected element raises ``IndexError``
    (except for the price level, which degrades to ``'NULL'``).
    """

    def __init__(self):
        # Record type used to package the results of a single page parse.
        self._PageExtract = self._build_packager()

    def _build_packager(self):
        """Return the namedtuple class that holds one page's extract."""
        fields = ['avg_rating', 'total_count', 'english_count',
                  'dollar_signs', 'reviews']
        return namedtuple('Single_Page_Extract', fields)

    def parse(self, raw_html):
        """Parse a business page.

        Parameters
        ----------
        raw_html : markup of the page, passed straight to ``BeautifulSoup``.

        Returns
        -------
        Single_Page_Extract
            namedtuple with ``avg_rating``, ``total_count``, ``english_count``,
            ``dollar_signs`` and ``reviews`` (list of
            ``(date, rating, review_text)`` tuples).
        """
        soup = BeautifulSoup(raw_html)
        avg_rating = self._get_avg_rating(soup)
        total_count, english_count = self._get_review_counts(soup)
        dollar_signs = self._get_dollar_signs(soup)
        reviews = self._get_reviews(soup)

        return self._package_results(avg_rating = avg_rating,
                                     total_count = total_count,
                                     english_count = english_count,
                                     dollar_signs = dollar_signs,
                                     reviews = reviews)

    def _get_avg_rating(self, page_soup):
        """Return the ``content`` of the first header ``meta`` tag as a float."""
        selection = page_soup.select('div.biz-page-header-left meta')[0]
        return float(selection.attrs['content'])

    def _get_review_counts(self, page_soup):
        """Return ``(total_count, english_count)`` as ints.

        The total comes from the header's ``reviewCount`` span; the second
        number comes from the ``span.count`` inside the review feed filters.
        """
        selection = page_soup.select('div.biz-page-header-left span[itemprop=reviewCount]')[0]
        total_count = int(selection.text)

        selection = page_soup.select('div.feed-sorts-filters.clearfix span.count')[0]
        english_count = int(selection.text.strip())

        return (total_count, english_count)

    def _get_dollar_signs(self, page_soup):
        """Return the length of the price-range text, or ``'NULL'`` if absent.

        The length is presumably one character per dollar sign. When the
        element cannot be read the problem is reported and the string
        ``'NULL'`` is returned so the rest of the page can still be parsed.
        """
        try:
            selection = page_soup.select('div.iconed-list-avatar > span.business-attribute.price-range')[0]
            return len(selection.text)

        # ``Exception`` rather than a bare except, so Ctrl-C and interpreter
        # exit are not swallowed during a long scrape.
        except Exception:
            self._report_parse_error(page_soup, "Could not find Pricing Info. Returning 'NULL'.")
            return 'NULL'

    def _get_reviews(self, page_soup):
        """Return a list of ``(date, rating, review_text)`` for every review on the page."""
        reviews = []

        for content_soup in page_soup.select('div.review-list li div.review-content'):

            rating_tag = content_soup.select('meta[itemprop=ratingValue]')[0]
            rating = float(rating_tag.attrs['content'])

            date_tag = content_soup.select('meta[itemprop=datePublished]')[0]
            date = date_tag.attrs['content']

            # Separate name: the original rebound the loop variable here.
            text_tag = content_soup.select('p[itemprop=description]')[0]
            review_text = text_tag.text

            reviews.append((date, rating, review_text))

        return reviews

    def _package_results(self, **kwargs):
        """Build a Single_Page_Extract from keyword arguments."""
        return self._PageExtract(**kwargs)

    def _report_parse_error(self, soup, *args):
        """Print a banner naming the business, followed by each message in ``args``.

        Reporting is best-effort: if the page has no ``h1.biz-page-title`` a
        placeholder name is used instead of raising from inside the handler.
        """
        title_tags = soup.select('h1.biz-page-title')
        # Tag.text is already a unicode string in BeautifulSoup 4, so no
        # explicit conversion is needed before formatting.
        name = title_tags[0].text.strip() if title_tags else u'<unknown>'

        print('************************')
        print(u'Encountered Parse Error for the {0} business.'.format(name))
        print('************************')
        for arg in args:
            # Print each message itself, not the whole args tuple.
            print(arg)
        print('************************')
        sys.stdout.flush()
    
    

In [None]:
# Fetch the sample business page that the parser cells below work on.
base_yelp_biz_url = 'http://www.yelp.com/biz/'
sample_biz_id = "prova-new-york"
sample_restaurant_url = urljoin(base_yelp_biz_url, sample_biz_id)

sample_response = requests.get(url = sample_restaurant_url)
# Use the response fetched in this cell; the original read `response`, a
# leftover from the proof-of-concept cell, which breaks on a fresh kernel
# if that cell is skipped.
sample_html = sample_response.content

In [None]:
# One parser instance is enough: its only state is the namedtuple type it builds in __init__.
ypp = YelpPageParser()

In [None]:
# Parse the sample page's HTML into a Single_Page_Extract namedtuple.
page_extract = ypp.parse(sample_html)

In [None]:
# Eyeball the parsed fields: the scalar values first, then a peek at the reviews.
for field_name in ('avg_rating', 'dollar_signs', 'english_count'):
    print(getattr(page_extract, field_name))
print(page_extract.reviews[0])
print(len(page_extract.reviews))
print(page_extract.total_count)

In [214]:
class YelpWebsiteInterfacer():
    """Fetch pages with ``requests.get``, retrying when the connection fails."""

    def __init__(self, max_get_attempts = 3):
        """
        Parameters
        ----------
        max_get_attempts : total number of GET attempts made per URL before
            giving up (the first try counts). Defaults to 3.
        """
        self.MAX_GET_ATTEMPTS = max_get_attempts

    def get_page(self, url, params = None):
        """Return the body of ``url``, or ``None`` if every attempt failed.

        Parameters
        ----------
        url : address to fetch.
        params : optional dict of query-string parameters passed on to
            ``requests.get``. ``None`` (the default) sends none; a mutable
            ``{}`` default is avoided so no dict is shared between calls.

        Only ``requests.ConnectionError`` triggers a retry; any other
        exception propagates to the caller.
        """
        for attempt in range(1, self.MAX_GET_ATTEMPTS + 1):

            try:
                return requests.get(url, params = params).content

            except requests.ConnectionError:
                self._report_connection_error(url, attempt)

        # Every attempt raised a connection error.
        return None

    def _report_connection_error(self, url, k):
        """Print which attempt (``k``) failed for ``url`` and whether a retry follows."""
        print('************************')
        print("Connection Error encountered on attempt number {0}.".format(k))
        print("URL:")
        print(url)

        if k < self.MAX_GET_ATTEMPTS:
            print("Attempting to get page again...")
        else:
            print("Will not attempt to get this page anymore.")

        print('************************')
        

In [None]:
# Smoke test: fetch the sample business page through the retrying interfacer.
# page_html is None if every attempt hit a connection error.
ywi = YelpWebsiteInterfacer()
page_html = ywi.get_page(sample_restaurant_url)

In [None]:
# **YelpPaginator**
# init(number of reviews)
# (YelpWebsiteInterfacer)
# pull(base_url)
# [from first page:
# get (reviews: (date, rating, review_text), overall_rating, price info, english reviews count)
# for i = 1 to max i s.t. 40*i <= # of english reviews
# pull page ?start=40*i
# return list of all page contents
# ]

In [215]:
class YelpPaginator():
    """Pull a business's first page plus follow-up review pages and merge the reviews."""

    def __init__(self, max_page_pulls = 4):
        """
        Parameters
        ----------
        max_page_pulls : upper bound on pages fetched per business, counting
            the first page.
        """
        self.max_page_pulls = max_page_pulls
        self.REVIEWS_PER_PAGE = 40  # step of the ?start= offset between pages
        self.BASE_YELP_BIZ_URL = 'http://www.yelp.com/biz/'
        self.web_interfacer = YelpWebsiteInterfacer()
        self.page_parser = YelpPageParser()

    def pull_business(self, yelp_biz_id):
        """Return the merged extract for the business with the given Yelp id."""
        base_page_url = urljoin(self.BASE_YELP_BIZ_URL, yelp_biz_id)
        return self._paginate_pull(base_page_url)

    def _paginate_pull(self, base_page_url):
        """Fetch and parse ``base_page_url`` and its follow-up review pages.

        The first page supplies the returned extract (rating, counts, price);
        reviews from later pages are appended to its ``reviews`` list in page
        order. Follow-up pages are requested with ``?start=<offset>`` while
        the offset is below the first page's ``english_count`` and fewer than
        ``max_page_pulls`` pages have been pulled.

        A follow-up page that could not be fetched (the interfacer returned
        ``None``) is skipped instead of being handed to the parser, so one
        bad page no longer discards the reviews already collected. A failed
        *first* page is still passed to the parser unchanged, and whatever
        the parser raises propagates as before.
        """
        page_html = self.web_interfacer.get_page(base_page_url)
        restaurant_extract = self.page_parser.parse(page_html)

        english_review_count = restaurant_extract.english_count

        for page_index in count(1):

            offset = page_index * self.REVIEWS_PER_PAGE
            if offset >= english_review_count or page_index >= self.max_page_pulls:
                break

            page_html = self.web_interfacer.get_page(base_page_url,
                                                     params = {'start': offset})
            if page_html is None:
                # All retries failed for this page; keep what we have and move on.
                continue

            page_extract = self.page_parser.parse(page_html)
            restaurant_extract.reviews.extend(page_extract.reviews)

        return restaurant_extract
            

In [None]:
# Default paginator: at most 4 pages (first page included) per business.
yp = YelpPaginator()

In [None]:
# Pull the sample business end to end: first page plus follow-up review pages.
extract = yp.pull_business(sample_biz_id)

In [None]:
# Pretty-print the merged extract (pprint is imported with the other imports at the top).
pprint(extract)

In [None]:
# Spot-check the merged extract: the scalar fields, then the last review and the review count.
for field_name in ('avg_rating', 'english_count', 'dollar_signs'):
    print(getattr(extract, field_name))
print(extract.reviews[-1])
print(len(extract.reviews))

In [None]:
# **YelpScrapeCoordinator**

# init
# (pull all (camis, yelp_id) - sorted by camis)
# (set called pulled camis?)
# (YelpPaginator)
# cur
# _open_conn
# _close_conn
# _get_ids
# ((camis, yelp_id) for init)

# pull_all()
# (pull all yelp_ids)
# (record)
# pull_n(from, to)
# (record)
# _record

In [217]:
class YelpScrapeCoordinator():
    """Walk the ``restaurants`` table, scrape each business and store its reviews."""

    def __init__(self, start = 0, max_page_pulls = 2):
        """
        Parameters
        ----------
        start : index into the camis-ordered restaurant list at which to begin.
            When 0, ``build_table`` is called on the reviews table first.
        max_page_pulls : forwarded to ``YelpPaginator``.
        """
        # Defined before connecting so _close_conn is safe even if
        # _open_conn raises; otherwise the cleanup would mask the real error.
        self.conn = None
        self.c = None

        self._paginator = YelpPaginator(max_page_pulls=max_page_pulls)
        self._reviews_table = ReviewsTableBuilder()
        if start == 0:
            self._reviews_table.build_table()

        # The connection is only needed to read the id list; close it right after.
        try:
            self._open_conn()
            self._restaurant_iter = iter(self._get_ids()[start:])
        finally:
            self._close_conn()

    def pull_all_restaurant_pages(self):
        """Placeholder; not implemented (currently does nothing)."""
        pass

    def pull_n_restaurant_pages(self, n):
        """Scrape the next ``n`` restaurants from the iterator and record their reviews.

        Progress is printed for every restaurant id and, in more detail, for
        every fifth one. All extracts are recorded in a single call after the
        last pull, so an exception part-way through leaves nothing recorded.
        """
        extracts = []
        for i, r_id in enumerate(islice(self._restaurant_iter, n)):

            yelp_id = r_id.yelp_id
            print(yelp_id)
            verbose = i % 5 == 0
            if verbose:
                print("Pulling restaurant number {0}...".format(i))
                sys.stdout.flush()

            extract = self._paginator.pull_business(yelp_id)
            extracts.append((yelp_id, extract))
            if verbose:
                print("Pull {0} complete.".format(i))
                sys.stdout.flush()

        self._record(extracts)

    def _get_ids(self):
        """Return all ``(camis, yelp_id)`` rows from ``restaurants``, ordered by camis."""
        q = '''
        SELECT camis, yelp_id
        FROM restaurants
        ORDER BY camis ASC;
        '''
        self.c.execute(q)
        return self.c.fetchall()

    def _record(self, extracts):
        """Hand a list of ``(yelp_id, extract)`` pairs to the reviews table."""
        self._reviews_table.add_records(extracts)

    def _open_conn(self):
        """Connect to the ``yelp`` database with a named-tuple cursor."""
        self.conn = psycopg2.connect("dbname=yelp", cursor_factory=psycopg2.extras.NamedTupleCursor)
        self.c = self.conn.cursor()

    def _close_conn(self):
        """Close the connection if one is open; safe to call when none exists."""
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
        self.conn = None
    

In [None]:
class ReviewsTableBuilder(yelpUtils.TableBuilder):
    """Create and fill the ``reviews`` table (yelp_id, date, rating, review).

    Relies on ``self.c`` (cursor) and ``self.conn`` (connection), which are
    presumably provided by ``yelpUtils.TableBuilder`` - confirm against that class.
    """

    def __init__(self):
        yelpUtils.TableBuilder.__init__(self)

    def _create_table(self):
        """Drop any existing ``reviews`` table and create an empty one."""
        self.c.execute("DROP TABLE IF EXISTS reviews")
        q = '''
        CREATE TABLE reviews (
        yelp_id varchar(80),
        date date,
        rating real,
        review varchar(5000)
        )
        '''
        self.c.execute(q)
        self.conn.commit()

    def _table_exists(self):
        """Return True if ``information_schema.tables`` lists a table named ``reviews``."""
        self.c.execute("select exists(select * from information_schema.tables where table_name='reviews');")
        return self.c.fetchone()[0]

    def _add_records(self, restaurant_extracts):
        """Insert every review of every extract, then commit once.

        Parameters
        ----------
        restaurant_extracts : a ``(yelp_id, extract)`` pair or a list of such
            pairs, where ``extract.reviews`` is a list of
            ``(date, rating, review_text)`` tuples and ``date`` is a
            ``YYYY-MM-DD`` string.

        Values are passed to the driver as query parameters instead of being
        formatted into the SQL text: review text is scraped from the web and
        may contain quotes, backslashes or anything else, and the driver's
        own quoting is the reliable way to handle that.
        """
        if not self._table_exists():
            self._create_table()

        q = u'''INSERT INTO reviews
        (yelp_id, date, rating, review)
        VALUES (%s, to_date(%s,'YYYY-MM-DD'), %s, %s);
        '''

        if not isinstance(restaurant_extracts, list):
            restaurant_extracts = [restaurant_extracts]

        for yelp_id, pages_extract in restaurant_extracts:
            for date, rating, text in pages_extract.reviews:
                # No manual escaping here: the driver quotes each parameter.
                self.c.execute(q, (yelp_id, date, rating, text))

        self.conn.commit()

    def _psql_safe_text(self, text):
        """Double single quotes in ``text``.

        Kept for backward compatibility; ``_add_records`` no longer needs it
        because it uses query parameters.
        """
        return text.replace("'", "''")
        
        

In [211]:
%%timeit -n 1
ysc = YelpScrapeCoordinator(max_page_pulls = 2)
ysc.pull_n_restaurant_pages(200)


corner-bistro-new-york
nostrand-donut-shop-brooklyn
dorrians-red-hand-new-york
la-grenouille-new-york
como-pizza-new-york
Pulling restaurant number 75...
Pull number 75 complete.
keats-restaurant-new-york-5
angelos-new-york-2
panchitos-mexican-restaurant-and-cantina-new-york
colandrea-new-corner-restaurant-brooklyn-2
princeton-club-new-york
Pulling restaurant number 80...
************************
Encountered Parse Error for the 
        Princeton Club
     business.
************************
("Could not find Pricing Info. Returning 'NULL'.",)
************************
Pull number 80 complete.
arturos-new-york
le-perigord-new-york
joe-and-pats-staten-island
spain-restaurant-new-york
palace-cafe-brooklyn
Pulling restaurant number 85...
Pull number 85 complete.
yankee-tavern-bronx
the-donut-pub-new-york
reben-luncheonette-brooklyn
v-and-t-pizzeria-and-restaurant-new-york
king-yum-fresh-meadows
Pulling restaurant number 90...
Pull number 90 complete.
jahns-jackson-heights
j-and-v-pizzeria-b

In [None]:
# Scratch check of the business-title selector: the same h1.biz-page-title
# element that YelpPageParser._report_parse_error reads, narrowed by itemprop=name.
soup = BeautifulSoup(sample_html)
soup.select('h1.biz-page-title[itemprop=name]')