# Scraping Script

Paul Lim

05/19/2017

## Libraries

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import requests
from bs4 import BeautifulSoup
import time

from fake_useragent import UserAgent

## List of Functions

In [122]:
def scrape_biz_links(url_template, start, end, delay=5):
    biz_name_list = []
    biz_link_list = []
    ua = UserAgent()
    
    count = 0
    for i in range(start, end+1, 10):
        url = url_template.format(num=i)
        user_agent = {'User-agent': ua.random}
        
        try:
            link = requests.get(url, headers=user_agent)
        except:
            print(url)
            print('URL does not work.')
            
        page = link.text
        soup = BeautifulSoup(page, 'html5lib')
        
        all_items = soup.find_all('span', {'class': 'indexed-biz-name'})
        
        if all_items:
            for item in all_items:
                name = item.getText()
                name2 = re.sub('[0-9]+\.', '', name)
                name_clean = re.sub('\s+', '', name2)

                biz_link = item.find('a', href=True).get('href')
                biz_link2 = 'https://www.yelp.com' + biz_link
                
                if not name_clean:
                    print(url)
                    print('No name of business found')
                    name_clean = 'FILLER'
                elif not biz_link2:
                    print(url)
                    print('No link to business found')
                    continue
                    
                biz_name_list.append(name_clean)
                biz_link_list.append(biz_link2)
        else:
            print(url)
            print('Could not find indexed-biz-name')
            
        time.sleep(delay + 2*np.random.rand())
        
        print(count*10)
        count += 1
    
    return biz_name_list, biz_link_list

def scrape_biz_reviews(url_template, start, end, delay=5):
    biz_rating_list = []
    biz_review_list = []
    ua = UserAgent()
    
    count = 0
    for i in range(start, end+1, 20):
        url = url_template.format(num=i)
        user_agent = {'User-agent': ua.random}
        
        try:
            link = requests.get(url, headers=user_agent)
        except:
            print(url)
            print('URL does not work.')
            
        page = link.text
        soup = BeautifulSoup(page, 'html5lib')
        
        all_items = soup.find_all('div', {'class': 'review-content'})
        
        if all_items:
            for item in all_items:
                rating = item.find('div', {'class': 'i-stars'})['title']
                rating2 = re.sub('\D', '', rating)
                rating_clean = re.sub('0', '', rating2)

                review = item.find('p').getText()
                
                if not rating_clean:
                    print(url)
                    print('No rating found')
                    rating_clean = '-1'
                elif not review:
                    print(url)
                    print('No review found')
                    continue
                
                biz_rating_list.append(rating_clean)
                biz_review_list.append(review)
        else:
            print(url)
            print('Could not find review-content')
            
        time.sleep(delay + 2*np.random.rand())
        
        print(count)
        count += 1
    
    return biz_rating_list, biz_review_list

## Scrape Links

### Start with one page

In [2]:
url_ex = 'https://www.yelp.com/search?find_desc=Caf%C3%A9s+%26+Coffee+Shops&find_loc=San+Francisco%2C+CA&ns=1'

link_ex = requests.get(url_ex)
page_ex = link_ex.text
soup_ex = BeautifulSoup(page_ex, 'html5lib')

#### Get the name of business

In [34]:
ex_biz_name = soup_ex.find('span', {'class': 'indexed-biz-name'}).getText()
ex_biz_name2 = re.sub('[0-9]+\.', '', ex_biz_name)
ex_biz_name_clean = re.sub('\s+', '', ex_biz_name2)

#### Get the link to the business

In [37]:
ex_link_name = soup_ex.find('span', {'class': 'indexed-biz-name'}).find('a', href=True).get('href')
ex_link_name2 = 'https://www.yelp.com' + ex_link_name

#### Get all links and business names for one page

In [39]:
one_page = soup_ex.find_all('span', {'class': 'indexed-biz-name'})
ex_biz_name_list = []
ex_biz_link_list = []

for item in one_page:
    ex_name = item.getText()
    ex_name2 = re.sub('[0-9]+\.', '', ex_name)
    ex_name_clean = re.sub('\s+', '', ex_name2)
    ex_biz_name_list.append(ex_name_clean)
    
    ex_link = item.find('a', href=True).get('href')
    ex_link2 = 'https://www.yelp.com' + ex_link
    ex_biz_link_list.append(ex_link2)

### Generalize to any number of pages

#### Try function on two pages

In [116]:
url_template_2 = 'https://www.yelp.com/search?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start={num}'
name_2, link_2 = scrape_biz_links(url_template_2, 0, 20)
list(zip(name_2, link_2))

0
10
20


[('FourBarrelCoffee',
  'https://www.yelp.com/biz/four-barrel-coffee-san-francisco?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('Home',
  'https://www.yelp.com/biz/home-san-francisco-30?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('Rise&Grind',
  'https://www.yelp.com/biz/rise-and-grind-san-francisco-2?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('SaltrootCafé',
  'https://www.yelp.com/biz/saltroot-caf%C3%A9-san-francisco-4?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('SightglassCoffee',
  'https://www.yelp.com/biz/sightglass-coffee-san-francisco?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('SaintFrankCoffee',
  'https://www.yelp.com/biz/saint-frank-coffee-san-francisco-2?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('RéveilleCoffeeCo.',
  'https://www.yelp.com/biz/r%C3%A9veille-coffee-co-san-francisco-6?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('CoffeeShop',
  'https://www.yelp.com/biz/coffeeshop-san-francisco?osq=Caf%C3%A9s+%26+Coffee+Shops'),
 ('GreenhouseCafe',
  'https://www.yelp.com/biz/greenhouse-cafe-san-francisco-2?osq=Caf%C3%A9s

#### Test case for broken link

In [117]:
url_template_broken = 'https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start={num}'
name_b, link_b = scrape_biz_links(url_template_broken, 0, 20)
list(zip(name_b, link_b))

https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=0
Could not find indexed-biz-name
0
https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=10
Could not find indexed-biz-name
10
https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=20
Could not find indexed-biz-name
20


[]

## Scrape Reviews

### Start with one page

In [75]:
url_att = 'https://www.yelp.com/biz/r%C3%A9veille-coffee-co-san-francisco-6?osq=Caf%C3%A9s+%26+Coffee+Shops'

link_att = requests.get(url_att)
page_att = link_att.text
soup_att = BeautifulSoup(page_att, 'html5lib')

#### Get the rating value

In [94]:
container_att = soup_att.find('div', {'class': 'review-content'})
rating_att = container_att.find('div', {'class': 'i-stars'})['title']
rating_act_att = re.sub('\D', '', rating_att)

# Remove the trailing zero since all ratings can be an integer between 0-5.
rating_act_att2 = re.sub('0', '', rating_act_att)

#### Get the text review

In [97]:
review_att = container_att.find('p').getText()

#### Get all ratings and text reviews for one page

In [101]:
all_containers_att = soup_att.find_all('div', {'class': 'review-content'})
ratings_list_att = []
reviews_list_att = []

for container in all_containers_att:
    ex_rating = container.find('div', {'class': 'i-stars'})['title']
    ex_rating2 = re.sub('\D', '', ex_rating)
    ex_rating_clean = re.sub('0', '', ex_rating2)
    
    ex_review = container.find('p').getText()
    
    ratings_list_att.append(ex_rating_clean)
    reviews_list_att.append(ex_review)

### Generalize to any number of pages

#### Try function on two pages

In [120]:
url_template_3 = 'https://www.yelp.com/biz/r%C3%A9veille-coffee-co-san-francisco-6?start={num}'
name_3, link_3 = scrape_biz_reviews(url_template_3, 0, 40)
list(zip(name_3, link_3))

0
1
2


[('5',
  "love this spot. GREAT coffee and good food. and an outdoor area!and the staff is 99% great as well. they get 5 stars for that, but there is one french girl who is a total buzz kill. like she's too cool to smile or talk to anyone. she's such a downer that if i go in and she's working i actually LEAVE or wait for someone else to take a turn at the register."),
 ('4',
  "This is one of the Reveille locations, which is located in Castro. I've been to the other location in the Dogpatch area and really liked the vanilla latte and the ambience at that location. Here at the Castro location, it is a lot smaller and dimmer. The venue itself is more crowded. There is seating outside which looked pretty cute but inside there's bar seating in the center and table seating lining one of the walls. I ordered an iced latte and it came in a very cute glass cup. The coffee itself wasn't very strong and the flavor got diluted by the ice very quickly. You can ask for the wifi password, and the in

#### Test case for broken link

In [121]:
url_template_broken2 = 'https://www.yelp.com/biz/r%C9veille-coffee-co-san-francisco-6?start={num}'
name_b2, link_b2 = scrape_biz_reviews(url_template_broken, 0, 40)
list(zip(name_b2, link_b2))

https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=0
Could not find review-content
0
https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=20
Could not find review-content
1
https://www.yelp.com/seh?find_desc=Cafés+%26+Coffee+Shops&find_loc=San+Francisco,+CA&start=40
Could not find review-content
2


[]

## Scrape the page link template for each business
### The url for the initial call to the business' yelp profile is not the same as when you go further into their review pages

### Start with one business

In [124]:
url_biz = 'https://www.yelp.com/biz/r%C3%A9veille-coffee-co-san-francisco-6?osq=Caf%C3%A9s+%26+Coffee+Shops'

link_biz = requests.get(url_biz)
page_biz = link_biz.text
soup_biz = BeautifulSoup(page_biz, 'html5lib')

#### Get the page link template

In [127]:
link_class = soup_biz.find('div', {'class': 'arrange_unit page-option'})
link_temp = link_class.find('a', href=True)['href']

### Incorporate into the review scraping function

In [None]:
def scrape_biz_reviews_complete(biz_url, start, end, delay=5):
    biz_rating_list = []
    biz_review_list = []
    ua = UserAgent()
    
    count = 0
    
    biz_url_list = []
    
    init_link = requests.get(biz_url)
    init_page = init_link.text
    init_soup = BeautifulSoup(init_page, 'html5lib')
    
    link_list = init_soup.find_all('div', {'class:' ''})
    
    for i in range(start, end+1, 20):
        url = biz_url
        user_agent = {'User-agent': ua.random}
        
        try:
            link = requests.get(url, headers=user_agent)
        except:
            print(url)
            print('URL does not work.')
            
        page = link.text
        soup = BeautifulSoup(page, 'html5lib')
        
        
        
        all_items = soup.find_all('div', {'class': 'review-content'})
        
        if all_items:
            for item in all_items:
                rating = item.find('div', {'class': 'i-stars'})['title']
                rating2 = re.sub('\D', '', rating)
                rating_clean = re.sub('0', '', rating2)

                review = item.find('p').getText()
                
                if not rating_clean:
                    print(url)
                    print('No rating found')
                    rating_clean = '-1'
                elif not review:
                    print(url)
                    print('No review found')
                    continue
                
                biz_rating_list.append(rating_clean)
                biz_review_list.append(review)
        else:
            print(url)
            print('Could not find review-content')
            
        time.sleep(delay + 2*np.random.rand())
        
        print(count)
        count += 1
    
    return biz_rating_list, biz_review_list