## Web Scraping Yelp

   For this project, I am using Yelp reviews of Restaurants around downtown Nashville.
   
   Quickly found that Yelp API does not provide more than 3 reviews for a restaurant.
   So, need to web scrape !
   

In [1]:
import requests
import re
from time import sleep
import sys
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

url='https://www.yelp.com/search?find_desc=Restaurants&find_loc=Nashville'

In [3]:
from ediblepickle import checkpoint
import os
from urllib.parse import quote
 
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)

@checkpoint(key=lambda args, kwargs: quote(str(args[0])) + '.pkl', work_dir=cache_dir)
def business_caching(start):
    print('making request...')
    params = { 'format'        :'json',
               'start'         : start}
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    result = requests.get('https://www.yelp.com/search?find_desc=Restaurants+Nashville+Tn&find_loc=Nashville', params=params, headers=headers)
    print("response code:", result.status_code)
    print('API request complete...')
    return result

In [4]:
import random, time
def get_business_from_all_pages(npages):
    for page in range(npages):
        n = round(random.uniform(1,5), 1) # random times for sleeping
        start = 10*page # each page has 10 business starting from 3rd index
        response = business_caching(start)
        print(f"caching completed for page {page}...")
        time.sleep(n)                
    return None


In [5]:
def get_business_names(response):
    """
    returns the business names
    """
    restaurants = dict()
    soup = BeautifulSoup(response.text, 'html.parser')     
    bizs = soup.find_all('a',href=True,  class_='css-1m051bw')  
    nbiz = len(bizs)
    for biz in range(5, 15):  # the business seems to start from index 5
        name = bizs[biz].get_text()
        url = bizs[biz]['href']  
        restaurants.update({name :{'url': url}})      
    return restaurants   


In [6]:
# write the restaurants and link to a file for future reference
import urllib
import urllib.request as url

cache_dir_res = 'cache_restaurants'  
if not os.path.exists(cache_dir_res):
    os.mkdir(cache_dir_res)

@checkpoint(key=lambda args, kwargs: quote(str(args[0])) + '.pkl', work_dir=cache_dir_res)
def parse_restaurant(name, myurl, start):
    print(f"making request for restaurant {url} ..")
    params = { 'format'        :'json',
               'start'         : start}
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    #result = requests.get(url, params=params, headers=headers)
    
    result = urllib.request.urlopen(myurl).read().decode('utf-8')
    #print("response code:", result.status_code)
    print(f" API request complete for page {start}... ")          
    return result


In [7]:
def get_restaurant_info(restaurants):
    """
    input=restaurants is a dictionary of dictionary
    from 
    """
    for res_name, v in restaurants.items():
        url =v['url']
        print(url, res_name)
    return None

In [8]:
def get_star_ratings(soup):
    """
    soup is the BeautifulSoup ed 
    """
    ratings = soup.find_all("div", {"class": "five-stars__09f24__mBKym"})
    rating = float(ratings[0]["aria-label"].split()[0])
    return rating

In [9]:
def get_review_count(soup):
    """
    soup is the BeautifulSoup ed 
    """
    nreviews = soup.find('span', {'class': 'css-1x9ee72'}).text.split()[0]
    return nreviews
    

In [10]:
def get_address(soup):
    """
    soup is the BeautifulSoup ed 
    """
    addr=soup.find('a', href=True, class_="css-1um3nx")
    address = addr.text
    return address

In [11]:
def get_reviews(soup):
    """
    returns the list of reviews of the souped (page)
    """
    reviews = []
    response=soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
    for div in response:
        for html_class in div.find_all('span',class_="raw__09f24__T4Ezm"):
            text = html_class.find('span')
            review = html_class.getText().strip()
            reviews.append(review)
            #print(review,"\n")
    return reviews              

In [12]:
def parse_pages(biz_name, biz_url, start=None):
    """
    url is the main first page url (with start=0).
    res_name is the name of restaurant
    
    we will get the number of pages of each restaurant from the number of reviews
    """  
    
    
    if start ==0:
        biz_name = biz_name 
    else:
        biz_name = biz_name + str(start)
    

    if start == 0:
        biz_url = 'https://www.yelp.com' + biz_url
    else:             
        biz_url = 'https://www.yelp.com' + biz_url +"&start="+ str(start)
  
    response=parse_restaurant(biz_name, biz_url, start)
    soup = BeautifulSoup(response, 'html.parser')
       
    
    nreviews = 0
    address = ''
    star_rating = 0.0
    npages = 0
    reviews = []
    
    nreviews = int(get_review_count(soup))
    address = get_address(soup)
    star_rating = get_star_ratings(soup)
    npages = int(nreviews/10) +1   
    reviews=get_reviews(soup) 
    return(nreviews, address, star_rating, reviews)
        
    #print(f"Number of reviews = {nreviews} \nAddress = {address} \nStar Rating = {star_rating} ")
    #else:
    #    reviews=get_reviews(soup)  
    #    return(reviews)
    

In [13]:
# get all businesses from all pages
# there are 24 pages in nashville restaurant
npages = 24
#get_business_from_all_pages(npages) # comment when not needed.

In [14]:
# lets make a restaurant dictionary and keep adding attributes.
# This will have all attributes related to each restaurants and will keep updating this.
restaurants = dict()
for page in range(npages):
    start = 10*page
    restaurant = get_business_names(business_caching(start))
    restaurants.update(restaurant)


In [15]:
#with open('restaurants.pkd', 'rb') as f:
#    res = dill.load(f)
#print(res)

In [43]:
#get_restaurant_info(restaurants)
import dill

#url_test = 'https://www.yelp.com'+'/biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn'
#url_test="/biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn"
#name_test = "The Stillery" 

#explore restaurants based on page. ie extract 1st page from all restaurant, 2nd page and so on

for start in range(0, 10, 10):
    
    # go over each restaurant first
    for restaurant in restaurants:
        restaurant_name = restaurant
        restaurant_url = restaurants[restaurant]['url']
        print(restaurant, restaurant_url)
        
        if start == 0: # first page, extract these attributes
            print(f"running on page: {start}")
            
            try:
                (nreviews, address, star_rating, reviews)=parse_pages(restaurant_name, restaurant_url,start)
                # add attributes to the existing main restaurnats dictionary
                
                restaurants[restaurant_name]['nreviews']= nreviews 
                restaurants[restaurant_name]['address'] = address
                restaurants[restaurant_name]['star_rating'] = star_rating  
                restaurants[restaurant_name]['reviews'] = reviews
                
            except:
                pass
            else:
                restaurants[restaurant_name]['nreviews']= 0; # placing default value as 0 reviews
                print("there was some error on first page")
        
        # make sure the start is less than number of reviews, otherwise page does not exists
        else:
            try:
                if start <= restaurants[restaurant]['nreviews']:
                    (nreviews, address, star_rating, reviews) = parse_pages(restaurant_name, restaurant_url,start)
            except:
                pass
            else:
                (nreviews, address, star_rating, reviews) = parse_pages(restaurant_name, restaurant_url,start)
                print("working on to extract restaurant info from ")           
            if restaurants[restaurant_name]['address'] not in restaurants[restaurant_name]:
                # assuming that if 'address' is not present, then other attributes are also not present
                restaurants[restaurant_name]['nreviews']= nreviews 
                restaurants[restaurant_name]['address'] = address
                restaurants[restaurant_name]['star_rating'] = star_rating  
                restaurants[restaurant_name]['reviews'] = reviews
            
            else:            
                #add the reviews in the list
                restaurants[restaurant_name]['reviews'].extend(reviews)
        
        # wait for some time before sending another reques via parse_pages
        n = round(random.uniform(10,120), 1) # random times for sleeping
        #time.sleep(n)
        print(f"sleeping for {n} seconds...")
        

# open pickle file
with open('restaurants_yelp.pkd', 'wb') as f:
    dill.dump(restaurants, f)


The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
there was some error on first page
sleeping for 77.4 seconds...
The Hampton Social - Nashville /biz/the-hampton-social-nashville-nashville-2?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 22.6 seconds...
Etch /biz/etch-nashville-4?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 52.4 seconds...
Milk and Honey Nashville /biz/milk-and-honey-nashville-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 56.5 seconds...
The Butter Milk Ranch /biz/the-butter-milk-ranch-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 86.4 seconds...
The Twelve Thirty Club /biz/honky-tonk-the-twelve-thirty-club-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 106.6 seconds...
Adele’s - Nashville /biz/adeles-nashville-nashville-4?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 111.6 seconds...
Sixty Vines /biz/sixty-vines-

sleeping for 18.6 seconds...
Xiao Bao /biz/xiao-bao-nashville-2?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 23.8 seconds...
Butchertown Hall /biz/butchertown-hall-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 66.6 seconds...
Two Hands /biz/two-hands-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 69.5 seconds...
Oak Steakhouse /biz/oak-steakhouse-nashville-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 83.2 seconds...
Jasper’s /biz/jaspers-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 71.0 seconds...
Lauter /biz/lauter-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 57.1 seconds...
Party Fowl - Nashville /biz/party-fowl-nashville-nashville-4?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 74.2 seconds...
Red Perch /biz/red-perch-nashville-6?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 60.0 seconds...
The Bridge /biz/the-br

sleeping for 59.5 seconds...
Il Forno /biz/il-forno-nashville-2?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 38.7 seconds...
Pelican & Pig /biz/pelican-and-pig-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 61.1 seconds...
Hathorne /biz/hathorne-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 112.9 seconds...
Steam Boys /biz/steam-boys-nashville-2?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 43.9 seconds...
Nicky’s Coal Fired /biz/nickys-coal-fired-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 118.2 seconds...
Assembly Food Hall /biz/assembly-food-hall-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 60.6 seconds...
The District Bar & Kitchen /biz/the-district-bar-and-kitchen-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 40.7 seconds...
Carne Mare /biz/carne-mare-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 34.4

sleeping for 38.2 seconds...
Bourbon Street Blues & Boogie Bar /biz/bourbon-street-blues-and-boogie-bar-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 72.9 seconds...
Maple Street Biscuit Company /biz/maple-street-biscuit-company-nashville-2?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 61.2 seconds...
TailGate Brewery Music Row /biz/tailgate-brewery-music-row-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 111.8 seconds...
lola /biz/lola-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 119.8 seconds...
Boston Commons /biz/boston-commons-nashville-4?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 82.5 seconds...
Whiskey River Saloon /biz/whiskey-river-saloon-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 33.9 seconds...
Voodoo Gumbo /biz/voodoo-gumbo-nashville?osq=Restaurants+Nashville+Tn
running on page: 0
sleeping for 56.0 seconds...
Bavarian Bierhaus /biz/bavarian

In [49]:
print(restaurants['The Hampton Social - Nashville'])

{'url': '/biz/the-hampton-social-nashville-nashville-2?osq=Restaurants+Nashville+Tn'}


In [None]:
import dill

#explore restaurants based so that there are at least 50 reviews per each restaurant.

#for start in range(10, 20, 10):
    
# go over each restaurant first
for restaurant in restaurants:
    for page in range(1,5):
        start = page*10
        restaurant_name = restaurant
        restaurant_url = restaurants[restaurant]['url']
        print(restaurant, restaurant_url, page)
        
        if start == 0: # first page, extract these attributes
            print(f"running on page: {start}")
            
            try:
                (nreviews, address, star_rating, reviews)=parse_pages(restaurant_name, restaurant_url,start)
                # add attributes to the existing main restaurnats dictionary
                
                restaurants[restaurant_name]['nreviews']= nreviews 
                restaurants[restaurant_name]['address'] = address
                restaurants[restaurant_name]['star_rating'] = star_rating  
                restaurants[restaurant_name]['reviews'] = reviews
                
            except:
                pass
            else:
                restaurants[restaurant_name]['nreviews']= 0; # placing default value as 0 reviews
                print("there was some error on first page")
        
        # make sure the start is less than number of reviews, otherwise page does not exists
        else:
            try:
                (nreviews, address, star_rating, reviews) = parse_pages(restaurant_name, restaurant_url,start)
                print(f"working on to extract restaurant info from page: {page}, {restaurant_name}")           
                if 'address' not in restaurants[restaurant_name]:
                    # assuming that if 'address' is not present, then other attributes are also not present
                    restaurants[restaurant_name]['nreviews']= nreviews 
                    restaurants[restaurant_name]['address'] = address
                    restaurants[restaurant_name]['star_rating'] = star_rating  
                    restaurants[restaurant_name]['reviews'] = reviews
                else:
                    #add the reviews in the list
                    restaurants[restaurant_name]['reviews'].extend(reviews)                    
            except:
                pass
            else:
                print(f"something not working in this page {page} for restaurant {restaurant_name}")       
        # wait for some time before sending another request via parse_pages
        n = round(random.uniform(10,100), 1) # random times for sleeping
        print(f"sleeping for {n} seconds...")
        time.sleep(n)
       
        

# open pickle file
with open('restaurants_yelp.pkd', 'wb') as f:
    dill.dump(restaurants, f)


The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn
working on to extract restaurant info from page: 1, The Stillery
something not working in this page 1 for restaurant The Stillery
sleeping for 80.4 seconds...
The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn
working on to extract restaurant info from page: 2, The Stillery
something not working in this page 2 for restaurant The Stillery
sleeping for 91.4 seconds...
The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn
working on to extract restaurant info from page: 3, The Stillery
something not working in this page 3 for restaurant The Stillery
sleeping for 38.0 seconds...
The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn
working on to extract restaurant info from page: 4, The Stillery
something not working in this page 4 for restaurant The Stillery
sleeping for 73.5 seconds...
The Hampton Social - Nashville /biz/the-hampton-social-nashville-nashville-2?osq=Res

sleeping for 27.2 seconds...
Hattie B’s Hot Chicken - Nashville - Midtown /biz/hattie-b-s-hot-chicken-nashville-midtown-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 74.1 seconds...
Hattie B’s Hot Chicken - Nashville - Midtown /biz/hattie-b-s-hot-chicken-nashville-midtown-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 91.9 seconds...
Hattie B’s Hot Chicken - Nashville - Midtown /biz/hattie-b-s-hot-chicken-nashville-midtown-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 43.6 seconds...
Hattie B’s Hot Chicken - Nashville - Midtown /biz/

 API request complete for page 40... 
sleeping for 27.0 seconds...
Rolf and Daughters /biz/rolf-and-daughters-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 96.6 seconds...
Rolf and Daughters /biz/rolf-and-daughters-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 14.2 seconds...
Rolf and Daughters /biz/rolf-and-daughters-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 58.2 seconds...
Rolf and Daughters /biz/rolf-and-daughters-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/li

Ole Smoky Distillery/Yee-Haw Brewing Co. /biz/ole-smoky-distillery-yee-haw-brewing-co-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 36.4 seconds...
Ole Smoky Distillery/Yee-Haw Brewing Co. /biz/ole-smoky-distillery-yee-haw-brewing-co-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 15.5 seconds...
Ole Smoky Distillery/Yee-Haw Brewing Co. /biz/ole-smoky-distillery-yee-haw-brewing-co-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 21.7 seconds...
Ole Smoky Distillery/Yee-Haw Brewing Co. /biz/ole-smoky-distillery-yee-haw-brewing-co-nashvill

 API request complete for page 30... 
sleeping for 16.2 seconds...
The Mockingbird Nashville /biz/the-mockingbird-nashville-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 72.9 seconds...
Bourbon Steak by Michael Mina /biz/bourbon-steak-by-michael-mina-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 36.5 seconds...
Bourbon Steak by Michael Mina /biz/bourbon-steak-by-michael-mina-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 93.3 seconds...
Bourbon Steak by Michael Mina /biz/bourbon-steak-by-michael-mina-nashville-2?osq=Restaurants+N

sleeping for 38.4 seconds...
Butcher & Bee /biz/butcher-and-bee-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 24.7 seconds...
Butcher & Bee /biz/butcher-and-bee-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 74.2 seconds...
The Loveless Cafe /biz/the-loveless-cafe-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 52.9 seconds...
The Loveless Cafe /biz/the-loveless-cafe-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete 

Lockeland Table /biz/lockeland-table-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 20.5 seconds...
Lockeland Table /biz/lockeland-table-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 86.1 seconds...
Pancake Pantry /biz/pancake-pantry-nashville-3?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 58.3 seconds...
Pancake Pantry /biz/pancake-pantry-nashville-3?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 56.2

 API request complete for page 30... 
sleeping for 16.1 seconds...
Graze Nashville /biz/graze-nashville-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 37.3 seconds...
NashHouse Southern Spoon and Saloon /biz/nashhouse-southern-spoon-and-saloon-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 37.2 seconds...
NashHouse Southern Spoon and Saloon /biz/nashhouse-southern-spoon-and-saloon-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 49.5 seconds...
NashHouse Southern Spoon and Saloon /biz/nashhouse-southern-spoon-and-saloon-nashville?osq=R

 API request complete for page 20... 
sleeping for 77.4 seconds...
Butchertown Hall /biz/butchertown-hall-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 67.4 seconds...
Butchertown Hall /biz/butchertown-hall-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 82.2 seconds...
Two Hands /biz/two-hands-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 62.7 seconds...
Two Hands /biz/two-hands-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API req

Black Rabbit /biz/black-rabbit-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 93.7 seconds...
Bag Lady’s Fry Joint /biz/bag-lady-s-fry-joint-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 39.4 seconds...
Bag Lady’s Fry Joint /biz/bag-lady-s-fry-joint-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 62.5 seconds...
Bag Lady’s Fry Joint /biz/bag-lady-s-fry-joint-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 

 API request complete for page 30... 
sleeping for 88.6 seconds...
City House /biz/city-house-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 47.3 seconds...
Monell’s Dining & Catering /biz/monells-dining-and-catering-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 23.0 seconds...
Monell’s Dining & Catering /biz/monells-dining-and-catering-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 90.6 seconds...
Monell’s Dining & Catering /biz/monells-dining-and-catering-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module '

 API request complete for page 40... 
sleeping for 44.2 seconds...
HiFi Clyde’s Nashville /biz/hifi-clydes-nashville-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 28.6 seconds...
HiFi Clyde’s Nashville /biz/hifi-clydes-nashville-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 95.6 seconds...
HiFi Clyde’s Nashville /biz/hifi-clydes-nashville-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 63.0 seconds...
HiFi Clyde’s Nashville /biz/hifi-clydes-nashville-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.

 API request complete for page 40... 
sleeping for 91.9 seconds...
Emery Wood Fired /biz/emery-wood-fired-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 10.9 seconds...
Emery Wood Fired /biz/emery-wood-fired-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 77.1 seconds...
Emery Wood Fired /biz/emery-wood-fired-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 54.6 seconds...
Emery Wood Fired /biz/emery-wood-fired-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/url

 API request complete for page 40... 
sleeping for 65.2 seconds...
The Cookery /biz/the-cookery-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 63.8 seconds...
The Cookery /biz/the-cookery-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 55.4 seconds...
The Cookery /biz/the-cookery-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 24.6 seconds...
The Cookery /biz/the-cookery-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complet

 API request complete for page 40... 
sleeping for 67.4 seconds...
Raising Canes Chicken /biz/raising-canes-chicken-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 64.6 seconds...
Raising Canes Chicken /biz/raising-canes-chicken-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 79.8 seconds...
Raising Canes Chicken /biz/raising-canes-chicken-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 25.4 seconds...
Raising Canes Chicken /biz/raising-canes-chicken-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.requ

Sun Diner /biz/sun-diner-nashville-2?osq=Restaurants+Nashville+Tn
sleeping for 27.5 seconds...
Sun Diner /biz/sun-diner-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 30.9 seconds...
Sun Diner /biz/sun-diner-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 83.8 seconds...
Sun Diner /biz/sun-diner-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 71.2 seconds...
Two Ten Jack /biz/two-ten-jack-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'>

 API request complete for page 10... 
sleeping for 40.4 seconds...
Il Forno /biz/il-forno-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 61.2 seconds...
Il Forno /biz/il-forno-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 12.4 seconds...
Il Forno /biz/il-forno-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 16.7 seconds...
Pelican & Pig /biz/pelican-and-pig-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for 

 API request complete for page 20... 
sleeping for 13.8 seconds...
North Italia - Nashville /biz/north-italia-nashville-nashville-3?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 83.9 seconds...
North Italia - Nashville /biz/north-italia-nashville-nashville-3?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 36.3 seconds...
Answer /biz/answer-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 91.5 seconds...
Answer /biz/answer-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/requ

 API request complete for page 20... 
sleeping for 25.2 seconds...
Henley /biz/henley-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 69.7 seconds...
Henley /biz/henley-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 13.1 seconds...
The 404 Kitchen /biz/the-404-kitchen-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 52.1 seconds...
The 404 Kitchen /biz/the-404-kitchen-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete fo

 API request complete for page 30... 
sleeping for 19.9 seconds...
Scout’s Pub /biz/scouts-pub-nashville-2?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 40... 
sleeping for 90.7 seconds...
The Loading Dock /biz/the-loading-dock-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 12.7 seconds...
The Loading Dock /biz/the-loading-dock-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 51.7 seconds...
The Loading Dock /biz/the-loading-dock-nashville?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/reque

 API request complete for page 40... 
sleeping for 94.1 seconds...
Brothers Burger Joint /biz/brothers-burger-joint-berry-hill?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 10... 
sleeping for 34.6 seconds...
Brothers Burger Joint /biz/brothers-burger-joint-berry-hill?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 20... 
sleeping for 92.2 seconds...
Brothers Burger Joint /biz/brothers-burger-joint-berry-hill?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.request' from '/usr/local/lib/python3.10/urllib/request.py'> ..
 API request complete for page 30... 
sleeping for 78.2 seconds...
Brothers Burger Joint /biz/brothers-burger-joint-berry-hill?osq=Restaurants+Nashville+Tn
making request for restaurant <module 'urllib.

In [2]:
#url_test = 'https://www.yelp.com'+'/biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn'
restaurant_url="/biz/etch-nashville-4?osq=Restaurants+Nashville+Tn"
restaurant_name = "Etch" 
start=80
parse_pages(restaurant_name, restaurant_url,start)

NameError: name 'parse_pages' is not defined

In [30]:
# go through each restaurants now and collect the relevant data:
for restaurant in restaurants:
    for page in range(1,2):
        start = page*10
        restaurant_name = restaurant
        restaurant_url = restaurants[restaurant]['url']
        print(restaurant, restaurant_url, page)
        
        if start == 0: # first page, extract these attributes
            print(f"running on page: {start}")
            
            try:
                (nreviews, address, star_rating, reviews)=parse_pages(restaurant_name, restaurant_url,start)
                # add attributes to the existing main restaurnats dictionary
                
                restaurants[restaurant_name]['nreviews']= nreviews 
                restaurants[restaurant_name]['address'] = address
                restaurants[restaurant_name]['star_rating'] = star_rating  
                restaurants[restaurant_name]['reviews'] = reviews
                
            except:
                pass
            else:
                restaurants[restaurant_name]['nreviews']= 0; # placing default value as 0 reviews
                print("there was some error on first page")
        
        # make sure the start is less than number of reviews, otherwise page does not exists
        else:
            try:
                (nreviews, address, star_rating, reviews) = parse_pages(restaurant_name, restaurant_url,start)
                print(f"working on to extract restaurant info from page: {page}, {restaurant_name}")           
                if 'address' not in restaurants[restaurant_name]:
                    # assuming that if 'address' is not present, then other attributes are also not present
                    restaurants[restaurant_name]['nreviews']= nreviews 
                    restaurants[restaurant_name]['address'] = address
                    restaurants[restaurant_name]['star_rating'] = star_rating  
                    restaurants[restaurant_name]['reviews'] = reviews
                else:
                    #add the reviews in the list
                    restaurants[restaurant_name]['reviews'].extend(reviews)                    
            except:
                pass
            else:
                print(f"something not working in this page {page} for restaurant {restaurant_name}")       
        # wait for some time before sending another request via parse_pages
        n = round(random.uniform(10,100), 1) # random times for sleeping
        print(f"sleeping for {n} seconds...")
        #time.sleep(n)
       
        

# open pickle file
with open('restaurants_yelp.pkd', 'wb') as f:
    dill.dump(restaurants, f)


The Stillery /biz/the-stillery-nashville?osq=Restaurants+Nashville+Tn 1
working on to extract restaurant info from page: 1, The Stillery
something not working in this page 1 for restaurant The Stillery
sleeping for 37.0 seconds...
The Hampton Social - Nashville /biz/the-hampton-social-nashville-nashville-2?osq=Restaurants+Nashville+Tn 1
sleeping for 60.5 seconds...
Etch /biz/etch-nashville-4?osq=Restaurants+Nashville+Tn 1
sleeping for 42.5 seconds...
Milk and Honey Nashville /biz/milk-and-honey-nashville-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 16.3 seconds...
The Butter Milk Ranch /biz/the-butter-milk-ranch-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 48.4 seconds...
The Twelve Thirty Club /biz/honky-tonk-the-twelve-thirty-club-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 16.7 seconds...
Adele’s - Nashville /biz/adeles-nashville-nashville-4?osq=Restaurants+Nashville+Tn 1
sleeping for 12.2 seconds...
Sixty Vines /biz/sixty-vines-nashville?osq=Restaurants+

sleeping for 92.3 seconds...
Bag Lady’s Fry Joint /biz/bag-lady-s-fry-joint-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 61.8 seconds...
Common Ground /biz/common-ground-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 45.9 seconds...
Jeff Ruby’s Steakhouse- Nashville /biz/jeff-rubys-steakhouse-nashville-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 35.4 seconds...
Superica /biz/superica-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 17.5 seconds...
Mas Tacos Por Favor /biz/mas-tacos-por-favor-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 73.4 seconds...
Jane’s Hideaway /biz/jane-s-hideaway-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 18.1 seconds...
The Library Restaurant & Lounge /biz/the-library-restaurant-and-lounge-nashville-3?osq=Restaurants+Nashville+Tn 1
sleeping for 33.1 seconds...
City House /biz/city-house-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 62.5 seconds...
Monell’s Dining & Catering /biz/monells-dining-and-cate

sleeping for 85.0 seconds...
Jack Brown’s Beer & Burger Joint /biz/jack-browns-beer-and-burger-joint-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 56.3 seconds...
Mangia Nashville /biz/mangia-nashville-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 87.9 seconds...
Pastaria /biz/pastaria-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 91.6 seconds...
PennePazze /biz/pennepazze-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 25.3 seconds...
Etc Restaurant /biz/etc-restaurant-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 15.7 seconds...
Mother’s Ruin /biz/mother-s-ruin-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 63.6 seconds...
Scout’s Pub /biz/scouts-pub-nashville-2?osq=Restaurants+Nashville+Tn 1
sleeping for 42.5 seconds...
The Loading Dock /biz/the-loading-dock-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 28.0 seconds...
Mofongo Cafe /biz/mofongo-cafe-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 32.7 seconds...
Gray & Dudle

sleeping for 59.6 seconds...
Waldo’s Chicken & Beer /biz/waldos-chicken-and-beer-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 73.5 seconds...
Lou|Na /biz/lou-na-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 16.8 seconds...
Green Hills Grille /biz/green-hills-grille-nashville-2?osq=Restaurants+Nashville+Tn 1
sleeping for 65.0 seconds...
Cafe Roze /biz/cafe-roze-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 19.7 seconds...
House of Cards /biz/house-of-cards-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 22.0 seconds...
Valentino’s Ristorante /biz/valentinos-ristorante-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 90.2 seconds...
Mission BBQ /biz/mission-bbq-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 42.6 seconds...
MacHenry’s Meat & Three /biz/machenrys-meat-and-three-nashville?osq=Restaurants+Nashville+Tn 1
sleeping for 97.8 seconds...
The Rutledge Downtown Nashville /biz/the-rutledge-downtown-nashville-nashville?osq=Restaurants+Nashvil

In [42]:
import dill
with open('restaurants_yelp.pkd', 'rb') as f:
    res = dill.load(f)

print({d.keys() for d in res})

AttributeError: 'str' object has no attribute 'keys'

(36.1622767, -86.7742984)
Latitude: 36.1622767
Longitude: -86.7742984


Lets find the review count, star rating , address, price for each restaurant