In [1]:
# scraping libraries to read html code
import requests
from bs4 import BeautifulSoup
import json

In [2]:
# pandas library to create dataframe
# re for match
import pandas as pd
import numpy as np
import re

In [3]:
# Direct to initial results
url = 'https://www.yelp.com/search?find_desc='
keyword = input('What do you want to eat ? ')
address = input('Where are you ? ')

url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address 
url

What do you want to eat ? pizza
Where are you ? New York


'https://www.yelp.com/search?find_desc=pizza&find_loc=New York'

In [4]:
# Checking if the url is "valid", the status code should be 200
response = requests.get(url)
response.status_code

200

In [5]:
# extract the html code from the web page = https://www.yelp.com/search?find_desc=Restaurants&find_loc=New+York,+NY&start=30
results_page = BeautifulSoup(response.content,'lxml')

In [6]:
# creating a list of businesses with the html code of the business as elements of the list
businesses = results_page.find_all('a',class_ = "biz-name js-analytics-click")

In [7]:
# selecting the first element (a particular restaurant in our case) of the list businesses and looking in the
#ignore the first element of the businesses list, it is an ad by yelp, regular list start from the second (index = 1)
# html code for its name
business = businesses[1]
name = business.find('span').get_text()

# getting the link to the yelp page this particular restaurant
link = 'https://www.yelp.com' + business.get('href')

In [8]:
print(name,link)

Juliana’s Pizza https://www.yelp.com/biz/julianas-pizza-brooklyn-5?osq=pizza


In [9]:
# extracting the html code of the web page associated with the first business of our list businesses
response_business = requests.get(link)
results_business_page = BeautifulSoup(response_business.content,'lxml')

In [10]:
# looking in the html page of the restaurant for latitude and longitude coordinates of the restaurant
# the json object json_map contains these coordinates
json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')

# we can convert this json object to a python dictionnary using the following method
dict_map = json.loads(json_map)

# the relevant keys in this dict are "markers", "location" and "latitude" and "longitude"
latitude = dict_map["markers"][1]["location"]["latitude"]
longitude = dict_map["markers"][1]["location"]["longitude"]

In [11]:
json_map

'{"serviceAreas": [], "moMapPossible": true, "scrollwheelZoom": false, "zoomControlPosition": "top_right", "minZoomlevel": null, "isFullBleed": false, "maxZoomlevel": null, "zoom": 15, "library": "google", "fitToGeobox": false, "hoods": [], "adPinColor": null, "markers": [{"location": null, "key": "directions_marker", "icon": {"name": "directions", "anchorOffset": [12, 32], "activeOrigin": [0, 0], "scaledSize": [24, 32], "regularUri": "https://s3-media4.fl.yelpcdn.com/assets/srv0/yelp_maps/79f63ebc20db/assets/img/directions@2x.png", "size": [24, 32], "activeUri": "https://s3-media4.fl.yelpcdn.com/assets/srv0/yelp_maps/7249ab345ac8/assets/img/directions_highlighted@2x.png", "regularOrigin": [0, 0]}}, {"resourceType": "business", "url": "/biz/julianas-pizza-brooklyn-5", "resourceId": "ysqgdbSrezXgVwER2kQWKA", "shouldOpenInNewTab": false, "location": {"latitude": 40.7026153030093, "longitude": -73.9934159993549}, "key": "starred_business", "hovercardId": "35vhJmMTZyntSL3LHb0Ebw", "icon": 

In [12]:
print(latitude,longitude)

40.7026153030093 -73.9934159993549


In [13]:
# The category can be found in the 'a' tag under "span", class_ = 'category-str-list' tag, 
category= results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()

In [14]:
category

'Pizza'

In [15]:
# the price range ('$','$$','$$$', or '$$$$') can be found in the "span", class_ = 'business-attribute price-range' tag
# the price range is not always available, that is why we need to catch errors 
# if necessary (hence the try: except) structure

try:
    price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
except:
    price_range = None

In [16]:
print(price_range)

$$


In [17]:
# the ratings associated to a particular restaurant are in the "div", itemprop = 'aggregateRating' tag
# the rating value (between 0 and 5) can be accessed through ratings (= results_business_page.find("div", 
# itemprop = 'aggregateRating') at the tag "meta" and the attribute 'content'
# the number of reviews is available through ratings at the 'span' tag

ratings = results_business_page.find("div", itemprop = 'aggregateRating')
rating_value = ratings.find("meta").get('content')
review_count = ratings.find("span").get_text()

In [18]:
print(rating_value,review_count)

4.5 1844


In [19]:
# the phone number can be accessed at the tag 'span',itemprop = "telephone"
# phone numbers are not always available so we need to catch errors using the try:... except: structure

try:
    phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
except:
    phone_number = None   

In [20]:
print(phone_number)

(718) 596-6700


In [21]:
# We want to know if the restaurant is currently open: this information is 
# given at the 'span', class_ = "nowrap extra open" tag. This information is not always 
# available that is why we use the try: ... except:... structure

try:
    open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
except:
    open_ = None

In [22]:
print(open_)

None


In [23]:
try:
    addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
except:
    addr = None
addr

'19 Old Fulton StBrooklyn, NY 11201'

In [24]:
try:
    web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
except:
    web = None
web

'julianaspizza.com'

In [25]:
#want to put all infomations we want to get from one business into one funtion
#input: specific restaurant url link
#output: all details we tested above
def get_info(link):
    response_business = requests.get(link)
    results_business_page = BeautifulSoup(response_business.content,'lxml')
    
    try:
        addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
    except:
        addr = None

    try:
        json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')
        dict_map = json.loads(json_map)
        lat = dict_map["markers"][1]["location"]["latitude"]
        lon = dict_map["markers"][1]["location"]["longitude"]
    except: 
        json_map = None 

    try: 
        category = results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()
    except:
        category = None
        
    try:
        price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
    except:
        price_range = None
        
    try: 
        rating_value = results_business_page.find("div", {'itemprop' : "aggregateRating"}).find("meta").get('content')
    except: 
        rating_value = None
          
    try:
        phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
    except:
        phone_number = None
        
    try:
        open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
    except:
        open_ = 'closed now'
        
    try:
        web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
    except:
        web = None
    
    try:
        moreinfo = results_business_page.find_all('dl')
        delivery = "No"
        for i in moreinfo[1:]:
            try:
                attr = i.find('dt',{'class':"attribute-key"}).get_text().strip()
                yon = i.find('dd').get_text().strip()
                if attr == 'Delivery':
                    delivery = yon
                    break
            except:
                delivery = delivery

    except:
        delivery = None
        
    return(addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon)
    

In [26]:
#Get a list of url links of all businesses (or maybe the first 30)
#so that we could compile a list of details on, for example, the first 30 search-result restaurants
link_list = []

for i in range(1,len(businesses)):
    business = businesses[i]
    link = 'https://www.yelp.com' + business.get('href')
    link_list.append(link)

In [27]:
#Create a list of restaurant names
name_list=[]
for i in range(1,len(businesses)):
    business = businesses[i]
    name = business.find('span').get_text()
    name_list.append(name)

In [29]:
#Create a list of all restaurants' information
info_list=[]

for i in range(len(link_list)):
    info_list.append(get_info(link_list[i]))

In [30]:
def get_businesses():
    keyword = input('What do you want to eat ? ')
    address = input('Where are you ? ')
    limit = input('How many results do you want? ')
    filters = input('Do you want to sort by \n 1)price low to high  \n 2)price high to low  \n 3)ratings? ')
    
    #first page - figure out numbers of results per page
    url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address 
    response = requests.get(url)
    try:
        results_page = BeautifulSoup(response.content,'lxml')
    except:
        print("we didn't get back result page from yelp correctly")
    noperpage = results_page.find('span',{'class':'pagination-results-window'}).get_text().strip()[10:12]
    noperpage = int(noperpage)
    businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
    
    #start loop from the second page to get enough number of restaurants
    loop_range = (int(limit)-1)//noperpage
    for pagination in range(1, loop_range+1):
        url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address + '&start=' + str(pagination*noperpage)
        response = requests.get(url)
        try:
            results_page = BeautifulSoup(response.content,'lxml')
        except:
            print("we didn't get back result page from yelp correctly")
        
        businesses.extend(results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:])
    businesses = businesses[:int(limit)]
    
    #generate details by get_info and format in dataframe
    business_list = []
    for business in businesses:
        name = business.find('span').get_text()
        link = 'https://www.yelp.com' + business.get('href')
        addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon= get_info(link)
        business_list.append([name,addr,open_,price_range,rating_value,category,phone_number,web,delivery,lat,lon])
    df = pd.DataFrame(business_list[:int(limit)+1], columns = ['name','address','open or closed','price range','rating value','category','phone','website','delivery service','lat','lon'])
    
    #sortby...
    pattern = r'rat'
    match = bool(re.search(pattern, filters))
    if match or filters == '3':
        df = df.sort_values(by='rating value',ascending=False)
    
    elif filters == 'price low to high' or filters == '1':
        df['length'] = df['price range'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=True)
        df = df.drop('length', axis=1)
        
    elif filters == 'price high to low' or filters == '2':
        df['length'] = df['price range'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=False)
        df = df.drop('length', axis=1)
    
    return(df)

In [31]:
res = get_businesses()

What do you want to eat ? pizza
Where are you ? New York
How many results do you want? 5
Do you want to sort by 
 1)price low to high  
 2)price high to low  
 3)ratings? price low to high


In [34]:
# print(filters)
# print(re.search(pattern, filters))
res

Unnamed: 0,name,address,open or closed,price range,rating value,category,phone,website,delivery service,lat,lon
3,Prince Street Pizza,"27 Prince StNew York, NY 10012",Open now,$,4.5,Pizza,(212) 966-4100,princestpizzanewyork.com,Yes,40.722909,-73.994486
4,Brooklyn Pizza Masters,"1055 1st AveNew York, NY 10022",closed now,$,4.5,Pizza,(646) 669-7757,,Yes,40.758655,-73.963018
0,Juliana’s Pizza,"19 Old Fulton StBrooklyn, NY 11201",closed now,$$,4.5,Pizza,(718) 596-6700,julianaspizza.com,No,40.702615,-73.993416
1,Macoletta,"28-15 24th AveAstoria, NY 11102",closed now,$$,4.5,Pizza,(718) 777-4992,macoletta.com,Yes,40.773242,-73.916415
2,Lombardi’s Pizza,"32 Spring StNew York, NY 10012",closed now,$$,4.0,Pizza,(212) 941-7994,firstpizza.com,Yes,40.721637,-73.995721


In [35]:
import folium
m = folium.Map(location=[40.8075355, -73.9625727],zoom_start=14)

In [36]:
# a marker is created for any restaurant: red for critical situation and orange for not critical
for k in range(len(res)):
    folium.Marker([res['lat'].iloc[k],res['lon'].iloc[k]],popup=res['name'].iloc[k]).add_to(m)
m

In [65]:
#workspace with saved results page
url = 'https://www.yelp.com/search?find_desc=lobster&find_loc=new+york&start=60'
response = requests.get(url)
try:
    results_page = BeautifulSoup(response.content,'lxml')
except:
    print("we didn't get back result page from yelp correctly")
  

[<a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="9DuvvLi4D8pNRx96SH4FDw" href="/adredir?ad_business_id=AMNoZngyjT5wiFxIHuYGiA&amp;campaign_id=m4feyFq9f3E874eCYh3l1A&amp;click_origin=search_results&amp;placement=above_search&amp;redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Froclyns-bronx&amp;request_id=48f1ab73aa43f7b3&amp;signature=6532e4143762359db9c627ddbd9cf6e4558c0dec701e889a73bfa7e26417acb3&amp;slot=0"><span>Roclyn’s</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="z0XS1eRHD6uK0PN77FI2kg" href="/biz/bap-new-york?osq=lobster"><span>Bap</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="LEXrAvYPfshHjC0L6UsHpA" href="/biz/raffettos-new-york-2?osq=lobster"><span>Raffetto’s</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="Ze0ZcyeWi7VJm-roBwR1rw" href="/biz/ocean-prime-new-york?osq=lobs

In [68]:
businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
businesses.extend(businesses)
businesses

[<a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="z0XS1eRHD6uK0PN77FI2kg" href="/biz/bap-new-york?osq=lobster"><span>Bap</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="LEXrAvYPfshHjC0L6UsHpA" href="/biz/raffettos-new-york-2?osq=lobster"><span>Raffetto’s</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="Ze0ZcyeWi7VJm-roBwR1rw" href="/biz/ocean-prime-new-york?osq=lobster"><span>Ocean Prime</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="ozrPxTeAtHnhNOG0zY3zXA" href="/biz/the-original-soupman-new-york-7?osq=lobster"><span>The Original Soupman</span></a>,
 <a class="biz-name js-analytics-click" data-analytics-label="biz-name" data-hovercard-id="eT6hA-oUP2DjC_1DjBwQcQ" href="/biz/rh-rooftop-restaurant-new-york-3?osq=lobster"><span>RH Rooftop Restaurant</span></a>,
 <a class="biz-name js-an

In [37]:
# Yelp is also providing API keys to do some web scraping on there website. We created our own API key to be able
# to do some web scraping when limited by the direct html scraping method we were using before

with open('./YelpAPIKeys.txt','r') as f:
    count = 0
    for line in f:
        if count == 0:
            CLIENT_ID = line.strip()
        if count == 1:
            API_KEY = line.strip()
        count+=1

In [38]:
print(CLIENT_ID,API_KEY)

lAFFi3W2fS4DEF5N0746Pg -lBWAWUBZqrndjWzjK9RrS3OSpCA_Haz_hjDutky0OjQcLXiLO7AAl4K_JlelcZSRNmysHWMfmHFLpf2QsPtWY1uCRIgSMJbdsqV6oTv8z00qR_lpJTkzzRa5zraW3Yx


In [39]:
API_HOST = 'https://api.yelp.com' # this is the API url header
SEARCH_PATH = '/v3/businesses/search' # this is the path for an API request to find businesses
BUSINESS_PATH = '/v3/businesses/'  # this is the path to get data for a single business

In [40]:
# this function creates a list of businesses, given an API key and a location like New York,...

def get_restaurants(api_key,location,number=50):
    import requests
    
    #First we get the access token
    #Set up the search data dictionary
    search_data = {
    'term': "restaurant",
    'location': location.replace(' ', '+'),
    'limit': number
    }
    url = API_HOST + SEARCH_PATH
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    response = requests.request('GET', url, headers=headers, params=search_data).json()
    businesses = response.get('businesses')
    return businesses

In [41]:
restaurants_data = get_restaurants(API_KEY,"New York, NY")

In [42]:
restaurants_data

[{'id': 'ETgJqJHV7BW6pIr9Ox74sA',
  'alias': 'amélie-new-york',
  'name': 'Amélie',
  'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/cSDgVuPMnJgMLTrTNSEXug/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/am%C3%A9lie-new-york?adjust_creative=lAFFi3W2fS4DEF5N0746Pg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=lAFFi3W2fS4DEF5N0746Pg',
  'review_count': 2254,
  'categories': [{'alias': 'french', 'title': 'French'},
   {'alias': 'wine_bars', 'title': 'Wine Bars'}],
  'rating': 4.5,
  'coordinates': {'latitude': 40.7327, 'longitude': -73.99766},
  'transactions': ['restaurant_reservation'],
  'price': '$$',
  'location': {'address1': '22 W 8th St',
   'address2': '',
   'address3': '',
   'city': 'New York',
   'zip_code': '10011',
   'country': 'US',
   'state': 'NY',
   'display_address': ['22 W 8th St', 'New York, NY 10011']},
  'phone': '+12125332962',
  'display_phone': '(212) 533-2962',
  'distance': 3036.049355695584},
 {'id': 'ehUuSk5g