In [1]:
#This is the notebook for tools_project

In [22]:
# scraping libraries to read html code
import requests
from bs4 import BeautifulSoup
import json

In [23]:
# pandas library to create dataframe
import pandas as pd
import numpy as np

In [14]:
# Direct to initial results
url = 'https://www.yelp.com/search?find_desc='
keyword = input('What do you want to eat ? ')
address = input('Where are you ? ')

url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address 
url

What do you want to eat ? pizza
Where are you ? boston


'https://www.yelp.com/search?find_desc=pizza&find_loc=boston'

In [15]:
# Checking if the url is "valid", the status code should be 200
response = requests.get(url)
response.status_code

200

In [16]:
# extract the html code from the web page = https://www.yelp.com/search?find_desc=Restaurants&find_loc=New+York,+NY&start=30
results_page = BeautifulSoup(response.content,'lxml')

In [17]:
# creating a list of businesses with the html code of the business as elements of the list
businesses = results_page.find_all('a',class_ = "biz-name js-analytics-click")

In [18]:
# selecting the first element (a particular restaurant in our case) of the list businesses and looking in the
#ignore the first element of the businesses list, it is an ad by yelp, regular list start from the second (index = 1)
# html code for its name
business = businesses[1]
name = business.find('span').get_text()

# getting the link to the yelp page this particular restaurant
link = 'https://www.yelp.com' + business.get('href')

In [19]:
print(name,link)

Regina Pizzeria https://www.yelp.com/biz/regina-pizzeria-boston-28?osq=pizza


In [20]:
# extracting the html code of the web page associated with the first business of our list businesses
response_business = requests.get(link)
results_business_page = BeautifulSoup(response_business.content,'lxml')

In [21]:
# looking in the html page of the restaurant for latitude and longitude coordinates of the restaurant
# the json object json_map contains these coordinates
json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')

# we can convert this json object to a python dictionnary using the following method
dict_map = json.loads(json_map)

# the relevant keys in this dict are "markers", "location" and "latitude" and "longitude"
latitude = dict_map["markers"][1]["location"]["latitude"]
longitude = dict_map["markers"][1]["location"]["longitude"]

In [78]:
json_map

'{"serviceAreas": [], "moMapPossible": true, "scrollwheelZoom": false, "zoomControlPosition": "top_right", "minZoomlevel": null, "isFullBleed": false, "maxZoomlevel": null, "zoom": 15, "library": "google", "fitToGeobox": false, "hoods": [], "adPinColor": null, "markers": [{"location": null, "key": "directions_marker", "icon": {"name": "directions", "anchorOffset": [12, 32], "activeOrigin": [0, 0], "scaledSize": [24, 32], "regularUri": "https://s3-media4.fl.yelpcdn.com/assets/srv0/yelp_maps/79f63ebc20db/assets/img/directions@2x.png", "size": [24, 32], "activeUri": "https://s3-media4.fl.yelpcdn.com/assets/srv0/yelp_maps/7249ab345ac8/assets/img/directions_highlighted@2x.png", "regularOrigin": [0, 0]}}, {"resourceType": "business", "url": "/biz/ramen-hood-new-york", "resourceId": "oy_FqW6XO8S06pg6nHbhog", "shouldOpenInNewTab": false, "location": {"latitude": 40.7223112701284, "longitude": -73.9973774072879}, "key": "starred_business", "hovercardId": "xJJUNXIR1JECvREs7j-Mzg", "icon": {"name

In [79]:
print(latitude,longitude)

40.7223112701284 -73.9973774072879


In [80]:
# The category can be found in the 'a' tag under "span", class_ = 'category-str-list' tag, 
category= results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()

In [81]:
category

'Ramen'

In [82]:
# the price range ('$','$$','$$$', or '$$$$') can be found in the "span", class_ = 'business-attribute price-range' tag
# the price range is not always available, that is why we need to catch errors 
# if necessary (hence the try: except) structure

try:
    price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
except:
    price_range = None

In [83]:
print(price_range)

$$


In [84]:
# the ratings associated to a particular restaurant are in the "div", itemprop = 'aggregateRating' tag
# the rating value (between 0 and 5) can be accessed through ratings (= results_business_page.find("div", 
# itemprop = 'aggregateRating') at the tag "meta" and the attribute 'content'
# the number of reviews is available through ratings at the 'span' tag

ratings = results_business_page.find("div", itemprop = 'aggregateRating')
rating_value = ratings.find("meta").get('content')
review_count = ratings.find("span").get_text()

In [85]:
print(rating_value,review_count)

4.5 31


In [86]:
# the phone number can be accessed at the tag 'span',itemprop = "telephone"
# phone numbers are not always available so we need to catch errors using the try:... except: structure

try:
    phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
except:
    phone_number = None   

In [87]:
print(phone_number)




In [88]:
# We want to know if the restaurant is currently open: this information is 
# given at the 'span', class_ = "nowrap extra open" tag. This information is not always 
# available that is why we use the try: ... except:... structure

try:
    open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
except:
    open_ = None

In [89]:
print(open_)

None


In [119]:
try:
    addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
except:
    addr = None
addr

"62 Spring StChef's Club CounterNew York, NY 10012"

In [22]:
try:
    web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
except:
    web = None
web

'reginapizzeria.com'

In [24]:
#want to put all infomations we want to get from one business into one funtion
#input: url link
#output: all details we tested above
def get_info(link):
    response_business = requests.get(link)
    results_business_page = BeautifulSoup(response_business.content,'lxml')
    
    try:
        addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
    except:
        addr = None

    try:
        json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')
        dict_map = json.loads(json_map)
        lat = dict_map["markers"][1]["location"]["latitude"]
        lon = dict_map["markers"][1]["location"]["longitude"]
    except: 
        json_map = None 

    try: 
        category = results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()
    except:
        category = None
        
    try:
        price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
    except:
        price_range = None
        
    try: 
        rating_value = results_business_page.find("div", {'itemprop' : "aggregateRating"}).find("meta").get('content')
    except: 
        rating_value = None
        
#     try:
#         review_count = results_business_page.find("div", {'itemprop' : "aggregateRating"}).find("span").get_text()
#     except: 
#         rating_count = None
    
    try:
        phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
    except:
        phone_number = None
        
    try:
        open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
    except:
        open_ = 'closed now'
        
    try:
        web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
    except:
        web = None
        
    return(addr,category,price_range,rating_value,phone_number,open_,web,lat,lon)
    

In [24]:
#test the function on cafe mogador
restaurant_url = "https://www.yelp.com/biz/lombardis-pizza-new-york?osq=pizza"
get_info(restaurant_url)

('32 Spring StNew York, NY 10012',
 'Pizza',
 '$$',
 '4.0',
 '(212) 941-7994',
 'Open now',
 'firstpizza.com',
 40.7216368,
 -73.9957205)

In [94]:
#Get a list of url links of all businesses (or maybe the first 30)
#so that we could compile a list of details on, for example, the first 30 search-result restaurants
link_list = []

for i in range(1,len(businesses)):
    business = businesses[i]
    link = 'https://www.yelp.com' + business.get('href')
    link_list.append(link)

In [96]:
#Create a list of restaurant names
name_list=[]
for i in range(1,len(businesses)):
    business = businesses[i]
    name = business.find('span').get_text()
    name_list.append(name)

In [98]:
#Create a list of all restaurants' information
info_list=[]

for i in range(len(link_list)):
    info_list.append(get_info(link_list[i]))

In [100]:
#Create a dictionary showing the information of each restaurant
business_info={}

for i in range(len(name_list)):
    business_info[name_list[i]] = info_list[i]        

In [None]:
for page_index in range(18):
    print (page_index)

In [28]:
def get_businesses():
    business_list = []
    
    keyword = input('What do you want to eat ? ')
    address = input('Where are you ? ')
    limit = input('How many results do you want?')
    filters = input('Do you want to sort by price or by rating? ')
    
    loop_range = (int(limit)-1)//10
    for page_index in range(loop_range+1):
        url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address + '&start=' + str(page_index*10)
        response = requests.get(url)
        try:
            results_page = BeautifulSoup(response.content,'lxml')
        except:
            print("we didn't get back result page from yelp correctly")
        
        businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})
        business_list = []
        for business in businesses[1:]:
            name = business.find('span').get_text()
            link = 'https://www.yelp.com' + business.get('href')
            addr,category,price_range,rating_value,phone_number,open_,web,lat,lon= get_info(link)
            business_list.append([name,addr,open_,price_range,rating_value,category,phone_number,web,lat,lon])
    df = pd.DataFrame(business_list[:int(limit)+1], columns = ['name','address','open or closed','price range','rating value','category','phone','website','lat','lon'])
    
    if filters == 'rates':
        df = df.sort_values(by='rating value',ascending=False)
        
    elif filters == 'price':
        df['length'] = df['price range'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=False)
        df = df.drop('length', axis=1)
    
    return(df)

In [36]:
res = get_businesses()

What do you want to eat ? chinese
Where are you ? manhatten
How many results do you want?9
Do you want to sort by price or by rating? rating


In [37]:
res

Unnamed: 0,name,address,open or closed,price range,rating value,category,phone,website,lat,lon
0,Café China,"13 E 37th StNew York, NY 10016",Open now,$$,4.0,Szechuan,(212) 213-2810,cafechinanyc.com,40.749923,-73.981946
1,Hui Restaurant & Bar,"314 E 70th StFL 1New York, NY 10021",Open now,$$,4.5,Bars,(646) 869-0339,huirestaurantandbar.com,40.766929,-73.958518
2,China Xiang,"360 W 42nd StreetNew York, NY 10036",Open now,$$,4.0,Chinese,(212) 967-6088,chinaxiang360.com,40.75829,-73.992511
3,Xi’an Famous Foods,"24 W 45th StNew York, NY 10036",Open now,$,4.0,Chinese,(212) 786-2068,xianfoods.com,40.755795,-73.980813
4,Zest Szechuan,"45 W 39th StNew York, NY 10018",Open now,$$,3.5,Szechuan,(646) 870-0521,zestszechuan.com,40.752733,-73.984451
5,Radiance,"208 E 50th St2nd & 3rd AveNew York, NY 10022",Open now,$$,4.0,Chinese,(212) 888-8060,radiancetea.com,40.755578,-73.97035
6,Joe’s Shanghai,"24 W 56th StNew York, NY 10019",Open now,$$,3.5,Shanghainese,(212) 333-3868,joeshanghairestaurants.com,40.762769,-73.975901
7,Spice Symphony,"150 E 50th StNew York, NY 10022",Open now,$$,4.5,Chinese,(212) 300-4869,spicesymphony.com,40.755905,-73.97157
8,Jasmine Restaurant,"216 E 49th StNew York, NY 10017",Open now,$$,4.0,Shanghainese,(212) 371-2348,jasminerestaurantnyc.com,40.75473,-73.97064
9,Dim Sum Palace,"334 W 46th StNew York, NY 10036",Open now,$$,4.0,Dim Sum,(646) 861-1910,dimsumpalace.com,40.760196,-73.989345


In [None]:
import folium
m = folium.Map(location=[40.8075355, -73.9625727],zoom_start=14)

In [None]:
# a marker is created for any restaurant: red for critical situation and orange for not critical
for k in range(len(res)):
    folium.Marker([res['lat'].iloc[k],res['lon'].iloc[k]],popup=res['name'].iloc[k]).add_to(m)
m

In [None]:
#workspace
url = "https://www.yelp.com/search?find_desc=burger+&find_loc=new+york"
response = requests.get(url)
results_page = BeautifulSoup(response.content,'lxml')
businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})
business_list = []
for business in businesses:
    name = business.find('span').get_text()
    link = 'https://www.yelp.com' + business.get('href')
    addr,category,price_range,rating_value,phone_number,open_,web = get_info(link)
    #[name,open_,latitude,longitude,price_range,rating_value,review_count,category,phone_number,web]        
    #df.loc[i] = [name,open_,latitude,longitude,price_range,rating_value,review_count,category,phone_number,web]
    business_list.append([name,open_,price_range,rating_value,category,phone_number,web])
business_list
