In [1]:
# Notebook for tools_project: scrape a Yelp search results page and extract details of a business

In [6]:
# scraping libraries to read html code
import json
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

In [7]:
# pandas library to create dataframe
import pandas as pd
import numpy as np

In [8]:
# Build the Yelp search URL from the user's answers.
# NOTE(review): the location is always suffixed with ", NY" and the query always
# carries `start=30` (presumably a pagination offset -- confirm); both are hard-coded.
business = input('What business are you looking for (Restaurants, Hairdressers,...) ? ')
address = input('What is your search area : (city,street,neighborhood,...) ? ')

# quote_plus turns spaces into '+' and percent-encodes reserved characters
# (&, #, ?, non-ASCII, ...), so free-text answers cannot corrupt the query string.
# `business` is left untouched; only its encoded copy goes into the URL.
find_desc = quote_plus(business.strip())
address = quote_plus(address.strip())

url = 'https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=30' % (find_desc, address)
print(url)

What business are you looking for (Restaurants, Hairdressers,...) ? Restaurants
What is your search area : (city,street,neighborhood,...) ? New York
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New+York,+NY&start=30


In [9]:
# Download the search results page and show the HTTP status code, which should be 200
# when the URL is "valid".
# A timeout is passed explicitly: without one, requests waits indefinitely for a
# server that never answers, which would freeze the kernel.
response = requests.get(url, timeout=10)
response.status_code

200

In [10]:
# Parse the HTML of the downloaded search results page with the lxml parser,
# then display the parsed document
results_page = BeautifulSoup(markup=response.content, features='lxml')
results_page

<!DOCTYPE HTML>
<!--[if lt IE 7 ]> <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie6 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 7 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie7 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 8 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie8 ie ltie9 no-js" lang="en"> <![endif]--><!--[if IE 9 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie9 ie no-js" lang="en"> <![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class="no-js" lang="en" xmlns:fb="http://www.facebook.com/2008/fbml"> <!--<![endif]-->
<head>
<script>
            (function() {
                var main = null;

                var main=function(){window.onerror=function(k,a,c,i,f){var j=(document.getElementsByTagName("html")[0].getAttribute("webdriver")==="true"||navigator.userAgent==="selenium");var h=f&&(f.name==="ServerSideRenderingError"||f.name==="CSRFallbackError");if(j&&!h){document.body.innerHTML=

In [12]:
# Gather every <li class="regular-search-result"> element into a list;
# each element holds the HTML code of one business
businesses = results_page.find_all(name='li', attrs={'class': "regular-search-result"})

In [14]:
# Select the first element of the list `businesses` (a particular restaurant in our
# case) and look in its HTML code for its name.
# An empty list means the page contained no "regular-search-result" items; fail with
# a descriptive message instead of a bare "list index out of range".
if not businesses:
    raise IndexError('No search results found on the page: check the URL and the page markup.')
business = businesses[0]
a_tag = business.find('a', {'class': "biz-name js-analytics-click"})
business_name = a_tag.find('span').get_text()

# Build the link to the Yelp page of this particular restaurant from the anchor's href
business_link = 'https://www.yelp.com' + a_tag.get('href')

In [22]:
# Show the name and the Yelp link of the selected business on one line
print(f"{business_name} {business_link}")

Midwinter Kitchen https://www.yelp.com/biz/midwinter-kitchen-new-york?osq=Restaurants


In [17]:
# Download and parse the web page associated with the first business of our list.
# The timeout prevents an unresponsive server from blocking the kernel forever, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page
# (which would only fail later, when the expected tags are not found).
response_business = requests.get(business_link, timeout=10)
response_business.raise_for_status()
results_business_page = BeautifulSoup(response_business.content,'lxml')

In [18]:
# The latitude and longitude of the restaurant are stored as JSON text in the
# `data-map-state` attribute of the <div class="lightbox-map hidden"> element
map_div = results_business_page.find('div', class_ = "lightbox-map hidden")
json_map = map_div.get('data-map-state')

# Decode the JSON text into a Python dictionary
dict_map = json.loads(json_map)

# The coordinates are read from the "location" entry of the marker at index 1
marker_location = dict_map["markers"][1]["location"]
latitude = marker_location["latitude"]
longitude = marker_location["longitude"]

In [26]:
# Show the coordinates of the business, separated by a single space
print(latitude, longitude, sep=' ')

40.7354179 -73.9829477


In [23]:
# The categories are listed inside the <span class="category-str-list"> tag:
# every <a> tag nested in it carries the text of one category
categories = results_business_page.find("span", class_ = 'category-str-list')
category_list = [link.get_text() for link in categories.find_all('a')]

In [25]:
# Show the categories collected for this business
print(str(category_list))

['American (New)']


In [27]:
# The price range ('$', '$$', '$$$' or '$$$$') is the text of the
# <span class="business-attribute price-range"> tag.
# The price range is not always available: find() returns None when the tag is
# missing, so that case is tested explicitly. Unlike a bare `except:`, this does not
# hide unrelated errors (e.g. an undefined variable) or swallow KeyboardInterrupt.
price_tag = results_business_page.find("span", class_ = 'business-attribute price-range')
price_range = price_tag.get_text() if price_tag is not None else None

In [28]:
# Show the price range (prints None when it was not available)
print(f"{price_range}")

$$


In [29]:
# Rating information sits in the <div itemprop="aggregateRating"> element:
#   - the rating value is the `content` attribute of its first <meta> tag
#   - the number of reviews is the text of its first <span> tag
ratings = results_business_page.find("div", attrs={'itemprop': 'aggregateRating'})

rating_meta = ratings.find("meta")
rating_value = rating_meta.get('content')

review_span = ratings.find("span")
review_count = review_span.get_text()

In [30]:
# Show the rating value followed by the number of reviews
print(*[rating_value, review_count])

4.5 423


In [31]:
# The phone number is the text of the <span itemprop="telephone"> tag, with the
# surrounding whitespace stripped.
# Phone numbers are not always available: find() returns None when the tag is
# missing, so that case is tested explicitly. Unlike a bare `except:`, this does not
# hide unrelated errors (e.g. an undefined variable) or swallow KeyboardInterrupt.
phone_tag = results_business_page.find('span',itemprop = "telephone")
phone_number = phone_tag.get_text().strip() if phone_tag is not None else None

In [32]:
# Show the phone number (prints None when it was not available)
print(phone_number, end='\n')

(212) 505-8500


In [33]:
# Whether the restaurant is currently open is given by the text of the
# <span class="nowrap extra open"> tag.
# This information is not always available: find() returns None when the tag is
# missing, so that case is tested explicitly. Unlike a bare `except:`, this does not
# hide unrelated errors (e.g. an undefined variable) or swallow KeyboardInterrupt.
open_tag = results_business_page.find('span', class_ = "nowrap extra open")
open_ = open_tag.get_text() if open_tag is not None else None

In [34]:
# Show the opening status (prints None when it was not available)
print(str(open_))

Open now
