In [None]:
#This is the notebook for tools_project

In [None]:
# scraping libraries to read html code
import json
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

In [None]:
# pandas library to create dataframe
import pandas as pd
import numpy as np

In [None]:
# Build the Yelp search URL from the user's answers.
keyword = input('What do you want to eat now ? ')
address = input('Where are you ? ')

# urlencode escapes spaces, '&', '#', non-ASCII text, etc., so free-form input
# such as "fish & chips" cannot corrupt the query string.
url = 'https://www.yelp.com/search?' + urlencode({'find_desc': keyword, 'find_loc': address})
url

In [None]:
# Fetch the search page; a status code of 200 means the request succeeded.
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)
response.status_code

In [None]:
# Parse the HTML of the search-results page fetched above (whatever `url` pointed to,
# e.g. https://www.yelp.com/search?find_desc=Restaurants&find_loc=New+York,+NY&start=30)
# using the lxml parser.
results_page = BeautifulSoup(response.content,'lxml')

In [None]:
# Collect every <a> tag carrying the class string "biz-name js-analytics-click";
# each element of the resulting list is the HTML of one business entry.
businesses = results_page.find_all('a',class_ = "biz-name js-analytics-click")

In [None]:
# Display the matched anchor tags to inspect what the search returned.
businesses

In [None]:
# Select one result from the businesses list and read its name from the HTML.
# The first element (index 0) is skipped because it is an ad by Yelp; the
# regular listing starts at the second element (index 1).
business = businesses[1]
name = business.find('span').get_text()

# Build the link to the Yelp page of this particular restaurant.
link = 'https://www.yelp.com' + business.get('href')

In [None]:
# Show the restaurant name next to the link to its Yelp page.
print(name,link)

In [None]:
# Download and parse the Yelp page of the business selected above.
# The timeout keeps the cell from hanging forever if the server never answers.
response_business = requests.get(link, timeout=10)
results_business_page = BeautifulSoup(response_business.content, 'lxml')

In [None]:
# Locate the map widget on the business page; its 'data-map-state' attribute
# carries a JSON string that contains the restaurant's coordinates.
json_map = results_business_page.find('div', class_="lightbox-map hidden").get('data-map-state')

# Decode the JSON string into a Python dictionary.
dict_map = json.loads(json_map)

# Both coordinates sit under markers[1] -> "location"; read them in one pass.
latitude, longitude = (
    dict_map["markers"][1]["location"][axis] for axis in ("latitude", "longitude")
)

In [None]:
# Display the raw JSON string read from the map widget.
json_map

In [None]:
# Show the coordinates extracted from the map data.
print(latitude,longitude)

In [None]:
# The category is the text of the first <a> tag inside <span class="category-str-list">.
# NOTE(review): unlike the optional fields in later cells, this lookup is not guarded;
# it raises AttributeError if either tag is missing.
category= results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()

In [None]:
# Display the extracted category.
category

In [None]:
# The price range ('$', '$$', '$$$' or '$$$$') is in <span class="business-attribute price-range">.
# It is not always present: find() then returns None and calling .get_text() on it
# raises AttributeError. Only that error is caught, so unrelated failures
# (including KeyboardInterrupt) are no longer silently swallowed.
try:
    price_range = results_business_page.find("span", class_='business-attribute price-range').get_text()
except AttributeError:
    price_range = None

In [None]:
# Show the price range (None when the page does not list one).
print(price_range)

In [None]:
# The aggregate rating lives in the <div itemprop="aggregateRating"> element:
#   - its first <meta> child stores the rating value (between 0 and 5) in 'content'
#   - its first <span> child holds the number of reviews as text
ratings = results_business_page.find("div", itemprop='aggregateRating')
rating_value, review_count = (
    ratings.find("meta").get('content'),
    ratings.find("span").get_text(),
)

In [None]:
# Show the rating value next to the review count.
print(rating_value,review_count)

In [None]:
# The phone number is the text of <span itemprop="telephone">.
# It is not always present: find() then returns None and .get_text() raises
# AttributeError, which is the only error caught here.
try:
    phone_number = results_business_page.find('span', itemprop="telephone").get_text().strip()
except AttributeError:
    phone_number = None

In [None]:
# Show the phone number (None when the page does not list one).
print(phone_number)

In [None]:
# Whether the restaurant is currently open is given by <span class="nowrap extra open">.
# The tag is not always present: find() then returns None and .get_text() raises
# AttributeError, which is the only error caught here.
try:
    open_ = results_business_page.find('span', class_="nowrap extra open").get_text()
except AttributeError:
    open_ = None

In [None]:
# Show the opening status (None when the page does not provide it).
print(open_)

In [None]:
# The street address is the text of <strong class="street-address">.
# When the tag is missing, find() returns None and .get_text() raises
# AttributeError, which is the only error caught here.
try:
    addr = results_business_page.find('strong', class_="street-address").get_text().strip()
except AttributeError:
    addr = None
addr

In [None]:
# The website is the text of the first <a> inside
# <span class="biz-website js-biz-website js-add-url-tagging">.
# If either tag is missing, a find() call returns None and the next attribute
# access raises AttributeError, which is the only error caught here.
try:
    web = results_business_page.find('span', class_="biz-website js-biz-website js-add-url-tagging").find('a').get_text()
except AttributeError:
    web = None
web

In [None]:
def _tag_text(tag, strip=False):
    """Return the text of a parsed HTML tag, or None when the tag is missing."""
    if tag is None:
        return None
    text = tag.get_text()
    return text.strip() if strip else text


def _map_coordinates(page):
    """Return (latitude, longitude) read from the page's map widget, or (None, None)."""
    map_div = page.find('div', class_="lightbox-map hidden")
    if map_div is None:
        return None, None
    try:
        # 'data-map-state' holds a JSON string; the coordinates are read from
        # the marker at index 1, under its "location" entry.
        location = json.loads(map_div.get('data-map-state'))["markers"][1]["location"]
        return location["latitude"], location["longitude"]
    except (TypeError, ValueError, KeyError, IndexError):
        # Attribute absent, invalid JSON, or an unexpected structure.
        return None, None


def get_info(link, timeout=10):
    """Scrape the details of one business from its Yelp page.

    Every field is optional: when the matching HTML element is missing the
    field is returned as None instead of raising, so a single incomplete page
    cannot abort a batch of lookups.

    Parameters
    ----------
    link : str
        URL of the business page.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout error.

    Returns
    -------
    tuple
        (addr, latitude, longitude, category, price_range, rating_value,
        review_count, phone_number, open_, web)
    """
    response_business = requests.get(link, timeout=timeout)
    results_business_page = BeautifulSoup(response_business.content, 'lxml')

    addr = _tag_text(results_business_page.find('strong', class_="street-address"), strip=True)

    latitude, longitude = _map_coordinates(results_business_page)

    # Category: text of the first <a> inside the category list.
    category_list = results_business_page.find("span", class_='category-str-list')
    category = _tag_text(category_list.find('a')) if category_list is not None else None

    price_range = _tag_text(results_business_page.find("span", class_='business-attribute price-range'))

    # Rating value is the 'content' attribute of the <meta> child; the review
    # count is the text of the <span> child.
    ratings = results_business_page.find("div", itemprop='aggregateRating')
    if ratings is None:
        rating_value = review_count = None
    else:
        rating_meta = ratings.find("meta")
        rating_value = rating_meta.get('content') if rating_meta is not None else None
        review_count = _tag_text(ratings.find("span"))

    phone_number = _tag_text(results_business_page.find('span', itemprop="telephone"), strip=True)

    open_ = _tag_text(results_business_page.find('span', class_="nowrap extra open"))

    website = results_business_page.find('span', class_="biz-website js-biz-website js-add-url-tagging")
    web = _tag_text(website.find('a')) if website is not None else None

    return (addr, latitude, longitude, category, price_range, rating_value, review_count, phone_number, open_, web)
    

In [None]:
# Test the function on Cafe Mogador; the last expression displays the returned tuple.
restaurant_url = "https://www.yelp.com/biz/cafe-mogador-brooklyn?osq=Restaurants"
get_info(restaurant_url)

In [None]:
# Build the list of page links for all businesses found by the search, so that
# details can be compiled for every result.
# The first entry (index 0) is skipped, as in the exploration cells, because it is an ad.
# A comprehension is used so the exploratory `business` and `link` variables
# are not overwritten as a side effect.
link_list = ['https://www.yelp.com' + result.get('href') for result in businesses[1:]]

In [None]:
# Display the collected business links.
link_list

In [None]:
# Create the list of restaurant names, skipping the first entry (index 0)
# exactly as the link list does so that both lists stay aligned.
# A comprehension is used so the exploratory `business` and `name` variables
# are not overwritten as a side effect.
name_list = [result.find('span').get_text() for result in businesses[1:]]

In [None]:
# Display the collected restaurant names.
name_list

In [None]:
# Scrape the details of every restaurant; one HTTP request is made per link,
# so this cell takes a while on long result lists.
info_list = [get_info(business_link) for business_link in link_list]

In [None]:
# Map each restaurant name to the tuple of details scraped for it.
# NOTE(review): names are used as keys, so two results sharing a name
# (e.g. branches of a chain) collapse into one entry — the later one wins.
business_info = dict(zip(name_list, info_list))

In [None]:
# Display the name -> details mapping.
business_info