In [10]:
# scraping libraries to read html code
import requests
from bs4 import BeautifulSoup
import json
import folium

In [11]:
# pandas library to create dataframe
# re for match
import pandas as pd
import numpy as np
import re

In [12]:
# standard library to count frequency 
import collections 

In [9]:
# Direct to initial results
url = 'https://www.yelp.com/search?find_desc='
keyword = input('What do you want to eat ? ')
address = input('Where are you ? ')

url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address 
url

KeyboardInterrupt: 

In [7]:
# Checking if the url is "valid", the status code should be 200
response = requests.get(url)
response.status_code

200

In [17]:
# extract the html code from the web page = https://www.yelp.com/search?find_desc=Restaurants&find_loc=New+York,+NY&start=30
results_page = BeautifulSoup(response.content,'lxml')

In [None]:
# creating a list of businesses with the html code of the business as elements of the list
businesses = results_page.find_all('a',class_ = "biz-name js-analytics-click")

In [None]:
# selecting the first element (a particular restaurant in our case) of the list businesses and looking in the
#ignore the first element of the businesses list, it is an ad by yelp, regular list start from the second (index = 1)
# html code for its name
business = businesses[1]
name = business.find('span').get_text()

# getting the link to the yelp page this particular restaurant
link = 'https://www.yelp.com' + business.get('href')

In [None]:
print(name,link)

In [None]:
# extracting the html code of the web page associated with the first business of our list businesses
response_business = requests.get(link)
results_business_page = BeautifulSoup(response_business.content,'lxml')

In [None]:
# looking in the html page of the restaurant for latitude and longitude coordinates of the restaurant
# the json object json_map contains these coordinates
json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')

# we can convert this json object to a python dictionnary using the following method
dict_map = json.loads(json_map)

# the relevant keys in this dict are "markers", "location" and "latitude" and "longitude"
latitude = dict_map["markers"][1]["location"]["latitude"]
longitude = dict_map["markers"][1]["location"]["longitude"]

In [None]:
json_map

In [None]:
print(latitude,longitude)

In [None]:
# The category can be found in the 'a' tag under "span", class_ = 'category-str-list' tag, 
category= results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()

In [None]:
category

In [None]:
# the price range ('$','$$','$$$', or '$$$$') can be found in the "span", class_ = 'business-attribute price-range' tag
# the price range is not always available, that is why we need to catch errors 
# if necessary (hence the try: except) structure

try:
    price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
except:
    price_range = None

In [None]:
print(price_range)

In [None]:
# the ratings associated to a particular restaurant are in the "div", itemprop = 'aggregateRating' tag
# the rating value (between 0 and 5) can be accessed through ratings (= results_business_page.find("div", 
# itemprop = 'aggregateRating') at the tag "meta" and the attribute 'content'
# the number of reviews is available through ratings at the 'span' tag

ratings = results_business_page.find("div", itemprop = 'aggregateRating')
rating_value = ratings.find("meta").get('content')
review_count = ratings.find("span").get_text()

In [None]:
print(rating_value,review_count)

In [None]:
# the phone number can be accessed at the tag 'span',itemprop = "telephone"
# phone numbers are not always available so we need to catch errors using the try:... except: structure

try:
    phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
except:
    phone_number = None   

In [None]:
print(phone_number)

In [None]:
# We want to know if the restaurant is currently open: this information is 
# given at the 'span', class_ = "nowrap extra open" tag. This information is not always 
# available that is why we use the try: ... except:... structure

try:
    open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
except:
    open_ = None

In [None]:
print(open_)

In [None]:
try:
    addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
except:
    addr = None
addr

In [None]:
try:
    web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
except:
    web = None
web

In [13]:
#want to put all infomations we want to get from one business into one funtion
#input: specific restaurant url link
#output: all details we tested above
def get_info(link):
    response_business = requests.get(link)
    results_business_page = BeautifulSoup(response_business.content,'lxml')
    
    try:
        addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
    except:
        addr = None

    try:
        json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')
        dict_map = json.loads(json_map)
        lat = dict_map["markers"][1]["location"]["latitude"]
        lon = dict_map["markers"][1]["location"]["longitude"]
    except: 
        json_map = None 

    try: 
        category = results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()
    except:
        category = None
        
    try:
        price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
    except:
        price_range = None
        
    try: 
        rating_value = results_business_page.find("div", {'itemprop' : "aggregateRating"}).find("meta").get('content')
    except: 
        rating_value = None
          
    try:
        phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
    except:
        phone_number = None
        
    try:
        open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
    except:
        open_ = 'closed now'
        
    try:
        web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
    except:
        web = None
    
    try:
        moreinfo = results_business_page.find_all('dl')
        delivery = "No"
        for i in moreinfo[1:]:
            try:
                attr = i.find('dt',{'class':"attribute-key"}).get_text().strip()
                yon = i.find('dd').get_text().strip()
                if attr == 'Delivery':
                    delivery = yon
                    break
            except:
                delivery = delivery

    except:
        delivery = None
        
    return(addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon)
    

In [None]:
#Get a list of url links of all businesses (or maybe the first 30)
#so that we could compile a list of details on, for example, the first 30 search-result restaurants
link_list = []

for i in range(1,len(businesses)):
    business = businesses[i]
    link = 'https://www.yelp.com' + business.get('href')
    link_list.append(link)

In [None]:
#Create a list of restaurant names
name_list=[]
for i in range(1,len(businesses)):
    business = businesses[i]
    name = business.find('span').get_text()
    name_list.append(name)

In [None]:
#Create a list of all restaurants' information
info_list=[]

for i in range(len(link_list)):
    info_list.append(get_info(link_list[i]))

In [50]:
def get_businesses():
    global keyword
    keyword = input('What do you want to eat ? ')
    global address
    address = input('Where are you ? ')
    limit = input('How many results do you want? ')
    filters = input('Do you want to sort by \n 1: price low to high  \n 2: price high to low  \n 3: ratings? \n Please enter the number 1/2/3:')
    
    #first page - figure out numbers of results per page
    url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address 
    response = requests.get(url)
    try:
        results_page = BeautifulSoup(response.content,'lxml')
    except:
        print("we didn't get back result page from yelp correctly")
    noperpage = results_page.find('span',{'class':'pagination-results-window'}).get_text().strip()[10:12]
    noperpage = int(noperpage)
    businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
    global cuisine_styles
    cuisine_styles = results_page.find_all('span',class_="category-str-list")
    
    #start loop from the second page to get enough number of restaurants
    loop_range = (int(limit)-1)//noperpage
    for pagination in range(1, loop_range+1):
        url = 'https://www.yelp.com/search?find_desc='+ keyword + '&find_loc=' + address + '&start=' + str(pagination*noperpage)
        response = requests.get(url)
        try:
            results_page = BeautifulSoup(response.content,'lxml')
        except:
            print("we didn't get back result page from yelp correctly")
        
        businesses.extend(results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:])
    businesses = businesses[:int(limit)]
    
    #generate details by get_info and format in dataframe
    business_list = []
    for business in businesses:
        name = business.find('span').get_text()
        link = 'https://www.yelp.com' + business.get('href')
        addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon= get_info(link)
        business_list.append([name,addr,open_,price_range,rating_value,category,phone_number,web,delivery,lat,lon])
    df = pd.DataFrame(business_list[:int(limit)+1], columns = ['name','address','open or closed','price range','rating value','category','phone','website','delivery service','lat','lon'])
    
    #sortby...
    pattern = r'rat'
    match = bool(re.search(pattern, filters))
    if match or filters == '3':
        df = df.sort_values(by='rating value',ascending=False)
    
    elif filters == 'price low to high' or filters == '1':
        df['length'] = df['price range'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=True)
        df = df.drop('length', axis=1)
        
    elif filters == 'price high to low' or filters == '2':
        df['length'] = df['price range'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=False)
        df = df.drop('length', axis=1)
        
    map_address= address
    map_address=map_address.replace(' ','_')
    api_key="AIzaSyD2GsvbVaqwNOj6LNVOXlIpzm_Lu-y7yGs"
    url="https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=%s" % (map_address,api_key) #%s permet de mettre des variables dans des string
    response = requests.get(url)
    a = response.json()
    try:
        location = a['results'][0]['geometry']['location']
        lat = location['lat']
        long = location['lng']
    except:
        lat = None
        long = None
        
    m = folium.Map(location=[lat, long],zoom_start=11.5)
    for k in range(len(df)):
        folium.Marker([df['lat'].iloc[k],df['lon'].iloc[k]],popup=df['name'].iloc[k]).add_to(m)
    
    return(df,m)

In [51]:
res,map_ = get_businesses()

What do you want to eat ? french
Where are you ? Los Angeles
How many results do you want? 5
Do you want to sort by 
 1: price low to high  
 2: price high to low  
 3: ratings? 
 Please enter the number 1/2/3:1


In [52]:
# print(filters)
# print(re.search(pattern, filters))
res

Unnamed: 0,name,address,open or closed,price range,rating value,category,phone,website,delivery service,lat,lon
0,Cafe Beaujolais,"1712 Colorado BlvdLos Angeles, CA 90041",closed now,$$,4.5,French,(323) 255-5111,,No,34.139151,-118.204156
1,Le Petit Marché,"5665 Melrose AveLos Angeles, CA 90038",Open now,$$,4.5,French,(323) 380-6557,lepetitmarche.com,Yes,34.083754,-118.324047
3,République,"624 S La Brea AveLos Angeles, CA 90036",Open now,$$,4.0,French,(310) 362-6115,republiquela.com,No,34.064141,-118.343701
4,A Food Affair,"1513 S Robertson BlvdLos Angeles, CA 90035",closed now,$$,4.5,French,(310) 557-9795,afoodaffair.com,No,34.051783,-118.384407
2,Le Petit Paris,"418 /420 S Spring StLos Angeles, CA 90013",closed now,$$$,4.0,French,(213) 217-4445,lepetitparisla.com,Yes,34.048237,-118.248524


In [53]:
map_

In [15]:
!pip install folium --upgrade

Requirement already up-to-date: folium in /Users/zijun/anaconda3/lib/python3.6/site-packages (0.7.0)
Requirement not upgraded as not directly required: branca>=0.3.0 in /Users/zijun/anaconda3/lib/python3.6/site-packages (from folium) (0.3.1)
Requirement not upgraded as not directly required: numpy in /Users/zijun/anaconda3/lib/python3.6/site-packages (from folium) (1.14.3)
Requirement not upgraded as not directly required: six in /Users/zijun/anaconda3/lib/python3.6/site-packages (from folium) (1.11.0)
Requirement not upgraded as not directly required: requests in /Users/zijun/anaconda3/lib/python3.6/site-packages (from folium) (2.18.4)
Requirement not upgraded as not directly required: jinja2 in /Users/zijun/anaconda3/lib/python3.6/site-packages (from folium) (2.10)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /Users/zijun/anaconda3/lib/python3.6/site-packages (from requests->folium) (3.0.4)
Requirement not upgraded as not directly required: idna<2.7,>=2

In [32]:
import folium
m = folium.Map(location=[40.75, -73.9625727],zoom_start=11.5)

In [33]:
# a marker is created for any restaurant: red for critical situation and orange for not critical
for k in range(len(res)):
    folium.Marker([res['lat'].iloc[k],res['lon'].iloc[k]],popup=res['name'].iloc[k]).add_to(m)
m

TypeError: tuple indices must be integers or slices, not str

In [19]:
#Now, start to find the style of the search results so that we can make relevant recommendations later

cuisine_styles = results_page.find_all('span',class_="category-str-list")
#print(cuisine_styles)
# print(cuisine_styles[4].find('a').get_text())

styles = []
for i in range(len(cuisine_styles)):
    style = [x.get_text() for  x  in cuisine_styles[i].find_all('a')]
    styles.extend(style)

styles

['Pizza',
 'Italian',
 'Pizza',
 'Pizza',
 'Pizza',
 'Italian',
 'Pizza',
 'Pizza',
 'Italian',
 'Pizza',
 'Pizza',
 'Pizza',
 'Italian',
 'Pizza',
 'Pizza',
 'Italian',
 'Beer Bar',
 'Pizza',
 'Pizza',
 'Pizza',
 'Diners',
 'Pizza',
 'Italian',
 'Pizza',
 'Pizza',
 'Pizza',
 'Wine Bars',
 'Beer Bar',
 'Pizza',
 'Pizza',
 'Italian',
 'Pizza',
 'Italian',
 'Pizza',
 'Bars',
 'American (New)',
 'Pizza',
 'Pizza',
 'Italian',
 'Pizza',
 'Italian',
 'Wine Bars',
 'Pizza',
 'Bars',
 'Italian',
 'Pizza',
 'Pizza',
 'Vegan',
 'Italian',
 'Pizza',
 'Salad',
 'Italian',
 'Pizza',
 'Pizza']

In [47]:
# from several random keyword searching, we found that only the first two cuisine styles have significant counts
# e.g. out of the 30 results, the 3rd most common style appears around only 5 times
count = collections.Counter(styles).most_common(2)
print(count)

# Get a new keyword
if keyword == count[0][0].casefold():
    keyword2 = count[1][0].casefold()
else:
    keyword2 = count[0][0].casefold()

print(keyword2)

[('Pizza', 31), ('Italian', 13)]
italian


In [13]:
# Give users an opportunity to see recommendations relevant cuisine
# e.g. if originally searched for "pizza" but did not like the recommendations
# this function returns 10 more recommendations of "Italian" restaurants
def further_rec():
    reaction = input('Are you satisfied with your recommendations? (yes/no)')
    
    if reaction == 'yes':
        return 'Bon Appetit!'
    else:
        # Moved cuisine_styles to get_business() and make it global for access in this function
        styles = []
        for i in range(len(cuisine_styles)):
            style = [x.get_text() for  x  in cuisine_styles[i].find_all('a')]
            styles.extend(style)

        count = collections.Counter(styles).most_common(2)
        if keyword == count[0][0].casefold():
            keyword2 = count[1][0].casefold()
        else:
            keyword2 = count[0][0].casefold()

        url = 'https://www.yelp.com/search?find_desc='+ keyword2 + '&find_loc=' + address
        # new keyword but same address as from get_business()
        response = requests.get(url)
        try:
            results_page = BeautifulSoup(response.content,'lxml')
        except:
            print("we didn't get back result page from yelp correctly")
                
        businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
        business_list = []
        for business in businesses:
            name = business.find('span').get_text()
            link = 'https://www.yelp.com' + business.get('href')
            addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon= get_info(link)
            business_list.append([name,open_,price_range,rating_value,category,phone_number,web,delivery,lat,lon])
                                 
        df = pd.DataFrame(business_list[:11], columns = ['name','open or closed','price range','rating value','category','phone','website','delivery service','lat','lon'])
        print('Take a look at the following 10 restaurants!')                                   
        return df

In [14]:
fr = further_rec()
fr

Are you satisfied with your recommendations? (yes/no)no
Take a look at the following 10 restaurants!


Unnamed: 0,name,open or closed,price range,rating value,category,phone,website,delivery service,lat,lon
0,Vanguard Wine Bar,closed now,$$,4.0,Wine Bars,(212) 799-9463,vanguard-nyc.com,No,40.77628,-73.983222
1,Vin Sur Vingt,closed now,$$,4.5,Wine Bars,(646) 895-9944,vsvwinebars.com,No,40.784938,-73.972925
2,Lilly’s Cocktail and Wine,Open now,$$,4.0,American (New),(212) 799-4140,lillys-cocktail-and-wine.busi…,No,40.777025,-73.978458
3,Amelie,Open now,$$$,4.5,French,(646) 422-7167,ameliewinebar.com,Yes,40.788773,-73.97467
4,Le Pif,Open now,$$,4.0,French,(212) 799-2253,lepifwinebars.com,No,40.777484,-73.981793
5,Pour,Open now,$$,4.0,"Beer, Wine & Spirits",(212) 501-7687,pourwines.com,No,40.780591,-73.979965
6,Vino Levantino,closed now,$$,4.0,Mediterranean,(212) 280-3333,vinolevantino.com,Yes,40.792948,-73.972078
7,Da Capo Columbus,Open now,$$,4.0,Italian,(646) 882-0197,dacaponyc.com,Yes,40.779635,-73.977485
8,The Milling Room,Open now,$$$,4.0,American (New),(212) 595-0380,themillingroom.com,No,40.783747,-73.974468
9,Bin 71,Open now,$$,3.5,Wine Bars,(212) 362-5446,bin71.com,No,40.776459,-73.979252


In [18]:
n = folium.Map(location=[40.8075355, -73.9625727],zoom_start=14)

In [19]:
for k in range(len(fr)):
    folium.Marker([fr['lat'].iloc[k],fr['lon'].iloc[k]],popup=fr['name'].iloc[k]).add_to(n)
n

In [None]:
#workspace with saved results page
url = 'https://www.yelp.com/search?find_desc=lobster&find_loc=new+york&start=60'
response = requests.get(url)
try:
    results_page = BeautifulSoup(response.content,'lxml')
except:
    print("we didn't get back result page from yelp correctly")
  

In [None]:
businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
businesses.extend(businesses)
businesses

In [5]:
# Yelp is also providing API keys to do some web scraping on there website. We created our own API key to be able
# to do some web scraping when limited by the direct html scraping method we were using before

with open('./YelpAPIKeys.txt','r') as f:
    count = 0
    for line in f:
        if count == 0:
            CLIENT_ID = line.strip()
        if count == 1:
            API_KEY = line.strip()
        count+=1

In [6]:
print(CLIENT_ID,API_KEY)

lAFFi3W2fS4DEF5N0746Pg -lBWAWUBZqrndjWzjK9RrS3OSpCA_Haz_hjDutky0OjQcLXiLO7AAl4K_JlelcZSRNmysHWMfmHFLpf2QsPtWY1uCRIgSMJbdsqV6oTv8z00qR_lpJTkzzRa5zraW3Yx


In [30]:
API_HOST = 'https://api.yelp.com' # this is the API url header
SEARCH_PATH = '/v3/businesses/search' # this is the path for an API request to find businesses
BUSINESS_PATH = '/v3/businesses/'  # this is the path to get data for a single business

In [31]:
# this function creates a list of businesses, given an API key and a location like New York,...

def get_restaurants(api_key,location,keyword,number=50):
    import requests
    
    #First we get the access token
    #Set up the search data dictionary
    search_data = {
    'term': "restaurant",
    'location': location.replace(' ', '+'),
    'limit': number,
    'categories': keyword
    }
    url = API_HOST + SEARCH_PATH
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    response = requests.request('GET', url, headers=headers, params=search_data).json()
    businesses = response.get('businesses')
    return businesses

In [39]:
restaurants_data = get_restaurants(API_KEY,"New York",'pizza',number=50)

In [40]:
# This function creates a dataframe based on the data that we extracted from yelp website, 
# using the function get_restaurants

def data_output(restaurants_data):
    df = pd.DataFrame(columns = ['name','is_closed','latitude','longitude','price','rating','categories','phone'])
    for i in range(len(restaurants_data)):
        name = restaurants_data[i]['name']
        is_closed = restaurants_data[i]['is_closed']
        latitude = restaurants_data[i]['coordinates']['latitude']
        longitude = restaurants_data[i]['coordinates']['longitude']
        try:
            price = restaurants_data[i]['price']
        except:
            price = None
        rating = restaurants_data[i]['rating']
        categories = []
        for categorie in restaurants_data[i]['categories']:
            categories.append(categorie['title'])
        phone = restaurants_data[i]['phone']
        
        df.loc[i] = [name,is_closed,latitude,longitude,price,rating,categories,phone]
    df.set_index('name', inplace = True)
    return(df)

In [61]:
def web_scrapingYelp():
    import requests
    
    keyword = input('What do you want to eat ? ')
    address = input('Where are you ? ')
    limit = input('How many results do you want? ')

    search_data = {
    'term': "restaurant",
    'location': address.replace(' ', '+'),
    'limit': limit,
    'categories': keyword 
    }
    url = API_HOST + SEARCH_PATH
    headers = {
        'Authorization': 'Bearer %s' % API_KEY,
    }
    response = requests.request('GET', url, headers=headers, params=search_data).json()
    businesses = response.get('businesses')
    
    filters = input('Do you want to sort by \n 1: price low to high  \n 2: price high to low  \n 3: ratings? \n Please enter the number 1/2/3:')
    
    df = data_output(businesses)
    #sortby...
    pattern = r'rat'
    match = bool(re.search(pattern, filters))
    if match or filters == '3':
        df = df.sort_values(by='rating',ascending=False)
    
    elif filters == 'price low to high' or filters == '1':
        df['length'] = df['price'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=True)
        df = df.drop('length', axis=1)
        
    elif filters == 'price high to low' or filters == '2':
        df['length'] = df['price'].apply(lambda x: len(x))
        df = df.sort_values(by='length',ascending=False)
        df = df.drop('length', axis=1)
        
    return df

In [65]:
web_scrapingYelp()


What do you want to eat ? pizza
Where are you ? New York
How many results do you want? 30
Do you want to sort by 
 1: price low to high  
 2: price high to low  
 3: ratings? 
 Please enter the number 1/2/3:3


Unnamed: 0_level_0,is_closed,latitude,longitude,price,rating,categories,phone
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L'industrie Pizzeria,False,40.71162,-73.95783,$,5.0,[Pizza],17185990002
DIA,False,40.72572,-73.98955,,5.0,"[Pizza, Italian, Seafood]",19176757664
Juliana's Pizza,False,40.702615,-73.993416,$$,4.5,[Pizza],17185966700
Paulie Gee's,False,40.729546,-73.958568,$$,4.5,"[Pizza, Vegan, Italian]",13479873747
Rizzo's Fine Pizza,False,40.720852,-73.984456,$$,4.5,"[Pizza, Wine Bars, Beer Bar]",16464541262
Da Ciro Brooklyn,False,40.69313,-73.96716,$$,4.5,"[Pizza, Italian]",19292959542
Barboncino Pizza & Bar,False,40.67209,-73.95731,$$,4.5,"[Pizza, Bars, Italian]",17184838834
Macoletta,False,40.773242,-73.916415,$$,4.5,[Pizza],17187774992
Prince Street Pizza,False,40.722909,-73.994486,$,4.5,"[Pizza, Italian]",12129664100
Lucali,False,40.6818,-74.00024,$$,4.5,[Pizza],17188584086
