In [10]:
# install/upgrade folium, the library used further down to draw the results map
!pip install folium --upgrade

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import folium

In [2]:
# pandas library to create dataframe
# re for match
import pandas as pd
import numpy as np
import re

In [3]:
# standard library to count frequency 
import collections 

# Part 1) Getting data through the Yelp API

In [25]:
# Yelp also provides an official API, authenticated with an API key. We created our own API key so that we can
# query the API whenever the direct HTML scraping method is too limited.
# The credentials are read from a local text file: line 1 holds the client id, line 2 holds the API key.

with open('./YelpAPIKeys.txt', 'r') as f:
    # Strip the trailing newline (and stray spaces) from every line; lines after the second are ignored.
    key_lines = [line.strip() for line in f]

if len(key_lines) < 2:
    # Fail right here with a clear message instead of a NameError on API_KEY in a later cell.
    raise ValueError('./YelpAPIKeys.txt must contain the client id on line 1 and the API key on line 2')

CLIENT_ID, API_KEY = key_lines[:2]

In [26]:
# Yelp API endpoints. A request URL is built as API_HOST + <path>.
API_HOST = 'https://api.yelp.com' # base URL shared by every API request
SEARCH_PATH = '/v3/businesses/search' # path of the business search endpoint
BUSINESS_PATH = '/v3/businesses/'  # path prefix to get data for a single business (not used in the cells shown here)

In [20]:
# Query the Yelp search API and return the raw list of matching businesses
# for a location such as "New York".

def get_restaurants(api_key,location,keyword,number=50):
    """Search the Yelp API for restaurants of one category near a location.

    Parameters
    ----------
    api_key : str
        Yelp API key, sent as a Bearer token in the Authorization header.
    location : str
        Free-text location, e.g. "New York".
    keyword : str
        Value sent as the ``categories`` filter, e.g. 'pizza'.
    number : int, optional
        Value sent as the ``limit`` parameter (default 50).

    Returns
    -------
    list of dict or None
        The ``businesses`` entry of the JSON response, or None when the
        response contains no such entry (e.g. an error payload).
    """
    search_data = {
        'term': "restaurant",
        # Pass the location untouched: requests URL-encodes query parameters
        # itself, so replacing spaces with '+' beforehand made the API receive
        # a literal '+' (encoded as %2B) inside the location.
        'location': location,
        'limit': number,
        'categories': keyword,
    }
    url = API_HOST + SEARCH_PATH
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    response = requests.get(url, headers=headers, params=search_data).json()
    businesses = response.get('businesses')
    return businesses

In [21]:
# Example run: ask the API for up to 50 businesses in the 'pizza' category around New York.
restaurants_data = get_restaurants(API_KEY,"New York",'pizza',number=50)

In [22]:
# Turn the list of business dictionaries returned by get_restaurants
# (Yelp API) into a dataframe indexed by restaurant name.

def data_output(restaurants_data):
    """Build a dataframe from the business dictionaries of a Yelp API search.

    Parameters
    ----------
    restaurants_data : list of dict or None
        Businesses as returned by get_restaurants. None or an empty list
        (no results / error payload) yields an empty dataframe.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``name`` with the columns ``is_closed``, ``latitude``,
        ``longitude``, ``price``, ``rating``, ``categories`` (list of category
        titles) and ``phone``. Fields absent from a business are left missing.
    """
    columns = ['name','is_closed','latitude','longitude','price','rating','categories','phone']

    rows = []
    for business in restaurants_data or []:
        coordinates = business.get('coordinates') or {}
        rows.append([
            business.get('name'),
            business.get('is_closed'),
            coordinates.get('latitude'),
            coordinates.get('longitude'),
            business.get('price'),  # not every business has a price level
            business.get('rating'),
            [category['title'] for category in business.get('categories') or []],
            business.get('phone'),
        ])

    # Build the frame in one go: appending with df.loc[i] copies the whole
    # frame on every iteration.
    df = pd.DataFrame(rows, columns=columns)
    return df.set_index('name')

In [23]:
def web_scrapingYelp():
    """Interactively query the Yelp API and return the results as a dataframe.

    Prompts for a cuisine keyword, a location, the number of results and a
    sort order, fetches the businesses with get_restaurants and formats them
    with data_output.

    Sort order: '3' (or any answer containing 'rat') sorts by rating, best
    first; '1' / 'price low to high' and '2' / 'price high to low' sort by the
    number of '$' signs. Any other answer leaves the API order untouched.

    Returns
    -------
    pandas.DataFrame
        One row per business, indexed by name.
    """
    keyword = input('What do you want to eat ? ')
    address = input('Where are you ? ')
    limit = input('How many results do you want? ')

    # Reuse the shared API helper instead of duplicating the request code.
    # `or []` covers responses that carry no 'businesses' entry.
    businesses = get_restaurants(API_KEY, address, keyword, number=limit) or []

    filters = input('Do you want to sort by \n 1: price low to high  \n 2: price high to low  \n 3: ratings? \n Please enter the number 1/2/3:')

    df = data_output(businesses)

    def price_level(price):
        # Number of '$' signs. Businesses without a price get NaN so that
        # sort_values puts them last instead of len(None) raising TypeError.
        return len(price) if isinstance(price, str) else float('nan')

    if re.search(r'rat', filters) or filters == '3':
        df = df.sort_values(by='rating',ascending=False)

    elif filters in ('price low to high', '1', 'price high to low', '2'):
        cheapest_first = filters in ('price low to high', '1')
        df = (
            df.assign(length=df['price'].apply(price_level))
            .sort_values(by='length', ascending=cheapest_first)
            .drop('length', axis=1)
        )

    return df

In [24]:
# Run the interactive API search; the returned dataframe is shown as the cell output.
web_scrapingYelp()


What do you want to eat ? noodle
Where are you ? new york
How many results do you want? 3
Do you want to sort by 
 1: price low to high  
 2: price high to low  
 3: ratings? 
 Please enter the number 1/2/3:3


Unnamed: 0_level_0,is_closed,latitude,longitude,price,rating,categories,phone
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Hartley's,False,40.682598,-73.961343,$$,5.0,"[Irish Pub, Breakfast & Brunch]",13477992877
Amélie,False,40.7327,-73.99766,$$,4.5,"[French, Wine Bars]",12125332962
Upstate,False,40.726314,-73.98649,$$,4.5,"[Seafood, Wine Bars, Beer Bar]",16467915400


# Part 2) Web Scraping

In [27]:
# Gather all the information we want about one business in a single function.
# input: URL of a specific restaurant page on yelp.com
# output: tuple with the details scraped from that page
def get_info(link):
    """Scrape the details of one business from its Yelp page.

    Each field is looked up independently, so a missing element only affects
    its own field: missing values come back as None, except ``open_`` which
    falls back to 'closed now' and ``delivery`` which falls back to "No".

    Parameters
    ----------
    link : str
        Full URL of the business page.

    Returns
    -------
    tuple
        (addr, category, price_range, rating_value, phone_number, open_,
        web, delivery, lat, lon)
    """
    response_business = requests.get(link)
    results_business_page = BeautifulSoup(response_business.content,'lxml')

    # find() returns None when an element is absent, so the chained call
    # raises AttributeError; that is the "not on this page" signal below.
    try:
        addr = results_business_page.find('strong', class_ = "street-address").get_text().strip()
    except AttributeError:
        addr = None

    # Coordinates come from the JSON stored in the map widget's attribute.
    # Both values are reset on any failure: previously only json_map was reset,
    # which left lat/lon unbound and crashed at the return statement.
    try:
        json_map = results_business_page.find('div', class_ = "lightbox-map hidden").get('data-map-state')
        marker_location = json.loads(json_map)["markers"][1]["location"]
        lat = marker_location["latitude"]
        lon = marker_location["longitude"]
    except (AttributeError, TypeError, ValueError, KeyError, IndexError):
        lat = lon = None

    try:
        category = results_business_page.find("span", class_ = 'category-str-list').find('a').get_text()
    except AttributeError:
        category = None

    try:
        price_range = results_business_page.find("span", class_ = 'business-attribute price-range').get_text()
    except AttributeError:
        price_range = None

    try:
        rating_value = results_business_page.find("div", {'itemprop' : "aggregateRating"}).find("meta").get('content')
    except AttributeError:
        rating_value = None

    try:
        phone_number = results_business_page.find('span',itemprop = "telephone").get_text().strip()
    except AttributeError:
        phone_number = None

    # The "open" badge is only looked up under this class; without it the
    # business is reported as closed.
    try:
        open_ = results_business_page.find('span', class_ = "nowrap extra open").get_text()
    except AttributeError:
        open_ = 'closed now'

    try:
        web = results_business_page.find('span', class_ = "biz-website js-biz-website js-add-url-tagging").find('a').get_text()
    except AttributeError:
        web = None

    # Scan the <dl> attribute entries (the first one is skipped, as before)
    # for the 'Delivery' key; entries without a key/value pair are ignored.
    delivery = "No"
    for entry in results_business_page.find_all('dl')[1:]:
        try:
            attr = entry.find('dt',{'class':"attribute-key"}).get_text().strip()
            yon = entry.find('dd').get_text().strip()
        except AttributeError:
            continue
        if attr == 'Delivery':
            delivery = yon
            break

    return(addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon)
    

In [28]:
def get_businesses():
    """Interactively scrape Yelp search results and plot them on a map.

    Prompts for a search keyword, a location, the number of results wanted and
    a sort order; scrapes yelp.com result pages until enough businesses are
    collected; fetches the details of each one with get_info(); and puts one
    marker per located business on a folium map.

    Sort order: '3' (or any answer containing 'rat') sorts by rating, best
    first; '1' / 'price low to high' and '2' / 'price high to low' sort by the
    length of the price string. Any other answer keeps Yelp's order.

    Side effects
    ------------
    Sets the globals ``keyword``, ``address`` and ``cuisine_styles``, which
    further_rec() reads afterwards.

    Configuration
    -------------
    The map is centred by geocoding the address with the Google Geocoding API
    when the environment variable GOOGLE_MAPS_API_KEY is set. Without a key
    (or when geocoding fails) the map is centred on the mean position of the
    results. Keep the key out of the notebook.

    Returns
    -------
    tuple
        (df, m): the results dataframe and the folium map.
    """
    import os  # local import: only needed to look up the geocoding key

    global keyword, address, cuisine_styles
    keyword = input('What do you want to eat ? ')
    address = input('Where are you ? ')
    limit = int(input('How many results do you want? '))
    filters = input('Do you want to sort by \n 1: price low to high  \n 2: price high to low  \n 3: ratings? \n Please enter the number 1/2/3:')

    # Let requests build the query string so that spaces and special
    # characters in the user's input are URL-encoded.
    search_url = 'https://www.yelp.com/search'
    query = {'find_desc': keyword, 'find_loc': address}

    # first page - figure out the number of results per page
    response = requests.get(search_url, params=query)
    results_page = BeautifulSoup(response.content,'lxml')
    # NOTE(review): the first match of every page is skipped, presumably a
    # sponsored listing; confirm against the page markup.
    businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:]
    cuisine_styles = results_page.find_all('span',class_="category-str-list")

    # The pagination label reads like "Showing 1-10 of 240"; the upper bound
    # of the window on the first page is the page size.
    window = results_page.find('span',{'class':'pagination-results-window'})
    window_match = re.search(r'\d+\s*-\s*(\d+)', window.get_text()) if window else None
    noperpage = int(window_match.group(1)) if window_match else len(businesses)

    # start loop from the second page to get enough restaurants
    if noperpage > 0:
        loop_range = (limit-1)//noperpage
        for pagination in range(1, loop_range+1):
            response = requests.get(search_url, params=dict(query, start=pagination*noperpage))
            results_page = BeautifulSoup(response.content,'lxml')
            businesses.extend(results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:])
    businesses = businesses[:limit]

    # generate details with get_info and format them in a dataframe
    business_list = []
    for business in businesses:
        name = business.find('span').get_text()
        link = 'https://www.yelp.com' + business.get('href')
        addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon= get_info(link)
        business_list.append([name,addr,open_,price_range,rating_value,category,phone_number,web,delivery,lat,lon])
    df = pd.DataFrame(business_list, columns = ['name','address','open or closed','price range','rating value','category','phone','website','delivery service','lat','lon'])

    def price_level(price):
        # Length of the scraped price string. Businesses without a price get
        # NaN so they sort last instead of len(None) raising TypeError.
        return len(price) if isinstance(price, str) else float('nan')

    # sort by...
    if re.search(r'rat', filters) or filters == '3':
        df = df.sort_values(by='rating value',ascending=False)

    elif filters in ('price low to high', '1', 'price high to low', '2'):
        cheapest_first = filters in ('price low to high', '1')
        df = (
            df.assign(length=df['price range'].apply(price_level))
            .sort_values(by='length', ascending=cheapest_first)
            .drop('length', axis=1)
        )

    # Centre of the map: geocode the address when a key is configured.
    center = None
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if api_key:
        geocode_response = requests.get(
            'https://maps.googleapis.com/maps/api/geocode/json',
            params={'address': address, 'key': api_key},
        )
        try:
            location = geocode_response.json()['results'][0]['geometry']['location']
            center = [location['lat'], location['lng']]
        except (ValueError, KeyError, IndexError, TypeError):
            center = None

    # Only businesses whose coordinates were found can be placed on the map.
    located = df.dropna(subset=['lat', 'lon'])
    if center is None and not located.empty:
        center = [located['lat'].astype(float).mean(), located['lon'].astype(float).mean()]

    # With no centre at all, fall back to folium's default world view.
    m = folium.Map(location=center,zoom_start=11.5) if center is not None else folium.Map()
    for _, row in located.iterrows():
        folium.Marker([row['lat'],row['lon']],popup=row['name']).add_to(m)

    return(df,m)

In [8]:
# Run the interactive scraper: `res` is the results dataframe, `map_` the folium map.
res,map_ = get_businesses()

What do you want to eat ? lobster
Where are you ? berkley
How many results do you want? 4
Do you want to sort by 
 1: price low to high  
 2: price high to low  
 3: ratings? 
 Please enter the number 1/2/3:4


In [9]:
# Display the scraped results table.
res

Unnamed: 0,name,address,open or closed,price range,rating value,category,phone,website,delivery service,lat,lon
0,Republica,"1999 Coolidge HwyBerkley, MI 48072",Open now,$$,4.0,Gastropubs,(248) 268-3175,republicaberkley.com,Yes,42.491561,-83.183523
1,Kruse & Muer on Woodward,"28028 Woodward AveRoyal Oak, MI 48067",closed now,$$,3.5,Seafood,(248) 965-2101,kruseandmuerrestaurants.com/l…,No,42.497138,-83.165509
2,O’ Mara’s Irish Pub,"2555 12 Mile RdBerkley, MI 48072",Open now,$$,3.0,American (Traditional),(248) 399-6750,omaras.net,Yes,42.502808,-83.184745
3,Johnny’s,"215 S Main StRoyal Oak, MI 48067",closed now,$$,4.0,Cocktail Bars,(248) 794-1261,johnnysroyaloak.com,No,42.48856,-83.143667


### Mapping the results

In [10]:
# Display the folium map returned by get_businesses().
map_

### Further Recommendations
Give users an opportunity to see recommendations for a related cuisine,
e.g. if they originally searched for "pizza" but did not like the recommendations,
this function returns 10 more recommendations of "Italian" restaurants.

In [13]:
def further_rec():
    """Offer up to 10 extra suggestions from a related cuisine.

    Must be called after get_businesses(): it reads the globals ``keyword``,
    ``address`` and ``cuisine_styles`` that function sets. The new search
    keyword is the most frequent cuisine style among the previous results
    that differs from the original keyword.

    Returns
    -------
    str or pandas.DataFrame
        'Bon Appetit!' when the user answers 'yes'; otherwise a dataframe
        with at most 10 businesses (empty when no cuisine style is known).
    """
    reaction = input('Are you satisfied with your recommendations? (yes/no)')

    if reaction == 'yes':
        return 'Bon Appetit!'

    columns = ['name','open or closed','price range','rating value','category','phone','website','delivery service','lat','lon']
    max_results = 10

    # cuisine_styles was collected by get_businesses() and made global there.
    styles = [link.get_text() for tag in cuisine_styles for link in tag.find_all('a')]

    # Styles ordered from most to least frequent, compared case-insensitively
    # with the original keyword.
    ranked = [style.casefold() for style, _ in collections.Counter(styles).most_common()]
    if not ranked:
        # No style to pivot to; previously this raised IndexError.
        print('No related cuisine found for your search.')
        return pd.DataFrame(columns=columns)
    searched = keyword.casefold()
    alternatives = [style for style in ranked if style != searched]
    # When the only known style is the keyword itself, search for it again
    # rather than failing on a missing second entry.
    keyword2 = alternatives[0] if alternatives else ranked[0]

    # New keyword but same address as in get_businesses(); requests encodes
    # the query string.
    response = requests.get('https://www.yelp.com/search', params={'find_desc': keyword2, 'find_loc': address})
    results_page = BeautifulSoup(response.content,'lxml')

    # Truncate before scraping so that only the pages actually shown are
    # fetched (each get_info call is one HTTP request).
    businesses = results_page.find_all('a',{'data-analytics-label':"biz-name"})[1:][:max_results]
    business_list = []
    for business in businesses:
        name = business.find('span').get_text()
        link = 'https://www.yelp.com' + business.get('href')
        addr,category,price_range,rating_value,phone_number,open_,web,delivery,lat,lon= get_info(link)
        business_list.append([name,open_,price_range,rating_value,category,phone_number,web,delivery,lat,lon])

    df = pd.DataFrame(business_list, columns = columns)
    print('Take a look at the following 10 restaurants!')
    return df

In [14]:
# Ask for feedback; `fr` is either the farewell string or a dataframe of extra suggestions.
fr = further_rec()
fr

Are you satisfied with your recommendations? (yes/no)no
Take a look at the following 10 restaurants!


Unnamed: 0,name,open or closed,price range,rating value,category,phone,website,delivery service,lat,lon
