# Prepare Restaurant data

In [1]:
# import needed libraries
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Web scraping www.zomato.com to extract restaurant data.
<b>Disclaimers: </b>

- None of the extracted data is my own; all of it is the property of www.zomato.com
- The scraped data is not used for commercial purposes; it is used purely for personal educational purposes
- The HTML, formats, tags, parameters, and other website scripts used as reference were working as of this writing; the website owner/administrators may change them at any time, which may break this code
- The code blocks may take 0–2 minutes to run depending on hardware/software/network capabilities. If you want to rerun them, your patience is appreciated
- The time module is imported to pause between requests, to avoid overloading the site and getting blocked

## Use Search Url and create BeautifulSoup object

In [3]:
# Request headers sent with every call to the site.
# The User-Agent identifies the client as a desktop Chrome browser.
# NOTE(review): the Access-Control-* entries are CORS *response* headers and
# presumably have no effect on a request; they are kept so the outgoing
# request stays the same -- confirm whether they can be dropped.
headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }

# get html text and create BeautifulSoup object
search_url = 'https://www.zomato.com/manila/reopened-for-dine-in'
# timeout: requests has no default timeout, so a stalled connection would
# otherwise block this cell indefinitely
search_html = requests.get(search_url, headers = headers, timeout = 30)
# fail fast on an HTTP error status instead of silently parsing an error page
search_html.raise_for_status()
search_soup = BeautifulSoup(search_html.text, 'html.parser')

## Find the Restaurant links from the website search page and append to a list

In [5]:
# To (re)discover the value for resto_link_class, temporarily uncomment the 3 lines below
#resto_links = search_soup.find_all('a')
#for link in resto_links[:200] :
    #print(link)

resto_link_class = {'class' : 'sc-cpUXGm sc-cXIWzj jQZAUH'} #this changes everytime you access 1st link(search_url)
resto_links = search_soup.find_all('a', resto_link_class)

# filter direct restaurant links and append to list
resto_links_lst = []
for link in resto_links :
    href = link.get('href')
    if not href :
        # <a> tag without an href (or an empty one): nothing to follow
        continue

    segments = href.split('/')
    # For 'https://www.zomato.com/manila/<slug>' index 4 is the slug.
    # Hrefs with fewer than 5 segments cannot be checked that way, so skip
    # them instead of raising IndexError; 'restaurants' at index 4 is not a
    # direct restaurant link and is skipped as well.
    if len(segments) < 5 or segments[4].strip() == 'restaurants' :
        continue

    # drop the query string so only the clean restaurant URL is stored
    resto_links_lst.append(href.split('?')[0].strip())

print('Scraping Complete!')
print('Count of Hotel Links : ', len(resto_links_lst) )
print('Sample Links:')
for link in resto_links_lst[0:3] : print(link)

Scraping Complete!
Count of Hotel Links :  27
Sample Links:
https://www.zomato.com/manila/leanns-tea-house-tomas-morato-quezon-city
https://www.zomato.com/manila/royal-indian-curry-house-poblacion-makati-city
https://www.zomato.com/manila/sensei-bf-homes-parañaque-city


## For each link in the list, scrape restaurant data

In [6]:
# Seconds to wait for each HTTP response; requests has no default timeout,
# so a stalled connection would otherwise block the loop indefinitely.
request_timeout = 30

# Same text for every restaurant:
# no exact no. of seats available in all review and restaurant social media sites
seat_capacity = 'Seating Capacity varies from 10-50%, \n    depending on Alert Level'

# Maps restaurant name -> dict of the scraped details for that restaurant.
resto_main_dict = {}
for resto_link in resto_links_lst :
    resto_details_dict = {}
    resto_html = requests.get(resto_link, headers = headers, timeout = request_timeout)
    resto_soup = BeautifulSoup(resto_html.text, 'lxml')

    #scrape restaurant name and add to dictionary
    resto_name = resto_soup.find('h1').get_text().split('/')[0]
    resto_details_dict['restaurant_name'] = resto_name

    #scrape restaurant location and add to dictionary
    loc_tag = resto_soup.find('p', {'class' : 'sc-1hez2tp-0 clKRrC'} )
    resto_loc = loc_tag.get_text().strip()
    resto_details_dict['restaurant_location'] = resto_loc

    #scrape restaurant specialty and add to dictionary
    resto_specialty = resto_soup.find('a', {'class' : 'sc-ibxdXY cgmKgO'}).get_text()
    resto_details_dict['resto_specialty'] = resto_specialty

    #scrape restaurant price range, seats and other info(restaurant is known for)
    info_tag = resto_soup.find_all('p', {'color' : '#4F4F4F'}, limit = 4)

    # Reset for every restaurant: without this, a page with no matching tag
    # would reuse the previous restaurant's values (or raise NameError on
    # the first iteration).
    resto_price = None
    resto_other_info = None
    for info in info_tag :
        info_text = info.get_text()
        if info_text.count('PHP') >= 1 :
            # the price line is the one mentioning 'PHP'; stop at the first hit
            resto_price = info_text
            break
        else:
            # last non-price text seen before the price line
            resto_other_info = info_text

    resto_details_dict['restaurant_price_range'] = resto_price
    resto_details_dict['seat_capacity'] = seat_capacity
    resto_details_dict['restaurant_other_info'] = resto_other_info
    time.sleep(1)

    #create soup object for different url to scrape review and add to dictionary
    review_html = requests.get(resto_link + '/reviews', headers = headers, timeout = request_timeout)
    review_soup = BeautifulSoup(review_html.text, 'lxml')

    # To (re)discover the value for review_class, temporarily uncomment the 3 lines below
    #review_tags = review_soup.find_all('p')
    #for tag in review_tags :
        #print(tag)

    review_class = {'class' : 'sc-1hez2tp-0' }
    review_tags = review_soup.find_all('p', review_class)

    # Default covers both an empty review and a page with fewer than two
    # matching tags, so the previous restaurant's review is never reused.
    resto_review = 'no review text'
    for tag in review_tags [-2:-1] : # this is the location of review text
        review_text = tag.get_text()
        if len(review_text) > 0 :
            resto_review = review_text
    resto_details_dict['restaurant_reviews'] = resto_review
    resto_main_dict[resto_name] = resto_details_dict
    # remaining reviews to be extracted

    # pause between restaurants to avoid overloading the site
    time.sleep(1)

print('Scraping Complete!')

Scraping Complete!


## Dictionary to Pandas dataframe and clean data before saving to csv

In [60]:
# Build one row per restaurant: the outer dictionary keys become the index
# and the keys of each inner dictionary become the columns.
df = pd.DataFrame.from_dict(data = resto_main_dict, orient = 'index')
row_count = df.shape[0]
print('Count of Rows: ', row_count)
df.head()

Count of Rows:  27


Unnamed: 0,restaurant_name,restaurant_location,resto_specialty,restaurant_price_range,seat_capacity,restaurant_other_info,restaurant_reviews
Leann's Tea House,Leann's Tea House,105-R Mother Ignacia Corner Sct. Madrinan Stre...,Korean,PHP700 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Courteous Staff, Friendly Staffs, Ample Seatin...",This is probably one of the best K-bbq places ...
Royal Indian Curry House,Royal Indian Curry House,"5345 General Luna Street, Poblacion, Makati City",Indian,PHP2000 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Awesome Food, Authenticity, Themed Decor, Gast...",Best in the Philippines 5 star for RICH
Sensei,Sensei,"181 Aguirre Avenue, BF Homes, Parañaque City",Japanese,PHP800 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Fresh, Gastronomical Experience, Relaxed Atmos...",no review text
La Cabrera,La Cabrera,"Ayala Business Center, 6750 Ayala Avenue, Glor...",Argentine,PHP2300 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Comfortable Seating Area, Great Service, Manag...",Di available ang 15 day aged rib eye that day ...
Earth Kitchen,Earth Kitchen,"Lot 10 B-10 Katipunan Avenue, White Plains, Qu...",Italian,PHP1200 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Healthy Food, Good Food, Servings, Ambience, E...",Pili nut was an unexpected but perfect pair wi...


In [61]:
# Keep only the last comma-separated part of the address.
# The .str accessor passes missing values through instead of raising on them.
df['restaurant_location'] = df['restaurant_location'].str.split(',').str[-1].str.strip()

# Normalise the two values that still carry a trailing number.
# The result is assigned back to the column directly: a chained
# `df.col.loc[mask] = value` may write to a temporary copy and leave `df`
# unchanged (SettingWithCopyWarning / pandas Copy-on-Write).
df['restaurant_location'] = df['restaurant_location'].replace({
    'Quezon City 1100' : 'Quezon City',
    'Quezon City 1101' : 'Quezon City',
})
df

Unnamed: 0,restaurant_name,restaurant_location,resto_specialty,restaurant_price_range,seat_capacity,restaurant_other_info,restaurant_reviews
Leann's Tea House,Leann's Tea House,Quezon City,Korean,PHP700 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Courteous Staff, Friendly Staffs, Ample Seatin...",This is probably one of the best K-bbq places ...
Royal Indian Curry House,Royal Indian Curry House,Makati City,Indian,PHP2000 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Awesome Food, Authenticity, Themed Decor, Gast...",Best in the Philippines 5 star for RICH
Sensei,Sensei,Parañaque City,Japanese,PHP800 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Fresh, Gastronomical Experience, Relaxed Atmos...",no review text
La Cabrera,La Cabrera,Makati City,Argentine,PHP2300 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Comfortable Seating Area, Great Service, Manag...",Di available ang 15 day aged rib eye that day ...
Earth Kitchen,Earth Kitchen,Quezon City,Italian,PHP1200 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Healthy Food, Good Food, Servings, Ambience, E...",Pili nut was an unexpected but perfect pair wi...
Silantro Fil-Mex,Silantro Fil-Mex,Quezon City,Filipino,PHP800 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Good for Large Groups, Reasonable Prices, Main...",no review text
Teppanya,Teppanya,Las Piñas City,Japanese,PHP1200 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Family Crowd, Gastronomical Experience, Chef, ...",After almost 7 months of no-dining in at resta...
Samba - Shangri-La at the Fort,Samba - Shangri-La at the Fort,Taguig City,Latin American,PHP2500 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Good for Large Groups, Relaxed Atmosphere, Che...",This casual dining restaurant is family friend...
Antonio's,Antonio's,Tagaytay City,French,PHP4500 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Romantic Setting, Clean Bathrooms, Sophisticat...",Everything went well when I went earlier. The ...
Ombu Kusina,Ombu Kusina,Quezon City,Filipino,PHP800 for two people (approx.) Without alcohol,"Seating Capacity varies from 10-50%, \n dep...","Food and Service, Good for Large Groups, Theme...","Great food, great service and overall a great ..."


In [62]:
# Save the cleaned data as CSV; index=False leaves the DataFrame index out of the file.
output_path = 'Restaurant.csv'
df.to_csv(output_path, index=False)