In [1]:
import os
import re
import time
import string
import pandas as pd
import numpy as np

# web scraping
import requests
import json
from bs4 import BeautifulSoup
from lxml import etree

# connect to MongoDB
import dns
import pymongo
from pymongo import MongoClient

In [2]:
# UA伪装
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}

Yelp link: https://www.yelp.com/search?cflt=restaurants&find_loc=Fort%20Lee%2C%20NJ&start=0

### General Scraping - List of Restaurants
直接从给定地点和filter(restaurant)开始爬。目前只爬了二十个存进mongodb。

=> Further step: 只从yelp主页，餐厅和地址变成filter，合成更general的url。

#### Page 1

In [78]:
# url
main_page_url = 'https://www.yelp.com/search?cflt=restaurants&find_loc=Fort%20Lee%2C%20NJ'

# get page
response = requests.get(url = main_page_url, headers = header)
page_text = response.text

# 解析
tree = etree.HTML(page_text)

# 提取li标签
li_list = tree.xpath('//ul/li')

In [79]:
names = []
urls = []
ratings = []
cuisines = []
num_reviews = []
prices = []
for li in li_list:
    # stores information about every single restaurant
    div_list = li.xpath('./div/div/div/div[2]')
    
    # 逐个解析
    for div in div_list:
        name = div.xpath('./div[1]/div/div[1]/div/div[1]/div/div/h4/span/a/text()')
        url = div.xpath('./div[1]/div/div[1]/div/div[1]/div/div/h4/span/a/@href')
        rating = div.xpath('./div[1]/div/div[1]/div/div[2]/div/div/div[1]/span/div/@aria-label')
        num_review = div.xpath('./div[1]/div/div[1]/div/div[2]/div/div/div[2]/span/text()')
        # 这里有点特殊，这个div标签下有price和cuisine，但有些餐厅有price，有些没有，所以分开提取，没有price的餐厅存为空  
        cuisine_div = div.xpath('./div[1]/div/div[1]/div/div[3]/div/div')
        for d in cuisine_div:
            price = d.xpath('./span/span/text()')
            cuisine_span = d.xpath('./span')
            c = []
            for s in cuisine_span:
                cuisine = s.xpath('./span/span/a/text()')
                if cuisine:
                    c.append(cuisine)
       
        names.append(name)
        urls.append(url)
        ratings.append(rating)
        num_reviews.append(num_review)
        cuisines.append(c[0])
        prices.append(price)

In [80]:
info = []
dict_ = {}
for index, item in enumerate(names):
    if item:
        try:
            dict_ = {
                "Name": item[0],
                "Url": 'https://www.yelp.com' + urls[index][0],
                "Cuisine": cuisines[index],
                "Price": prices[index],
                "Rating": ratings[index][0],
                "Review_number": int(num_reviews[index][0])
            }
        except:
            dict_ = {
                "Name": item[0],
                "Url": 'https://www.yelp.com' + urls[index][0],
                "Cuisine": cuisines[index],
                "Price": prices[index],
                "Rating": ratings[index],
                "Review_number": num_reviews[index] 
            }
        info.append(dict_)

# 前两个和最后一个结果都是sponsor result，去掉
info = info[2:-1]

In [81]:
info

[{'Name': 'Gopchang Story Fort Lee',
  'Url': 'https://www.yelp.com/biz/gopchang-story-fort-lee-fort-lee',
  'Cuisine': ['Korean', 'Barbeque'],
  'Price': [],
  'Rating': '4.5 star rating',
  'Review_number': 24},
 {'Name': 'Soup Dumpling Plus',
  'Url': 'https://www.yelp.com/biz/soup-dumpling-plus-fort-lee',
  'Cuisine': ['Shanghainese'],
  'Price': ['$$'],
  'Rating': '4.5 star rating',
  'Review_number': 933},
 {'Name': 'Soba Noodle Azuma',
  'Url': 'https://www.yelp.com/biz/soba-noodle-azuma-fort-lee-2',
  'Cuisine': ['Japanese', 'Noodles'],
  'Price': ['$$'],
  'Rating': '4.5 star rating',
  'Review_number': 311},
 {'Name': 'Sa Rit Gol',
  'Url': 'https://www.yelp.com/biz/sa-rit-gol-fort-lee-2',
  'Cuisine': ['Korean'],
  'Price': ['$$'],
  'Rating': '4.5 star rating',
  'Review_number': 174},
 {'Name': 'Lauren’s Chicken Burger',
  'Url': 'https://www.yelp.com/biz/lauren-s-chicken-burger-fort-lee',
  'Cuisine': ['Chicken Wings', 'Burgers', 'Chicken Shop'],
  'Price': [],
  'Rating

#### Connection to MongoDB
Tutorial: https://www.youtube.com/watch?v=rE_bJl2GAY8&ab_channel=TechWithTim

Use the following information to complete this form, but do not click "Connect" yet.
- Hostname: cluster0-shard-00-00-jxeqq.mongodb.net
- Username: m001-student
- Password: m001-mongodb-basics
- Replica Set Name: Cluster0-shard-0
- Read Preference: Primary Preferred

In [13]:
# my connection string into your application code
connect = 'mongodb+srv://m001-student:m001-mongodb-basics@sandbox.jqgjp.mongodb.net/restaurant_info?retryWrites=true&w=majority'
cluster = MongoClient(connect)
db = cluster['restaurant_info']    # to database
collection = db['test']             # to collection

In [14]:
# insert one document
post1 = info[0]
collection.insert_one(post1)

<pymongo.results.InsertOneResult at 0x11cb5fc80>

In [16]:
# insert multiple documents: insert a list of dictionarys (如果有重复的会报错)
collection.insert_many(info[1:])

<pymongo.results.InsertManyResult at 0x11088f780>

In [28]:
# find("attribute": "value"): returns a mongo cursor object, need to loop through
results = collection.find({"name": "Gopchang Story Fort Lee"})   
print("Returns a cursor:", results)

print("\nReturns all results:")
for res in results:
    print(res)

print("\nReturns particular attribute:")
for res in results:
    print(res['_id'])    # 怎么没有返回结果？

Returns a cursor: <pymongo.cursor.Cursor object at 0x11cb99400>

Returns all results:
{'_id': ObjectId('601718c03d1c2c00c6fd5725'), 'name': 'Gopchang Story Fort Lee', 'img': 'https://s3-media0.fl.yelpcdn.com/bphoto/T7Nk0vrA3j5bD-utYpd9kw/ls.jpg', 'rating': '4.5 star rating', 'cuisine': ['Korean', 'Barbeque']}

Returns particular attribute:


In [29]:
# find one result, no need to loop through
results = collection.find_one({"name": "Gamja Tang Tang"}) 
print(results)

{'_id': ObjectId('601718c03d1c2c00c6fd572c'), 'name': 'Gamja Tang Tang', 'img': 'https://s3-media0.fl.yelpcdn.com/bphoto/5QtlZRff4zoD90TR-8JCog/ls.jpg', 'rating': '4 star rating', 'cuisine': ['Korean']}


In [91]:
# # delete
# results = collection.delete_one({"name": 'Curry Up Now'})
results = collection.delete_many({})          # delete everthing

Update operators: https://docs.mongodb.com/manual/reference/operator/update/

In [30]:
# update_one({search for the one you want to update}, {update operators: {"attribute": "updated values"}})
# results = collection.update_on({"name": "Cyndia’s"}, {"$set": {"name": "Cyndia's"}})      #  update original field
# resulte = collection.update_one({"name": "Cyndia’s"}, {"$set": {"description": "Cyndia's"}})   # create a new field

In [31]:
# count the amount of documents that meet certain criteria
post_count = collection.count_documents({"rating": '4 star rating'})
print(post_count)

5


#### Scape multiple pages
Information includes:
- Name of restaurant
- Image
- Rating
- Cuisine type
- Number of reviews
- Service

In [53]:
def get_max_page(url):
    """
    Total number of pages of searching results
    """
    response = requests.get(url = url, headers = header)
    page_text = response.text
    tree = etree.HTML(page_text)
    page_range = tree.xpath('//ul/li[21]/div/div[2]/span/text()')
    max_page = page_range[0].split(' ')[-1]
    return max_page

In [82]:
def scrap_restaurant_info(url_format, num_page):
    """
    Scrap basic information about restaurants
    """
    url = url_format.format((num_page-1)*10)
    
    # scrap page info
    response = requests.get(url = url, headers = header)
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul/li')
    
    info = []     # stores the basic information from main page
    names = []
    urls = []
    ratings = []
    cuisines = []
    num_reviews = []
    prices = []
    for li in li_list:
        div_list = li.xpath('./div/div/div/div[2]')   
        for div in div_list:
            name = div.xpath('./div[1]/div/div[1]/div/div[1]/div/div/h4/span/a/text()')
            url = div.xpath('./div[1]/div/div[1]/div/div[1]/div/div/h4/span/a/@href')
            rating = div.xpath('./div[1]/div/div[1]/div/div[2]/div/div/div[1]/span/div/@aria-label')
            num_review = div.xpath('./div[1]/div/div[1]/div/div[2]/div/div/div[2]/span/text()')        
            cuisine_div = div.xpath('./div[1]/div/div[1]/div/div[3]/div/div')
            c = []
            for d in cuisine_div:
                price = d.xpath('./span/span/text()')
                cuisine_span = d.xpath('./span')
                for s in cuisine_span:
                    cuisine = s.xpath('./span/span/a/text()')
                    if cuisine:
                        c.append(cuisine)
       
            names.append(name)
            urls.append(url)
            ratings.append(rating)
            num_reviews.append(num_review)
            cuisines.append(c)
            prices.append(price)
    
    dict_ = {}
    for index, item in enumerate(names):
        if item:
            try:
                dict_ = {
                    "Name": item[0],
                    "Url": 'https://www.yelp.com' + urls[index][0],
                    "Cuisine": cuisines[index][0],
                    "Price": prices[index],
                    "Rating": ratings[index][0],
                    "Review_number": int(num_reviews[index][0])  
                }
            except:
                dict_ = {
                    "Name": item[0],
                    "Url": 'https://www.yelp.com' + urls[index][0],
                    "Cuisine": cuisines[index],
                    "Price": prices[index],
                    "Rating": ratings[index],
                    "Review_number": num_reviews[index]
                }
            info.append(dict_)

    info = info[2:-1]
    return info

In [83]:
# test example: scrape page 1 & 2
url_format = "https://www.yelp.com/search?cflt=restaurants&find_loc=Fort+Lee%2C+NJ&start={}"
restaurant_info = []
for page in range(1, 2+1):
    page_info = scrap_restaurant_info(url_format, page)
    restaurant_info.append(page_info)
    time.sleep(30)

restaurant_info  # Success!

[[{'Name': 'Gopchang Story Fort Lee',
   'Url': 'https://www.yelp.com/biz/gopchang-story-fort-lee-fort-lee',
   'Cuisine': ['Korean', 'Barbeque'],
   'Price': [],
   'Rating': '4.5 star rating',
   'Review_number': 24},
  {'Name': 'Soup Dumpling Plus',
   'Url': 'https://www.yelp.com/biz/soup-dumpling-plus-fort-lee',
   'Cuisine': ['Shanghainese'],
   'Price': ['$$'],
   'Rating': '4.5 star rating',
   'Review_number': 933},
  {'Name': 'Soba Noodle Azuma',
   'Url': 'https://www.yelp.com/biz/soba-noodle-azuma-fort-lee-2',
   'Cuisine': ['Japanese', 'Noodles'],
   'Price': ['$$'],
   'Rating': '4.5 star rating',
   'Review_number': 311},
  {'Name': 'Sa Rit Gol',
   'Url': 'https://www.yelp.com/biz/sa-rit-gol-fort-lee-2',
   'Cuisine': ['Korean'],
   'Price': ['$$'],
   'Rating': '4.5 star rating',
   'Review_number': 174},
  {'Name': 'Lauren’s Chicken Burger',
   'Url': 'https://www.yelp.com/biz/lauren-s-chicken-burger-fort-lee',
   'Cuisine': ['Chicken Wings', 'Burgers', 'Chicken Shop'

In [84]:
len(restaurant_info)

2

In [54]:
total_num_pages = int(get_max_page('https://www.yelp.com/search?cflt=restaurants&find_loc=Fort%20Lee%2C%20NJ&start=0'))
total_num_pages

24

In [55]:
# url_format = "https://www.yelp.com/search?cflt=restaurants&find_loc=Fort+Lee%2C+NJ&start={}"
# restaurant_info = []
# for page in range(1, total_num_pages+1):
#     page_info = scrap_restaurant_info(url_format, page)
#     restaurant_info.append(page_info)
#     time.sleep(30)

UnboundLocalError: local variable 'c' referenced before assignment

In [85]:
# connection to mongodb
connect = 'mongodb+srv://m001-student:m001-mongodb-basics@sandbox.jqgjp.mongodb.net/restaurant_info?retryWrites=true&w=majority'
cluster = MongoClient(connect)
db = cluster['restaurant_info']    # to database
collection = db['test']             # to collection

results = collection.delete_many({})   # clear database (如果输入记录有重复会报错)

In [86]:
# insert data into mongodb database
for info in restaurant_info:
    collection.insert_many(info)