In [1]:
# Setup Libraries
import os
import re
import time
import string
import pandas as pd
import numpy as np
from itertools import product

import requests
import json
from bs4 import BeautifulSoup
from lxml import etree

# connect to MongoDB
import dns
import pymongo
from pymongo import MongoClient


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


### Set Credentials

In [2]:
# Credentials: endpoint URL, API key, request headers.
# The API key is read from the YELP_API_KEY environment variable so that the
# secret is never stored in the notebook. Set it before starting the kernel,
# e.g. `export YELP_API_KEY=<your key>`.
# NOTE(review): any key that was ever committed in this cell should be revoked.
url = 'https://api.yelp.com/v3/businesses/search'
key = os.environ.get('YELP_API_KEY', '')
if not key:
    # Warn instead of failing here; the request cell is where a missing key
    # actually matters.
    print('WARNING: YELP_API_KEY is not set; requests to the Yelp API will not be authorized.')
headers = {
    'Authorization': 'Bearer %s' % key
}

### Extract New York Restaurants

In [3]:
location = ['New York, NY']
# Using the offset and limit parameters, you can get up to 1000 businesses
# from this endpoint if there are more than 1000 results, so the results are
# requested page by page: offsets 0, 50, ..., 950 with 50 businesses per page.
page_size = 50
offset = np.arange(0, 1000, page_size)
tuples = list(product(location, offset))

# One entry per page; each entry is the list of business records of that page.
detail_info = []
for loc, step in tuples:
    search_parameters = {
        'location': loc,
        'term': 'restaurants',
        'limit': page_size,
        # 'radius': 2500,
        'offset': step
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, headers=headers, params=search_parameters, timeout=30)
    # Fail with a clear HTTP error (bad key, rate limit, ...) instead of a
    # KeyError on 'businesses' further down.
    resp.raise_for_status()
    raw_data = resp.json()
    detail_info.append(raw_data['businesses'])

In [4]:
def organize_info(info):
    """Flatten a sequence of pages into one list of records.

    `info` is an iterable whose elements are themselves iterables; the
    elements of every inner iterable are returned, in order, in a single
    flat list.
    """
    return [record for page in info for record in page]

# Flatten the per-page lists into one list of business records.
detail_data = organize_info(detail_info)

In [5]:
# Sanity check: total number of business records collected.
len(detail_data)

1000

In [6]:
# Peek at the first five raw business records.
detail_data[:5]

[{'id': 'fVbUVAiLiGgLA_nxBFxyww',
  'alias': 'thursday-kitchen-new-york',
  'name': 'Thursday Kitchen',
  'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/nXO8M-d-XTamNmc0BpXWtA/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/thursday-kitchen-new-york?adjust_creative=b1l9PtKsY5nsB8qEsNcf0Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=b1l9PtKsY5nsB8qEsNcf0Q',
  'review_count': 1379,
  'categories': [{'alias': 'korean', 'title': 'Korean'},
   {'alias': 'newamerican', 'title': 'American (New)'},
   {'alias': 'tapasmallplates', 'title': 'Tapas/Small Plates'}],
  'rating': 4.5,
  'coordinates': {'latitude': 40.7275, 'longitude': -73.9838},
  'transactions': ['delivery', 'pickup'],
  'price': '$$',
  'location': {'address1': '424 E 9th St',
   'address2': None,
   'address3': '',
   'city': 'New York',
   'zip_code': '10009',
   'country': 'US',
   'state': 'NY',
   'display_address': ['424 E 9th St', 'New York, NY 10009']},
  'phone': '',
  'di

### Clean and Extract Info

The fields we'd like to extract are:

- `_id`
- `address` (joined from `display_address`)
- `name`
- `phone`
- `price`
- `rating`
- `city`
- `category`
- `review_count`
- `transactions`
- `url`

In [7]:
def extract_restaurant_info(item):
    """Build the cleaned record for one raw Yelp business dict.

    Keeps only the fields of interest, stores the Yelp business id under
    `_id`, reduces `categories` to a list of category titles and joins the
    `display_address` parts into a single string.

    Raises KeyError if any field other than `price` is missing.
    """
    return {
        "_id": item['id'],
        "name": item['name'],
        "phone": item['display_phone'],
        # Some restaurants have no price; fill it with an empty string.
        "price": item.get('price', ''),
        "rating": item['rating'],
        "url": item['url'],
        "city": item['location']['city'],
        "address": ' '.join(item['location']['display_address']),
        # cuisine categories, titles only
        "category": [c['title'] for c in item['categories']],
        "transactions": item['transactions'],
        "review_count": item['review_count']
    }


nyc_restaurants = [extract_restaurant_info(item) for item in detail_data]

In [8]:
# Sanity check: one cleaned record per raw business record.
len(nyc_restaurants)

1000

In [9]:
# Peek at the first five cleaned records.
nyc_restaurants[:5]

[{'_id': 'fVbUVAiLiGgLA_nxBFxyww',
  'name': 'Thursday Kitchen',
  'phone': '',
  'price': '$$',
  'rating': 4.5,
  'url': 'https://www.yelp.com/biz/thursday-kitchen-new-york?adjust_creative=b1l9PtKsY5nsB8qEsNcf0Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=b1l9PtKsY5nsB8qEsNcf0Q',
  'city': 'New York',
  'address': '424 E 9th St New York, NY 10009',
  'category': ['Korean', 'American (New)', 'Tapas/Small Plates'],
  'transactions': ['delivery', 'pickup'],
  'review_count': 1379},
 {'_id': 'ETgJqJHV7BW6pIr9Ox74sA',
  'name': 'Amélie',
  'phone': '(212) 533-2962',
  'price': '$$',
  'rating': 4.5,
  'url': 'https://www.yelp.com/biz/am%C3%A9lie-new-york?adjust_creative=b1l9PtKsY5nsB8qEsNcf0Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=b1l9PtKsY5nsB8qEsNcf0Q',
  'city': 'New York',
  'address': '22 W 8th St New York, NY 10011',
  'category': ['French', 'Wine Bars'],
  'transactions': ['delivery', 'pickup'],
  'review_count': 2719},
 {'_i

### Saving JSON file

In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
# with open('nyc_restaurants_info.json', 'w') as f:
#     json.dump(nyc_restaurants, f)

In [12]:
# nyc = pd.DataFrame(nyc_restaurants)
# nyc.head()

Unnamed: 0,_id,name,phone,price,rating,url,city,address,category,transactions,review_count
0,fVbUVAiLiGgLA_nxBFxyww,Thursday Kitchen,,$$,4.5,https://www.yelp.com/biz/thursday-kitchen-new-...,New York,"424 E 9th St New York, NY 10009","[Korean, American (New), Tapas/Small Plates]","[delivery, pickup]",1379
1,ETgJqJHV7BW6pIr9Ox74sA,Amélie,(212) 533-2962,$$,4.5,https://www.yelp.com/biz/am%C3%A9lie-new-york?...,New York,"22 W 8th St New York, NY 10011","[French, Wine Bars]","[delivery, pickup]",2719
2,J3NT61-AH5d5Gu5tFJhYSw,The Cabin NYC,(212) 777-0454,$$,4.0,https://www.yelp.com/biz/the-cabin-nyc-new-yor...,New York,"205 E 4th St New York, NY 10009","[American (New), Cocktail Bars, Breakfast & Br...","[restaurant_reservation, delivery, pickup]",269
3,q11TljTQd33XCWlVoPyRqg,The Osprey,(347) 696-2505,$$,4.0,https://www.yelp.com/biz/the-osprey-brooklyn?a...,Brooklyn,"60 Furman St Brooklyn, NY 11201",[American (New)],[delivery],230
4,hNFe8WhCibrqT4sFcZmAgw,Kong Sihk Tong 港食堂,(646) 850-6140,$,4.0,https://www.yelp.com/biz/kong-sihk-tong-%E6%B8...,New York,"65 Bayard St New York, NY 10013","[Chinese, Hong Kong Style Cafe]","[delivery, pickup]",300


### Connect to MongoDB

In [13]:
# The connection string is read from the MONGODB_URI environment variable so
# that the database user and password are not stored in the notebook.
connect = os.environ.get('MONGODB_URI')
if not connect:
    # Fail fast: MongoClient would otherwise be created without an explicit
    # target, and the cells below delete and insert documents.
    raise RuntimeError('MONGODB_URI is not set; export the MongoDB connection string before running this cell.')
cluster = MongoClient(connect)
db = cluster['yelp_dataset']               # to database
collection = db['restaurants']             # to collection

In [26]:
# Delete every document in the collection (the empty filter matches all);
# presumably so the inserts below start from an empty collection.
results = collection.delete_many({})

In [27]:
# insert data into mongodb database
# Each record carries the Yelp business id as `_id`, so a business that
# appears more than once in `nyc_restaurants` raises DuplicateKeyError on its
# later occurrences and only the first one is stored. Only that error is
# skipped; anything else (connection, authentication, invalid document) is
# allowed to surface instead of being silently ignored.
duplicates_skipped = 0
for info in nyc_restaurants:
    try:
        collection.insert_one(info)
    except pymongo.errors.DuplicateKeyError:
        duplicates_skipped += 1
print('Skipped %d duplicate businesses' % duplicates_skipped)

In [28]:
# The API returns some duplicate results; only the first returned result of
# each duplicated business is kept.
count = collection.count_documents({})
print(count)

994


Now save the MongoDB documents to the file `nyc_restaurants_updated.json`.