# yelp data ingestion

In [28]:
import pprint
import boto3
import json
import sys
import os

# shared pretty-printer (3-space indent) used to display example records
pp = pprint.PrettyPrinter(indent=3)
print("imported modules.")

imported modules.


## connect to s3 & list files

In [2]:
# s3 client; read_json (defined further down) uses this same object
client = boto3.client('s3')

# NOTE: a single list_objects call returns at most 1000 keys; paginate if the
# bucket ever grows beyond that
bucket_meta = client.list_objects(Bucket = 'yelp-academic-dataset-123')
print('files in s3 bucket:')
print('')
# s3 leaves the 'Contents' key out of the response when nothing is listed, so
# fall back to an empty list instead of raising KeyError
for c in bucket_meta.get('Contents', []):
    print(c['Key'])

files in s3 bucket:

yelp_academic_dataset_business.json
yelp_academic_dataset_checkin.json
yelp_academic_dataset_tip.json


## function to read json from s3

In [25]:
def read_json(filename, bucket = 'yelp-academic-dataset-123', s3_client = None):
    """
    reads a yelp .json file from an s3 bucket.

    the file is expected to contain json values separated only by whitespace
    (for example one object per line) rather than a single json array, so the
    values are decoded one after another. this tolerates '\\r\\n' line endings,
    blank lines and trailing whitespace between records, and avoids building
    extra full-size copies of the file text.

    keyword arguments:
    filename - key of the file inside the bucket (str)
    bucket - name of the s3 bucket (str)
    s3_client - object exposing a boto3-style get_object(Bucket=..., Key=...);
                when None the notebook-level `client` is used

    returns: records (list) - decoded json values in file order

    raises: json.JSONDecodeError when a record is not valid json
    """
    if s3_client is None:
        s3_client = client

    response = s3_client.get_object(Bucket = bucket, Key = filename)
    file_content = response['Body'].read().decode('utf-8')

    decoder = json.JSONDecoder()
    records = []
    position = 0
    end = len(file_content)
    while True:
        # raw_decode does not skip leading whitespace, so step over the
        # separators between records ourselves
        while position < end and file_content[position] in ' \t\r\n':
            position += 1
        if position == end:
            break
        # raw_decode returns the value and the index just past it
        record, position = decoder.raw_decode(file_content, position)
        records.append(record)
    return records

## business

In [26]:
# pull the business file from s3 and decode it into a python list
business = read_json(filename = 'yelp_academic_dataset_business.json')
print("read in json file.")

read in json file.


In [31]:
# sys.getsizeof returns bytes and measures only the list object itself (not the
# records it points to), so bytes / 1e6 is the list's own size in megabytes
print('json file has {} records with size of {} mb.'.format(len(business), sys.getsizeof(business) / 1e6))
print('here is an example record:')
print('')
pp.pprint(business[0])

json file has 160585 records with size of 1.443576 gb.
here is an example record:

{  'address': '921 Pearl St',
   'attributes': {  'Alcohol': "'beer_and_wine'",
                    'Ambience': "{'touristy': False, 'hipster': False, "
                                "'romantic': False, 'divey': False, "
                                "'intimate': False, 'trendy': False, "
                                "'upscale': False, 'classy': False, 'casual': "
                                'True}',
                    'BikeParking': 'True',
                    'BusinessAcceptsBitcoin': 'False',
                    'BusinessAcceptsCreditCards': 'True',
                    'BusinessParking': "{'garage': False, 'street': True, "
                                       "'validated': False, 'lot': False, "
                                       "'valet': False}",
                    'Caters': 'True',
                    'DogsAllowed': 'False',
                    'GoodForMeal': "{'dessert': False,

## checkin

In [32]:
# pull the checkin file from s3 and decode it into a python list
checkin = read_json(filename = 'yelp_academic_dataset_checkin.json')
print("read in json file.")

read in json file.


In [33]:
# sys.getsizeof returns bytes and measures only the list object itself (not the
# records it points to), so bytes / 1e6 is the list's own size in megabytes
print('json file has {} records with size of {} mb.'.format(len(checkin), sys.getsizeof(checkin) / 1e6))
print('here is an example record:')
print('')
pp.pprint(checkin[0])

json file has 138876 records with size of 1.140568 gb.
here is an example record:

{'business_id': '--0r8K_AQ4FZfLsX3ZYRDA', 'date': '2017-09-03 17:13:59'}


## tip

In [34]:
# pull the tip file from s3 and decode it into a python list
tip = read_json(filename = 'yelp_academic_dataset_tip.json')
print("read in json file.")

read in json file.


In [35]:
# sys.getsizeof returns bytes and measures only the list object itself (not the
# records it points to), so bytes / 1e6 is the list's own size in megabytes
print('json file has {} records with size of {} mb.'.format(len(tip), sys.getsizeof(tip) / 1e6))
print('here is an example record:')
print('')
pp.pprint(tip[0])

json file has 1162119 records with size of 9.504856 gb.
here is an example record:

{  'business_id': 'ENwBByjpoa5Gg7tKgxqwLg',
   'compliment_count': 0,
   'date': '2011-07-22 19:07:35',
   'text': 'Carne asada chips...',
   'user_id': 'WCjg0jdHXMlwbqS9tZUx8Q'}


## user

In [None]:
# pull the user file from s3 and decode it into a python list
# NOTE(review): this key is not among the files printed by the bucket listing
# earlier in this notebook - confirm it has been uploaded before running
user = read_json(filename = 'yelp_academic_dataset_user.json')
print("read in json file.")

In [None]:
# sys.getsizeof returns bytes and measures only the list object itself (not the
# records it points to), so bytes / 1e6 is the list's own size in megabytes
print('json file has {} records with size of {} mb.'.format(len(user), sys.getsizeof(user) / 1e6))
print('here is an example record:')
print('')
pp.pprint(user[0])

## review

In [None]:
# pull the review file from s3 and decode it into a python list
# NOTE(review): this key is not among the files printed by the bucket listing
# earlier in this notebook - confirm it has been uploaded before running
review = read_json(filename = 'yelp_academic_dataset_review.json')
print("read in json file.")

In [None]:
# sys.getsizeof returns bytes and measures only the list object itself (not the
# records it points to), so bytes / 1e6 is the list's own size in megabytes
print('json file has {} records with size of {} mb.'.format(len(review), sys.getsizeof(review) / 1e6))
print('here is an example record:')
print('')
pp.pprint(review[0])