# yelp data ingestion

see example: https://github.com/polakowo/yelp-3nf

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window, Row

import pprint
import boto3
import json
import os

pp = pprint.PrettyPrinter(indent = 3)
print('imported modules.')

imported modules.


## connect to s3

In [3]:
client = boto3.client('s3')

bucket_meta = client.list_objects(Bucket = 'yelp-dataset-stevenhurwitt')
print('files in s3 bucket:')
print('')
for c in bucket_meta['Contents']:
    print(c['Key'])

files in s3 bucket:

raw/
raw/yelp_academic_dataset_business.json
raw/yelp_academic_dataset_checkin.json
raw/yelp_academic_dataset_review.json
raw/yelp_academic_dataset_tip.json
raw/yelp_academic_dataset_user.json


## function to read json from s3

In [4]:
def read_json(filename):
    """
    reads a yelp .json file from s3 bucket.

    keyword arguments:
    filename - name of file (str)

    returns: json_file (json)
    """

    bucket = "yelp-dataset-stevenhurwitt"
    print(f"bucket: {bucket}.")
    print(f"filename: {filename}.")
    
    response = client.get_object(Bucket = bucket, Key = filename)
    file_content = response['Body'].read().decode('utf-8')
    json_file = json.loads("[" + file_content.replace("}\n{", "},\n{") + "]")
    return(json_file)

In [5]:
my_file = read_json('raw/yelp_academic_dataset_business.json')
my_file

bucket: yelp-dataset-stevenhurwitt.
filename: raw/yelp_academic_dataset_business.json.


[{'business_id': 'Pns2l4eNsfO8kk83dixA6A',
  'name': 'Abby Rappoport, LAC, CMQ',
  'address': '1616 Chapala St, Ste 2',
  'city': 'Santa Barbara',
  'state': 'CA',
  'postal_code': '93101',
  'latitude': 34.4266787,
  'longitude': -119.7111968,
  'stars': 5.0,
  'review_count': 7,
  'is_open': 0,
  'attributes': {'ByAppointmentOnly': 'True'},
  'categories': 'Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists',
  'hours': None},
 {'business_id': 'mpf3x-BjTdTEA3yCZrAYPw',
  'name': 'The UPS Store',
  'address': '87 Grasso Plaza Shopping Center',
  'city': 'Affton',
  'state': 'MO',
  'postal_code': '63123',
  'latitude': 38.551126,
  'longitude': -90.335695,
  'stars': 3.0,
  'review_count': 15,
  'is_open': 1,
  'attributes': {'BusinessAcceptsCreditCards': 'True'},
  'categories': 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
  'hours': {'Monday': '0:0-0:0',
   'Tuesday': '8:0-18:30',
   'Wednes

In [8]:
# pp.pprint(my_file[0])

In [7]:
pp.pprint(my_file[3])

{  'address': '935 Race St',
   'attributes': {  'Alcohol': "u'none'",
                    'BikeParking': 'True',
                    'BusinessAcceptsCreditCards': 'False',
                    'BusinessParking': "{'garage': False, 'street': True, "
                                       "'validated': False, 'lot': False, "
                                       "'valet': False}",
                    'ByAppointmentOnly': 'False',
                    'Caters': 'True',
                    'OutdoorSeating': 'False',
                    'RestaurantsDelivery': 'False',
                    'RestaurantsPriceRange2': '1',
                    'RestaurantsTakeOut': 'True',
                    'WiFi': "u'free'"},
   'business_id': 'MTSW4McQd7CbVtyjqoe9mw',
   'categories': 'Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries',
   'city': 'Philadelphia',
   'hours': {  'Friday': '7:0-21:0',
               'Monday': '7:0-20:0',
               'Saturday': '7:0-21:0',
               'Sunday': '7:0-2

## create dynamodb yelp-checkin table

In [None]:
# yelp_business.delete()

NameError: name 'checkin_table' is not defined

In [None]:
dynamodb = boto3.resource('dynamodb', endpoint_url="https://us-east-2.console.aws.amazon.com?arn=arn:aws:dynamodb:us-east-2134132211607:8000")

In [None]:
yelp_business = dynamodb.create_table(
        TableName='yelp.business',
        KeySchema=[
            {
                'AttributeName': 'business_id',
                'KeyType': 'HASH'  # Partition key
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'name',
                'AttributeType': 'S'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 25,
            'WriteCapacityUnits': 20
        }
    )
print('created dynamo table.')

ResourceInUseException: An error occurred (ResourceInUseException) when calling the CreateTable operation: Cannot create preexisting table

## checkin

In [None]:
checkin = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_checkin.json')
print('read in json file.')

read in json file.


In [None]:
print('json file has {} records with size of {} mb.'.format(len(checkin), sys.getsizeof(checkin)/1000000))
print('here is an example record:')
print('')
pp.pprint(checkin[0])

json file has 138876 records with size of 1.140568 mb.
here is an example record:

{  'business_id': '--0zrn43LEaB4jUWTQH_Bg',
   'date': '2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010-12-23 22:55:45, '
           '2011-04-08 17:14:59, 2011-04-11 21:28:45, 2011-04-26 16:42:25, '
           '2011-05-20 19:30:57, 2011-05-24 20:02:21, 2011-08-29 19:01:31'}


## insert checkin items into dynamo table

In [None]:
# checkin_missing_index = []

# for i, c in enumerate(checkin):
#     try:
#         checkin_table.put_item(Item = c)
#     except:
#         print('item too large, index {}, skipping...'.format(i))
#         checkin_missing_index.append(i)

# print('loaded data into table.')

index 138400, skipping...
item too large, index 138401, skipping...
item too large, index 138402, skipping...
item too large, index 138403, skipping...
item too large, index 138404, skipping...
item too large, index 138405, skipping...
item too large, index 138406, skipping...
item too large, index 138407, skipping...
item too large, index 138408, skipping...
item too large, index 138409, skipping...
item too large, index 138410, skipping...
item too large, index 138411, skipping...
item too large, index 138412, skipping...
item too large, index 138413, skipping...
item too large, index 138414, skipping...
item too large, index 138415, skipping...
item too large, index 138416, skipping...
item too large, index 138417, skipping...
item too large, index 138418, skipping...
item too large, index 138419, skipping...
item too large, index 138420, skipping...
item too large, index 138421, skipping...
item too large, index 138422, skipping...
item too large, index 138423, skipping...
item too

In [None]:
# print(checkin_table.item_count)

NameError: name 'checkin_table' is not defined

## business

In [None]:
business = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_business.json')
print('read in json file.')

read in json file.


In [None]:
print('json file has {} records with size of {} mb.'.format(len(business), sys.getsizeof(business)/1000000))
print('here is an example record:')
print('')
pp.pprint(business[0])

json file has 160585 records with size of 1.443576 gb.
here is an example record:

{  'address': '921 Pearl St',
   'attributes': {  'Alcohol': "'beer_and_wine'",
                    'Ambience': "{'touristy': False, 'hipster': False, "
                                "'romantic': False, 'divey': False, "
                                "'intimate': False, 'trendy': False, "
                                "'upscale': False, 'classy': False, 'casual': "
                                'True}',
                    'BikeParking': 'True',
                    'BusinessAcceptsBitcoin': 'False',
                    'BusinessAcceptsCreditCards': 'True',
                    'BusinessParking': "{'garage': False, 'street': True, "
                                       "'validated': False, 'lot': False, "
                                       "'valet': False}",
                    'Caters': 'True',
                    'DogsAllowed': 'False',
                    'GoodForMeal': "{'dessert': False,

In [None]:
del business

In [None]:
checkin[0]['date']

str

In [None]:
# for i in range(1,len(checkin)):
    # mongo_upload(checkin[i])
    # pp.pprint(checkin[i])

# print('uploaded checkin collection.')

OperationFailure: Fully qualified namespace is too long. Namespace: yelp.{'business_id': '--0zrn43LEaB4jUWTQH_Bg', 'date': '2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010-12-23 22:55:45, 2011-04-08 17:14:59, 2011-04-11 21:28:45, 2011-04-26 16:42:25, 2011-05-20 19:30:57, 2011-05-24 20:02:21, 2011-08-29 19:01:31', '_id': ObjectId('60691a52c189385debb59268')} Max: 255, full error: {'ok': 0.0, 'errmsg': "Fully qualified namespace is too long. Namespace: yelp.{'business_id': '--0zrn43LEaB4jUWTQH_Bg', 'date': '2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010-12-23 22:55:45, 2011-04-08 17:14:59, 2011-04-11 21:28:45, 2011-04-26 16:42:25, 2011-05-20 19:30:57, 2011-05-24 20:02:21, 2011-08-29 19:01:31', '_id': ObjectId('60691a52c189385debb59268')} Max: 255", 'code': 4862100, 'codeName': 'Location4862100'}

In [None]:
# str(business[0]["business_id"])

OperationFailure: Fully qualified namespace is too long. Namespace: yelp.{'business_id': '--164t1nclzzmca7eDiJMw', 'date': '2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010-03-04 02:00:59, 2010-03-11 01:24:46, 2010-03-17 02:29:17, 2010-03-19 03:33:40, 2010-03-27 01:26:31, 2010-04-07 20:08:52, 2010-04-12 19:03:33, 2010-04-27 18:15:59, 2010-04-29 17:59:15, 2010-05-16 00:28:06, 2010-05-18 17:51:22, 2010-05-22 03:25:43, 2010-05-22 04:15:07, 2010-05-23 06:42:25, 2010-06-04 04:59:14, 2010-06-10 17:33:19, 2010-06-20 05:48:30, 2010-06-25 23:32:03, 2010-07-09 19:18:13, 2010-07-10 01:58:37, 2010-07-11 03:00:13, 2010-07-14 17:47:55, 2010-07-16 19:14:11, 2010-07-17 01:33:04, 2010-07-17 01:48:42, 2010-07-18 02:37:39, 2010-07-18 03:51:35, 2010-07-22 19:20:00, 2010-07-24 01:46:18, 2010-07-25 03:45:09, 2010-07-30 01:07:12, 2010-08-01 05:31:38, 2010-08-03 19:05:25, 2010-08-05 01:37:29, 2010-08-05 23:00:52, 2010-08-07 22:50:59, 2010-08-12 02:10:44, 2010-08-15 03:29:10, 2010-08-21 00:54:52, 2010-08-22 03:33:45, 2010-08-25 17:12:20, 2010-08-28 05:01:22, 2010-09-02 18:27:05, 2010-09-03 18:29:01, 2010-09-09 18:30:59, 2010-09-17 22:47:43, 2010-09-18 23:44:51, 2010-09-19 06:20:57, 2010-09-23 22:30:03, 2010-09-25 06:02:30, 2010-09-30 23:46:47, 2010-10-03 01:30:22, 2010-10-09 00:36:02, 2010-10-09 04:23:32, 2010-10-14 19:44:58, 2010-10-16 00:34:05, 2010-10-20 17:08:52, 2010-10-24 03:00:45, 2010-10-24 05:03:30, 2010-10-31 04:26:28, 2010-10-31 07:05:33, 2010-10-31 08:22:33, 2010-11-06 00:13:15, 2010-11-10 18:24:15, 2010-11-13 01:55:54, 2010-11-13 07:35:31, 2010-11-13 21:20:31, 2010-11-14 04:24:06, 2010-11-19 18:14:30, 2010-12-19 02:24:29, 2010-12-19 03:18:30, 2010-12-19 07:53:59, 2011-01-02 07:11:02, 2011-01-08 04:52:00, 2011-01-22 04:42:06, 2011-01-28 03:35:30, 2011-01-30 03:08:47, 2011-01-30 06:56:11, 2011-01-31 03:29:27, 2011-02-06 03:35:22, 2011-02-06 22:11:26, 2011-02-13 04:51:24, 2011-03-04 19:25:18, 2011-03-05 08:03:58, 2011-03-06 05:59:05, 2011-03-06 06:21:57, 2011-03-08 19:11:58, 2011-03-09 18:18:55, 2011-03-18 22:22:54, 2011-03-18 23:26:43, 2011-03-23 17:08:27, 2011-03-26 02:48:53, 2011-03-27 05:26:18, 2011-03-31 23:42:25, 2011-04-07 02:20:39, 2011-04-11 04:06:07, 2011-04-13 02:44:25, 2011-04-20 21:21:20, 2011-04-24 05:22:42, 2011-04-30 05:45:56, 2011-04-30 21:33:29, 2011-05-15 02:32:35, 2011-05-26 01:47:13, 2011-05-29 06:00:07, 2011-06-04 06:42:07, 2011-06-06 02:05:50, 2011-06-09 17:22:12, 2011-06-11 06:35:49, 2011-07-01 03:04:08, 2011-07-08 19:51:27, 2011-07-10 06:41:16, 2011-07-11 01:02:23, 2011-07-30 03:54:59, 2011-08-07 06:20:45, 2011-08-12 02:49:10, 2011-08-19 05:31:35, 2011-08-20 00:22:14, 2011-08-27 05:01:03, 2011-08-28 05:30:18, 2011-08-29 01:15:03, 2011-08-30 18:28:20, 2011-08-30 18:28:27, 2011-09-08 02:28:44, 2011-10-02 03:33:22, 2011-10-07 04:23:44, 2011-10-10 02:21:00, 2011-10-16 03:07:39, 2011-10-22 00:48:56, 2011-10-24 03:56:44, 2011-10-24 03:56:57, 2011-10-29 01:24:36, 2011-11-04 20:12:18, 2011-11-10 01:50:19, 2011-11-10 02:28:02, 2011-11-11 23:40:15, 2011-11-13 20:53:20, 2011-11-30 19:38:17, 2011-12-11 05:35:27, 2011-12-25 22:36:45, 2012-01-08 00:48:54, 2012-01-09 04:38:04, 2012-01-11 00:48:36, 2012-01-14 03:45:20, 2012-01-15 02:40:36, 2012-01-21 01:42:50, 2012-01-22 05:35:39, 2012-01-28 06:27:57, 2012-02-05 04:48:29, 2012-02-09 04:46:12, 2012-02-17 01:29:17, 2012-02-17 02:56:23, 2012-02-18 06:18:11, 2012-03-02 21:56:52, 2012-03-05 04:23:59, 2012-03-23 00:52:05, 2012-03-23 19:12:45, 2012-03-28 04:11:07, 2012-04-01 01:36:36, 2012-04-01 02:07:20, 2012-04-01 02:13:26, 2012-04-08 03:09:20, 2012-04-12 18:30:19, 2012-04-14 03:11:33, 2012-04-19 17:37:15, 2012-04-19 17:38:22, 2012-04-20 03:14:43, 2012-04-27 05:27:32, 2012-05-10 03:34:08, 2012-05-16 02:17:23, 2012-05-16 02:17:57, 2012-05-16 18:11:00, 2012-05-18 03:33:48, 2012-05-21 01:35:27, 2012-05-28 00:26:29, 2012-05-30 19:46:59, 2012-06-04 02:07:25, 2012-06-06 03:44:00, 2012-06-08 00:40:35, 2012-06-08 01:51:56, 2012-06-09 04:45:31, 2012-06-15 06:30:50, 2012-06-23 01:21:18, 2012-06-27 00:50:42, 2012-06-27 19:24:54, 2012-06-28 04:02:55, 2012-07-02 00:38:11, 2012-07-07 03:13:32, 2012-07-14 01:06:37, 2012-07-14 01:45:58, 2012-07-15 03:54:57, 2012-07-19 03:05:18, 2012-07-19 03:34:54, 2012-07-23 02:44:42, 2012-08-03 00:59:44, 2012-08-17 02:32:05, 2012-08-22 00:45:54, 2012-08-23 17:17:39, 2012-08-25 01:00:09, 2012-08-31 00:23:00, 2012-08-31 22:50:02, 2012-09-03 04:16:21, 2012-09-08 01:28:01, 2012-09-10 03:10:36, 2012-09-12 18:39:39, 2012-09-23 01:26:41, 2012-09-23 01:46:39, 2012-10-03 00:34:05, 2012-10-03 00:34:50, 2012-10-06 03:43:17, 2012-10-06 03:49:46, 2012-10-06 03:50:01, 2012-10-10 01:55:49, 2012-10-11 22:13:05, 2012-10-20 01:29:35, 2012-10-28 04:04:29, 2012-11-05 01:53:03, 2012-11-05 01:53:33, 2012-11-08 01:19:57, 2012-11-09 19:50:33, 2012-11-12 04:27:52, 2012-11-12 07:11:49, 2012-11-20 20:22:16, 2012-11-21 23:30:05, 2012-11-29 01:17:43, 2012-12-05 03:15:42, 2012-12-05 03:16:11, 2012-12-08 19:24:41, 2012-12-10 01:01:30, 2012-12-11 18:22:26, 2012-12-22 19:12:10, 2012-12-23 09:06:36, 2012-12-28 21:04:14, 2013-01-01 01:55:58, 2013-01-04 03:43:39, 2013-01-11 01:30:22, 2013-01-18 07:28:30, 2013-01-22 04:19:42, 2013-01-22 04:21:26, 2013-01-27 06:01:38, 2013-02-05 18:19:36, 2013-02-06 01:09:25, 2013-02-06 04:15:29, 2013-02-07 18:12:59, 2013-02-09 03:52:45, 2013-02-11 02:46:31, 2013-02-16 02:57:56, 2013-02-16 03:01:22, 2013-02-21 02:41:49, 2013-02-23 04:46:03, 2013-02-28 03:28:53, 2013-03-07 02:04:46, 2013-03-12 23:55:06, 2013-03-14 00:22:21, 2013-03-16 01:47:34, 2013-03-16 22:31:29, 2013-03-23 01:10:42, 2013-04-03 19:22:04, 2013-04-03 22:36:07, 2013-04-04 00:59:43, 2013-04-05 00:20:20, 2013-04-05 18:18:29, 2013-04-09 00:25:15, 2013-04-14 06:43:21, 2013-04-15 02:15:15, 2013-04-17 00:22:09, 2013-04-19 01:07:41, 2013-04-23 01:26:07, 2013-04-28 05:45:04, 2013-04-29 02:55:59, 2013-04-30 18:16:39, 2013-05-05 05:45:46, 2013-05-15 02:32:06, 2013-05-16 20:35:04, 2013-05-16 22:57:10, 2013-05-17 02:35:43, 2013-05-18 04:11:46, 2013-05-19 03:43:10, 2013-06-08 01:05:12, 2013-06-09 07:10:01, 2013-06-10 03:38:46, 2013-06-14 01:33:28, 2013-06-14 01:40:46, 2013-06-16 02:35:36, 2013-06-20 17:55:33, 2013-06-21 23:03:28, 2013-06-22 06:49:23, 2013-06-28 05:51:58, 2013-07-03 00:35:10, 2013-07-11 00:18:13, 2013-07-15 00:16:49, 2013-07-17 00:53:08, 2013-07-17 00:54:42, 2013-07-20 02:27:00, 2013-07-27 02:41:32, 2013-07-31 23:38:49, 2013-08-04 01:34:32, 2013-08-04 02:03:14, 2013-08-11 23:44:23, 2013-08-12 05:02:59, 2013-08-15 01:18:55, 2013-08-15 03:11:03, 2013-08-30 18:14:03, 2013-08-31 05:31:28, 2013-09-08 05:20:15, 2013-09-15 02:21:20, 2013-10-04 02:51:48, 2013-10-23 15:58:15, 2013-11-12 18:54:05, 2013-11-25 02:51:33', '_id': ObjectId('60691a52c189385debb59269')} Max: 255, full error: {'ok': 0.0, 'errmsg': "Fully qualified namespace is too long. Namespace: yelp.{'business_id': '--164t1nclzzmca7eDiJMw', 'date': '2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010-03-04 02:00:59, 2010-03-11 01:24:46, 2010-03-17 02:29:17, 2010-03-19 03:33:40, 2010-03-27 01:26:31, 2010-04-07 20:08:52, 2010-04-12 19:03:33, 2010-04-27 18:15:59, 2010-04-29 17:59:15, 2010-05-16 00:28:06, 2010-05-18 17:51:22, 2010-05-22 03:25:43, 2010-05-22 04:15:07, 2010-05-23 06:42:25, 2010-06-04 04:59:14, 2010-06-10 17:33:19, 2010-06-20 05:48:30, 2010-06-25 23:32:03, 2010-07-09 19:18:13, 2010-07-10 01:58:37, 2010-07-11 03:00:13, 2010-07-14 17:47:55, 2010-07-16 19:14:11, 2010-07-17 01:33:04, 2010-07-17 01:48:42, 2010-07-18 02:37:39, 2010-07-18 03:51:35, 2010-07-22 19:20:00, 2010-07-24 01:46:18, 2010-07-25 03:45:09, 2010-07-30 01:07:12, 2010-08-01 05:31:38, 2010-08-03 19:05:25, 2010-08-05 01:37:29, 2010-08-05 23:00:52, 2010-08-07 22:50:59, 2010-08-12 02:10:44, 2010-08-15 03:29:10, 2010-08-21 00:54:52, 2010-08-22 03:33:45, 2010-08-25 17:12:20, 2010-08-28 05:01:22, 2010-09-02 18:27:05, 2010-09-03 18:29:01, 2010-09-09 18:30:59, 2010-09-17 22:47:43, 2010-09-18 23:44:51, 2010-09-19 06:20:57, 2010-09-23 22:30:03, 2010-09-25 06:02:30, 2010-09-30 23:46:47, 2010-10-03 01:30:22, 2010-10-09 00:36:02, 2010-10-09 04:23:32, 2010-10-14 19:44:58, 2010-10-16 00:34:05, 2010-10-20 17:08:52, 2010-10-24 03:00:45, 2010-10-24 05:03:30, 2010-10-31 04:26:28, 2010-10-31 07:05:33, 2010-10-31 08:22:33, 2010-11-06 00:13:15, 2010-11-10 18:24:15, 2010-11-13 01:55:54, 2010-11-13 07:35:31, 2010-11-13 21:20:31, 2010-11-14 04:24:06, 2010-11-19 18:14:30, 2010-12-19 02:24:29, 2010-12-19 03:18:30, 2010-12-19 07:53:59, 2011-01-02 07:11:02, 2011-01-08 04:52:00, 2011-01-22 04:42:06, 2011-01-28 03:35:30, 2011-01-30 03:08:47, 2011-01-30 06:56:11, 2011-01-31 03:29:27, 2011-02-06 03:35:22, 2011-02-06 22:11:26, 2011-02-13 04:51:24, 2011-03-04 19:25:18, 2011-03-05 08:03:58, 2011-03-06 05:59:05, 2011-03-06 06:21:57, 2011-03-08 19:11:58, 2011-03-09 18:18:55, 2011-03-18 22:22:54, 2011-03-18 23:26:43, 2011-03-23 17:08:27, 2011-03-26 02:48:53, 2011-03-27 05:26:18, 2011-03-31 23:42:25, 2011-04-07 02:20:39, 2011-04-11 04:06:07, 2011-04-13 02:44:25, 2011-04-20 21:21:20, 2011-04-24 05:22:42, 2011-04-30 05:45:56, 2011-04-30 21:33:29, 2011-05-15 02:32:35, 2011-05-26 01:47:13, 2011-05-29 06:00:07, 2011-06-04 06:42:07, 2011-06-06 02:05:50, 2011-06-09 17:22:12, 2011-06-11 06:35:49, 2011-07-01 03:04:08, 2011-07-08 19:51:27, 2011-07-10 06:41:16, 2011-07-11 01:02:23, 2011-07-30 03:54:59, 2011-08-07 06:20:45, 2011-08-12 02:49:10, 2011-08-19 05:31:35, 2011-08-20 00:22:14, 2011-08-27 05:01:03, 2011-08-28 05:30:18, 2011-08-29 01:15:03, 2011-08-30 18:28:20, 2011-08-30 18:28:27, 2011-09-08 02:28:44, 2011-10-02 03:33:22, 2011-10-07 04:23:44, 2011-10-10 02:21:00, 2011-10-16 03:07:39, 2011-10-22 00:48:56, 2011-10-24 03:56:44, 2011-10-24 03:56:57, 2011-10-29 01:24:36, 2011-11-04 20:12:18, 2011-11-10 01:50:19, 2011-11-10 02:28:02, 2011-11-11 23:40:15, 2011-11-13 20:53:20, 2011-11-30 19:38:17, 2011-12-11 05:35:27, 2011-12-25 22:36:45, 2012-01-08 00:48:54, 2012-01-09 04:38:04, 2012-01-11 00:48:36, 2012-01-14 03:45:20, 2012-01-15 02:40:36, 2012-01-21 01:42:50, 2012-01-22 05:35:39, 2012-01-28 06:27:57, 2012-02-05 04:48:29, 2012-02-09 04:46:12, 2012-02-17 01:29:17, 2012-02-17 02:56:23, 2012-02-18 06:18:11, 2012-03-02 21:56:52, 2012-03-05 04:23:59, 2012-03-23 00:52:05, 2012-03-23 19:12:45, 2012-03-28 04:11:07, 2012-04-01 01:36:36, 2012-04-01 02:07:20, 2012-04-01 02:13:26, 2012-04-08 03:09:20, 2012-04-12 18:30:19, 2012-04-14 03:11:33, 2012-04-19 17:37:15, 2012-04-19 17:38:22, 2012-04-20 03:14:43, 2012-04-27 05:27:32, 2012-05-10 03:34:08, 2012-05-16 02:17:23, 2012-05-16 02:17:57, 2012-05-16 18:11:00, 2012-05-18 03:33:48, 2012-05-21 01:35:27, 2012-05-28 00:26:29, 2012-05-30 19:46:59, 2012-06-04 02:07:25, 2012-06-06 03:44:00, 2012-06-08 00:40:35, 2012-06-08 01:51:56, 2012-06-09 04:45:31, 2012-06-15 06:30:50, 2012-06-23 01:21:18, 2012-06-27 00:50:42, 2012-06-27 19:24:54, 2012-06-28 04:02:55, 2012-07-02 00:38:11, 2012-07-07 03:13:32, 2012-07-14 01:06:37, 2012-07-14 01:45:58, 2012-07-15 03:54:57, 2012-07-19 03:05:18, 2012-07-19 03:34:54, 2012-07-23 02:44:42, 2012-08-03 00:59:44, 2012-08-17 02:32:05, 2012-08-22 00:45:54, 2012-08-23 17:17:39, 2012-08-25 01:00:09, 2012-08-31 00:23:00, 2012-08-31 22:50:02, 2012-09-03 04:16:21, 2012-09-08 01:28:01, 2012-09-10 03:10:36, 2012-09-12 18:39:39, 2012-09-23 01:26:41, 2012-09-23 01:46:39, 2012-10-03 00:34:05, 2012-10-03 00:34:50, 2012-10-06 03:43:17, 2012-10-06 03:49:46, 2012-10-06 03:50:01, 2012-10-10 01:55:49, 2012-10-11 22:13:05, 2012-10-20 01:29:35, 2012-10-28 04:04:29, 2012-11-05 01:53:03, 2012-11-05 01:53:33, 2012-11-08 01:19:57, 2012-11-09 19:50:33, 2012-11-12 04:27:52, 2012-11-12 07:11:49, 2012-11-20 20:22:16, 2012-11-21 23:30:05, 2012-11-29 01:17:43, 2012-12-05 03:15:42, 2012-12-05 03:16:11, 2012-12-08 19:24:41, 2012-12-10 01:01:30, 2012-12-11 18:22:26, 2012-12-22 19:12:10, 2012-12-23 09:06:36, 2012-12-28 21:04:14, 2013-01-01 01:55:58, 2013-01-04 03:43:39, 2013-01-11 01:30:22, 2013-01-18 07:28:30, 2013-01-22 04:19:42, 2013-01-22 04:21:26, 2013-01-27 06:01:38, 2013-02-05 18:19:36, 2013-02-06 01:09:25, 2013-02-06 04:15:29, 2013-02-07 18:12:59, 2013-02-09 03:52:45, 2013-02-11 02:46:31, 2013-02-16 02:57:56, 2013-02-16 03:01:22, 2013-02-21 02:41:49, 2013-02-23 04:46:03, 2013-02-28 03:28:53, 2013-03-07 02:04:46, 2013-03-12 23:55:06, 2013-03-14 00:22:21, 2013-03-16 01:47:34, 2013-03-16 22:31:29, 2013-03-23 01:10:42, 2013-04-03 19:22:04, 2013-04-03 22:36:07, 2013-04-04 00:59:43, 2013-04-05 00:20:20, 2013-04-05 18:18:29, 2013-04-09 00:25:15, 2013-04-14 06:43:21, 2013-04-15 02:15:15, 2013-04-17 00:22:09, 2013-04-19 01:07:41, 2013-04-23 01:26:07, 2013-04-28 05:45:04, 2013-04-29 02:55:59, 2013-04-30 18:16:39, 2013-05-05 05:45:46, 2013-05-15 02:32:06, 2013-05-16 20:35:04, 2013-05-16 22:57:10, 2013-05-17 02:35:43, 2013-05-18 04:11:46, 2013-05-19 03:43:10, 2013-06-08 01:05:12, 2013-06-09 07:10:01, 2013-06-10 03:38:46, 2013-06-14 01:33:28, 2013-06-14 01:40:46, 2013-06-16 02:35:36, 2013-06-20 17:55:33, 2013-06-21 23:03:28, 2013-06-22 06:49:23, 2013-06-28 05:51:58, 2013-07-03 00:35:10, 2013-07-11 00:18:13, 2013-07-15 00:16:49, 2013-07-17 00:53:08, 2013-07-17 00:54:42, 2013-07-20 02:27:00, 2013-07-27 02:41:32, 2013-07-31 23:38:49, 2013-08-04 01:34:32, 2013-08-04 02:03:14, 2013-08-11 23:44:23, 2013-08-12 05:02:59, 2013-08-15 01:18:55, 2013-08-15 03:11:03, 2013-08-30 18:14:03, 2013-08-31 05:31:28, 2013-09-08 05:20:15, 2013-09-15 02:21:20, 2013-10-04 02:51:48, 2013-10-23 15:58:15, 2013-11-12 18:54:05, 2013-11-25 02:51:33', '_id': ObjectId('60691a52c189385debb59269')} Max: 255", 'code': 4862100, 'codeName': 'Location4862100'}

In [None]:
str(business[0]["date"])

In [None]:
try:
    pp.pprint(business[0])

except KeyboardInterrupt:
    logger.info(KeyboardInterrupt)
    logger.info(business[1])

:26, '
              '2020-08-31 18:51:28, 2020-11-23 19:01:21, 2020-11-27 18:11:52, '
              '2020-12-28 20:36:41, 2020-12-28 20:37:52, 2021-01-01 16:51:04'},
   {  '_id': ObjectId('604f9773c4078d81432ffd50'),
      'business_id': 'G_QFvmWWmJRC-bsWC4bYqw',
      'date': '2010-07-21 23:13:40, 2010-08-03 20:16:48, 2011-01-21 23:33:30, '
              '2011-04-26 18:27:07, 2011-07-20 17:31:37, 2011-07-22 19:51:18, '
              '2012-03-25 01:08:27, 2012-04-14 18:34:58, 2013-01-01 14:53:20, '
              '2014-07-26 18:09:26, 2015-03-26 23:26:34, 2015-09-02 12:14:57, '
              '2015-09-03 22:02:55, 2015-09-15 13:06:04, 2015-11-23 12:47:22, '
              '2015-11-30 13:48:01, 2016-02-04 12:34:50, 2016-02-11 12:26:13, '
              '2016-02-17 14:16:28, 2016-02-29 14:30:04, 2016-03-01 12:33:33, '
              '2016-03-16 12:01:27, 2016-03-18 21:57:52, 2016-03-22 11:23:42, '
              '2017-01-09 13:24:04, 2017-01-14 14:24:44, 2017-02-07 12:00:09, '
              '

KeyboardInterrupt: 

In [None]:
del checkin

## tip

In [None]:
tip = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_tip.json')
logger.info('read in json file.')

read in json file.


In [None]:
logger.info('json file has {} records with size of {} mb.'.format(len(tip), sys.getsizeof(tip)/1000000))
logger.info('here is an example record:')
logger.info('')
logger.info(tip[0])

json file has 1162119 records with size of 9.504856 gb.
here is an example record:

{  'business_id': 'ENwBByjpoa5Gg7tKgxqwLg',
   'compliment_count': 0,
   'date': '2011-07-22 19:07:35',
   'text': 'Carne asada chips...',
   'user_id': 'WCjg0jdHXMlwbqS9tZUx8Q'}


In [None]:
del tip

## user

In [None]:
try:
    user = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_user.json')
    print('read in json file.')

except Exception as e:
    logger.warn(e)

OverflowError: signed integer is greater than maximum

In [None]:
logger.info('json file has {} records with size of {} mb.'.format(len(user), sys.getsizeof(user)/1000000))
logger.info('here is an example record:')
logger.info('')
logger.info(user[0])

In [None]:
del user

## review

In [None]:
try:
    review = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_review.json')
    logger.info('read in json file')

except Exception as e:
    logger.warn(e)

OverflowError: signed integer is greater than maximum

In [None]:
logger.info('json file has {} records with size of {} mb.'.format(len(review), sys.getsizeof(review)/1000000))
logger.info('here is an example record:')
logger.info('')
logger.info(review[0])

In [None]:
del review