## spark read s3 delta

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window, Row

import pprint
import boto3
import json
import os

# Shared pretty-printer, used by later cells to dump example records.
pp = pprint.PrettyPrinter(indent=3)
print('imported modules.')

# Module-level S3 client; read_json() below relies on this name.
client = boto3.client('s3')

# NOTE: a single list_objects call returns at most 1000 keys; switch to a
# paginator if the bucket ever grows beyond that.
bucket_meta = client.list_objects(Bucket='yelp-dataset-stevenhurwitt')
print('files in s3 bucket:')
print('')
# S3 omits the 'Contents' entry entirely when the bucket holds no objects,
# so fall back to an empty list instead of raising KeyError.
for c in bucket_meta.get('Contents', []):
    print(c['Key'])

In [None]:
def read_json(filename):
    """
    reads a yelp .json file from s3 bucket.

    the file may hold one json document or several concatenated documents
    separated by whitespace (e.g. one object per line).

    keyword arguments:
    filename - object key inside the default bucket, or a full
               's3://bucket/key', 's3a://bucket/key' or 's3n://bucket/key'
               uri, in which case bucket and key are taken from the uri (str)

    returns: json_file (list) - the decoded documents, in file order

    raises: json.JSONDecodeError if the content is not valid json; errors
            raised by client.get_object propagate unchanged.
    """

    # NOTE(review): this default differs from the bucket listed elsewhere in
    # the notebook ('yelp-dataset-stevenhurwitt') - confirm which is intended.
    bucket = 'yelp-academic-dataset-stevenhurwitt'
    key = filename

    # Accept spark/hadoop style uris so callers need not strip them by hand;
    # passing the whole uri as the object key can never match an object.
    for scheme in ('s3://', 's3a://', 's3n://'):
        if filename.startswith(scheme):
            bucket, _, key = filename[len(scheme):].partition('/')
            break

    response = client.get_object(Bucket=bucket, Key=key)
    file_content = response['Body'].read().decode('utf-8')

    # Decode document after document straight from the payload. Unlike the
    # previous '}\n{' string replacement this copes with \r\n line endings
    # and blank lines, and avoids building a second copy of the whole text.
    decoder = json.JSONDecoder()
    json_file = []
    pos = 0
    end = len(file_content)
    while True:
        # raw_decode does not skip leading whitespace itself.
        while pos < end and file_content[pos].isspace():
            pos += 1
        if pos >= end:
            break
        record, pos = decoder.raw_decode(file_content, pos)
        json_file.append(record)
    return json_file

In [None]:
# Connect to the regular DynamoDB service endpoint for us-east-2. The previous
# endpoint_url pointed at the AWS web console, which is not an API endpoint.
dynamodb = boto3.resource('dynamodb', region_name='us-east-2')

# Create the business table keyed on business_id.
# DynamoDB requires AttributeDefinitions to describe exactly the attributes
# used in KeySchema (and any indexes); defining 'name' while keying on
# 'business_id' is rejected with a validation error.
# NOTE: create_table only starts table creation; call
# yelp_business.wait_until_exists() before writing items to it.
yelp_business = dynamodb.create_table(
    TableName='yelp.business',
    KeySchema=[
        {
            'AttributeName': 'business_id',
            'KeyType': 'HASH',  # Partition key
        },
    ],
    AttributeDefinitions=[
        {
            'AttributeName': 'business_id',
            'AttributeType': 'S',
        },
    ],
    ProvisionedThroughput={
        'ReadCapacityUnits': 25,
        'WriteCapacityUnits': 20,
    },
)
print('created dynamo table.')

In [None]:
# Load the yelp business records. read_json returns a plain Python list, which
# has no .head() method, so the former business.head() call (which would have
# raised AttributeError) is dropped; an example record is printed further down.
# NOTE(review): the argument is a full s3a:// uri - confirm read_json accepts
# a uri rather than a bare object key.
business = read_json('s3a://yelp-dataset-stevenhurwitt/yelp_academic_dataset_business.json')
print('read in json file.')

In [None]:
# sys is not imported by the notebook's import cell, so bring it in here.
import sys

# NOTE: sys.getsizeof is shallow - it measures the list object itself, not the
# records it references, so the reported size understates the real footprint.
print('json file has {} records with size of {} mb.'.format(len(business), sys.getsizeof(business)/1000000))
print('here is an example record:')
print('')
# Guard against an empty file so the cell does not fail with IndexError.
if business:
    pp.pprint(business[0])

In [None]:
# business is a plain list of records (not a Spark DataFrame), so it has no
# .toPandas(); build the pandas frame directly from the list instead.
import pandas as pd

df_pandas = pd.DataFrame(business)

In [None]:
# Render the frame as an HTML table for inline display. Cap the rendered rows:
# serialising every row into one HTML string gets very large and slow to
# display when the frame holds the whole dataset.
html_df = df_pandas.to_html(max_rows=20)

In [None]:
# The package name is case-sensitive ('IPython'), and display() lives in the
# IPython.display module. Wrap the markup in HTML() so the notebook renders
# the table instead of echoing the raw string.
from IPython.display import HTML, display

display(HTML(html_df))