## spark read s3 json

In [8]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window, Row, SparkSession

import pprint
import boto3
import json
import sys
import os

pp = pprint.PrettyPrinter(indent = 3)
print('imported modules.')

# Set Java home environment variable
# os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/temurin-8.jdk/Contents/Home'  # Update this path to match your Java installation

# read creds.json
with open("creds.json", "r") as f:
    creds = json.load(f)
    f.close()

imported modules.


## Spark

In [14]:
# Stop any existing Spark session
if 'spark' in locals():
    spark.stop()

## start spark

In [None]:
try:
    # Create Spark session with required configurations
    spark = SparkSession.builder \
        .appName("YelpAnalysis") \
        .master("local[4]") \
        .config("spark.driver.memory", "2g") \
        .config("spark.executor.memory", "2g") \
        .config("spark.hadoop.fs.s3a.access.key", creds["aws_client"]) \
        .config("spark.hadoop.fs.s3a.secret.key", creds["aws_secret"]) \
        .config("spark.jars.packages", 
                "org.apache.hadoop:hadoop-aws:3.3.4," + 
                "org.apache.hadoop:hadoop-common:3.3.4," +
                "org.apache.logging.log4j:log4j-slf4j-impl:2.17.2," +
                "org.apache.logging.log4j:log4j-api:2.17.2," +
                "org.apache.logging.log4j:log4j-core:2.17.2," + 
                "org.apache.hadoop:hadoop-client:3.3.4," + 
                "io.delta:delta-core_2.12:2.4.0," + 
                "org.postgresql:postgresql:9.4.1212") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .getOrCreate()
    
        # .config("spark.jars.packages", "org.apache.hadoop:hadoop-common:3.3.4,org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-client:3.3.4,io.delta:delta-core_2.12:2.4.0,org.postgresql:postgresql:9.4.1212") \
        
    
except Exception as e:
    print(str(e))

25/03/11 17:55:03 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.Con

### list s3 files

In [19]:
client = boto3.client('s3',
                    aws_access_key_id=creds["aws_client"],
                    aws_secret_access_key= creds["aws_secret"])
bucket = "yelp-stevenhurwitt-2"
file = "yelp_academic_dataset_business.json"

bucket_meta = client.list_objects(Bucket = 'yelp-stevenhurwitt-2')
print('files in s3 bucket:')
print('')
for c in bucket_meta['Contents']:
    print(c['Key'])

files in s3 bucket:

yelp_academic_dataset_business.json
yelp_academic_dataset_checkin.json
yelp_academic_dataset_review.json
yelp_academic_dataset_tip.json
yelp_academic_dataset_user.json


## spark read json

### schemas

In [22]:
# Define schemas for each dataset
checkin_schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("date", StringType(), True)
])

review_schema = StructType([
    StructField("review_id", StringType(), True),
    StructField("user_id", StringType(), True),
    StructField("business_id", StringType(), True),
    StructField("stars", FloatType(), True),
    StructField("useful", IntegerType(), True),
    StructField("funny", IntegerType(), True),
    StructField("cool", IntegerType(), True),
    StructField("text", StringType(), True),
    StructField("date", TimestampType(), True)
])

tip_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("business_id", StringType(), True),
    StructField("text", StringType(), True),
    StructField("date", TimestampType(), True),
    StructField("compliment_count", IntegerType(), True)
])

user_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("review_count", IntegerType(), True),
    StructField("yelping_since", TimestampType(), True),
    StructField("friends", StringType(), True),
    StructField("useful", IntegerType(), True),
    StructField("funny", IntegerType(), True),
    StructField("cool", IntegerType(), True),
    StructField("fans", IntegerType(), True),
    StructField("elite", StringType(), True),
    StructField("average_stars", FloatType(), True),
    StructField("compliment_hot", IntegerType(), True),
    StructField("compliment_more", IntegerType(), True),
    StructField("compliment_profile", IntegerType(), True),
    StructField("compliment_cute", IntegerType(), True),
    StructField("compliment_list", IntegerType(), True),
    StructField("compliment_note", IntegerType(), True),
    StructField("compliment_plain", IntegerType(), True),
    StructField("compliment_cool", IntegerType(), True),
    StructField("compliment_funny", IntegerType(), True),
    StructField("compliment_writer", IntegerType(), True),
    StructField("compliment_photos", IntegerType(), True)
])

business_schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("postal_code", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("stars", DoubleType(), True),
    StructField("review_count", IntegerType(), True),
    StructField("is_open", IntegerType(), True),
    StructField("attributes", MapType(StringType(), StringType()), True),
    StructField("categories", StringType(), True),
    StructField("hours", MapType(StringType(), StringType()), True)
])

## load yelp dataframes

In [23]:
# Function to load dataframes
def load_yelp_dataframes(base_path):
    try:
        # Load each dataset
        df_business = spark.read.schema(business_schema)\
            .json(f"{base_path}/yelp_academic_dataset_business.json")
        
        df_checkin = spark.read.schema(checkin_schema)\
            .json(f"{base_path}/yelp_academic_dataset_checkin.json")
        
        df_review = spark.read.schema(review_schema)\
            .json(f"{base_path}/yelp_academic_dataset_review.json")
        
        df_tip = spark.read.schema(tip_schema)\
            .json(f"{base_path}/yelp_academic_dataset_tip.json")
        
        df_user = spark.read.schema(user_schema)\
            .json(f"{base_path}/yelp_academic_dataset_user.json")
        
        # Print basic info about loaded dataframes
        print(f"Businesses: {df_business.count()} rows")
        print(f"Checkins: {df_checkin.count()} rows")
        print(f"Reviews: {df_review.count()} rows")
        print(f"Tips: {df_tip.count()} rows")
        print(f"Users: {df_user.count()} rows")
        
        return df_business, df_checkin, df_review, df_tip, df_user
    
    except Exception as e:
        print(f"Error loading dataframes: {str(e)}")
        return None

## read data

http://localhost:4040/jobs

In [None]:
# Usage
base_path = "s3a://yelp-stevenhurwitt-2/"  # Update this path
df_business, df_checkin, df_review, df_tip, df_user = load_yelp_dataframes(base_path)


                                                                                

Businesses: 150346 rows


                                                                                

Checkins: 131930 rows


                                                                                

Reviews: 6990280 rows


                                                                                

Tips: 908915 rows




### verify schemas

In [None]:
# Verify schemas
print("\nCheckin Schema:")
df_checkin.printSchema()
print("\nReview Schema:")
df_review.printSchema()
print("\nTip Schema:")
df_tip.printSchema()
print("\nUser Schema:")
df_user.printSchema()

## write to delta on s3

In [None]:
def write_to_s3_delta(df, bucket, folder, partition_cols=None):
    """
    Write DataFrame to S3 in Delta format
    Args:
        df: Spark DataFrame
        bucket: S3 bucket name
        folder: Folder name for the dataset
        partition_cols: List of columns to partition by (optional)
    """
    try:
        writer = df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .option("mergeSchema", "true")
        
        if partition_cols:
            writer = writer.partitionBy(partition_cols)
            
        path = f"s3a://{bucket}/{folder}/"
        writer.save(path)
        print(f"Successfully wrote {folder} data to Delta table at {path}")
        
        # Create Delta table if it doesn't exist
        spark.sql(f"""
        CREATE TABLE IF NOT EXISTS delta.`{path}`
        USING DELTA
        """)
        
    except Exception as e:
        print(f"Error writing {folder} to Delta: {str(e)}")

# Usage example:
bucket = "yelp-stevenhurwitt-2"

try:
    # Reviews - partition by year and month
    df_review = df_review.withColumn(
        "year", year("date")
    ).withColumn(
        "month", month("date")
    )
    write_to_s3_delta(df_review, bucket, "reviews", ["year", "month"])

    # Users - no partitioning
    write_to_s3_delta(df_user, bucket, "users")

    # Tips - partition by year
    df_tip = df_tip.withColumn("year", year("date"))
    write_to_s3_delta(df_tip, bucket, "tips", ["year"])

    # Checkins - no partitioning
    write_to_s3_delta(df_checkin, bucket, "checkins")

    print("All datasets written to Delta format successfully")
    
except Exception as e:
    print(f"Error in write operations: {str(e)}")

## read delta

In [None]:
# Read from Delta table
df = spark.read.format("delta").load("s3a://yelp-stevenhurwitt-2/reviews")

# write to postgres

## create tables

In [None]:
# -- Business table
# CREATE TABLE IF NOT EXISTS business (
#     business_id STRING,
#     name STRING,
#     address STRING,
#     city STRING,
#     state STRING,
#     postal_code STRING,
#     latitude DOUBLE,
#     longitude DOUBLE,
#     stars FLOAT,
#     review_count INT,
#     is_open INT,
#     attributes MAP<STRING, STRING>,
#     categories STRING,
#     hours MAP<STRING, STRING>
# ) USING DELTA
# LOCATION 's3a://yelp-stevenhurwitt-2/business';

# -- Review table
# CREATE TABLE IF NOT EXISTS review (
#     review_id STRING,
#     user_id STRING,
#     business_id STRING,
#     stars FLOAT,
#     useful INT,
#     funny INT,
#     cool INT,
#     text STRING,
#     date TIMESTAMP,
#     year INT,
#     month INT
# ) USING DELTA
# LOCATION 's3a://yelp-stevenhurwitt-2/reviews';

# -- User table
# CREATE TABLE IF NOT EXISTS user (
#     user_id STRING,
#     name STRING,
#     review_count INT,
#     yelping_since TIMESTAMP,
#     friends STRING,
#     useful INT,
#     funny INT,
#     cool INT,
#     fans INT,
#     elite STRING,
#     average_stars FLOAT,
#     compliment_hot INT,
#     compliment_more INT,
#     compliment_profile INT,
#     compliment_cute INT,
#     compliment_list INT,
#     compliment_note INT,
#     compliment_plain INT,
#     compliment_cool INT,
#     compliment_funny INT,
#     compliment_writer INT,
#     compliment_photos INT
# ) USING DELTA
# LOCATION 's3a://yelp-stevenhurwitt-2/users';

# -- Tip table
# CREATE TABLE IF NOT EXISTS tip (
#     user_id STRING,
#     business_id STRING,
#     text STRING,
#     date TIMESTAMP,
#     compliment_count INT,
#     year INT
# ) USING DELTA
# LOCATION 's3a://yelp-stevenhurwitt-2/tips';

# -- Checkin table
# CREATE TABLE IF NOT EXISTS checkin (
#     business_id STRING,
#     date STRING
# ) USING DELTA
# LOCATION 's3a://yelp-stevenhurwitt-2/checkins';

## write to postgres from spark

In [None]:
def write_to_postgres(df, table_name):
    """
    Write Spark DataFrame to PostgreSQL
    
    Args:
        df: Spark DataFrame
        table_name: Name of the target PostgreSQL table
    """
    try:
        # PostgreSQL connection properties
        postgres_props = {
            "url": f"jdbc:postgresql://{creds['postgres_host']}:5433/{creds['postgres_db']}",
            "driver": "org.postgresql.Driver",
            "user": creds["postgres_user"],
            "password": creds["postgres_password"]
        }
        
        # Write DataFrame to PostgreSQL
        df.write \
            .format("jdbc") \
            .option("url", postgres_props["url"]) \
            .option("driver", postgres_props["driver"]) \
            .option("dbtable", table_name) \
            .option("user", postgres_props["user"]) \
            .option("password", postgres_props["password"]) \
            .mode("overwrite") \
            .save()
            
        print(f"Successfully wrote {table_name} to PostgreSQL")
        
    except Exception as e:
        print(f"Error writing {table_name} to PostgreSQL: {str(e)}")

# Write all datasets to PostgreSQL
try:
    # Add PostgreSQL JDBC driver to Spark
    spark.sparkContext.addJar("postgresql-42.2.23.jar")
    
    # Write each dataset
    write_to_postgres(df_business, "business")
    write_to_postgres(df_review, "review")
    write_to_postgres(df_user, "user_profile")  # using user_profile since user is reserved
    write_to_postgres(df_tip, "tip")
    write_to_postgres(df_checkin, "checkin")
    
    print("All datasets written to PostgreSQL successfully")
    
except Exception as e:
    print(f"Error in PostgreSQL write operations: {str(e)}")

## read json

In [3]:
def read_json(filename):
    """
    reads a yelp .json file from s3 bucket.

    keyword arguments:
    filename - name of file (str)

    returns: json_file (json)
    """

    bucket = "yelp-dataset-stevenhurwitt"
    file = "raw/yelp_academic_dataset_business.json"
    response = client.get_object(Bucket = bucket, Key = file)
    
    file_content = response['Body'].read().decode('utf-8')
    json_file = json.loads("[" + file_content.replace("}\n{", "},\n{") + "]")
    return(json_file)

## read json files

In [4]:
business_file = read_json("raw/yelp_academic_dataset_business.json")
checkin_file = read_json("raw/yelp_academic_dataset_checkin.json")
review_file = read_json("raw/yelp_academic_dataset_review.json")
tip_file = read_json("raw/yelp_academic_dataset_tip.json")
user_file = read_json("raw/yelp_academic_dataset_user.json")

print("read json files from s3.")

read json files from s3.


### review

In [5]:
pp.pprint(review_file[0])

{  'address': '1616 Chapala St, Ste 2',
   'attributes': {'ByAppointmentOnly': 'True'},
   'business_id': 'Pns2l4eNsfO8kk83dixA6A',
   'categories': 'Doctors, Traditional Chinese Medicine, '
                 'Naturopathic/Holistic, Acupuncture, Health & Medical, '
                 'Nutritionists',
   'city': 'Santa Barbara',
   'hours': None,
   'is_open': 0,
   'latitude': 34.4266787,
   'longitude': -119.7111968,
   'name': 'Abby Rappoport, LAC, CMQ',
   'postal_code': '93101',
   'review_count': 7,
   'stars': 5.0,
   'state': 'CA'}


### tip

In [6]:
pp.pprint(tip_file[0])

{  'address': '1616 Chapala St, Ste 2',
   'attributes': {'ByAppointmentOnly': 'True'},
   'business_id': 'Pns2l4eNsfO8kk83dixA6A',
   'categories': 'Doctors, Traditional Chinese Medicine, '
                 'Naturopathic/Holistic, Acupuncture, Health & Medical, '
                 'Nutritionists',
   'city': 'Santa Barbara',
   'hours': None,
   'is_open': 0,
   'latitude': 34.4266787,
   'longitude': -119.7111968,
   'name': 'Abby Rappoport, LAC, CMQ',
   'postal_code': '93101',
   'review_count': 7,
   'stars': 5.0,
   'state': 'CA'}


### user

In [7]:
pp.pprint(user_file[0])

{  'address': '1616 Chapala St, Ste 2',
   'attributes': {'ByAppointmentOnly': 'True'},
   'business_id': 'Pns2l4eNsfO8kk83dixA6A',
   'categories': 'Doctors, Traditional Chinese Medicine, '
                 'Naturopathic/Holistic, Acupuncture, Health & Medical, '
                 'Nutritionists',
   'city': 'Santa Barbara',
   'hours': None,
   'is_open': 0,
   'latitude': 34.4266787,
   'longitude': -119.7111968,
   'name': 'Abby Rappoport, LAC, CMQ',
   'postal_code': '93101',
   'review_count': 7,
   'stars': 5.0,
   'state': 'CA'}


### checkin

In [8]:
pp.pprint(checkin_file[0])

{  'address': '1616 Chapala St, Ste 2',
   'attributes': {'ByAppointmentOnly': 'True'},
   'business_id': 'Pns2l4eNsfO8kk83dixA6A',
   'categories': 'Doctors, Traditional Chinese Medicine, '
                 'Naturopathic/Holistic, Acupuncture, Health & Medical, '
                 'Nutritionists',
   'city': 'Santa Barbara',
   'hours': None,
   'is_open': 0,
   'latitude': 34.4266787,
   'longitude': -119.7111968,
   'name': 'Abby Rappoport, LAC, CMQ',
   'postal_code': '93101',
   'review_count': 7,
   'stars': 5.0,
   'state': 'CA'}


### business

In [9]:
pp.pprint(business_file[0])

{  'address': '1616 Chapala St, Ste 2',
   'attributes': {'ByAppointmentOnly': 'True'},
   'business_id': 'Pns2l4eNsfO8kk83dixA6A',
   'categories': 'Doctors, Traditional Chinese Medicine, '
                 'Naturopathic/Holistic, Acupuncture, Health & Medical, '
                 'Nutritionists',
   'city': 'Santa Barbara',
   'hours': None,
   'is_open': 0,
   'latitude': 34.4266787,
   'longitude': -119.7111968,
   'name': 'Abby Rappoport, LAC, CMQ',
   'postal_code': '93101',
   'review_count': 7,
   'stars': 5.0,
   'state': 'CA'}


In [10]:
dynamodb = boto3.resource('dynamodb', endpoint_url="https://us-east-2.console.aws.amazon.com?arn=arn:aws:dynamodb:us-east-2134132211607:8000")
print('created dynamo resource.')

# try:
#     yelp_business = dynamodb.create_table(
#             TableName='yelp.business',
#             KeySchema=[
#                 {
#                     'AttributeName': 'business_id',
#                     'KeyType': 'HASH'  # Partition key
#                 }
#             ],
#             AttributeDefinitions=[
#                 {
#                     'AttributeName': 'name',
#                     'AttributeType': 'S'
#                 }
#             ],
#             ProvisionedThroughput={
#                 'ReadCapacityUnits': 25,
#                 'WriteCapacityUnits': 20
#             }
#         )
#     print('created dynamo table.')

# except Exception as e:
#     print(f"exception: {e}")
#     print("failed to create dynamo table.")

created dynamo resource.


In [11]:
print(bucket) 

yelp-dataset-stevenhurwitt


In [12]:
print(file)

raw/yelp_academic_dataset_business.json


In [13]:
print(len(business_file))
print(len(checkin_file))
# review_file = read_json("raw/yelp_academic_dataset_review.json")
# tip_file = read_json("raw/yelp_academic_dataset_tip.json")
# user_file = read_json("raw/yelp_academic_dataset_user.json")


150346
150346


In [14]:
print('business json file has {} records with size of {} mb.'.format(len(business_file), sys.getsizeof(business_file)/1000000))
print('tip json file has {} records with size of {} mb.'.format(len(tip_file), sys.getsizeof(tip_file)/1000000))
print('user json file has {} records with size of {} mb.'.format(len(user_file), sys.getsizeof(user_file)/1000000))
print('review json file has {} records with size of {} mb.'.format(len(review_file), sys.getsizeof(review_file)/1000000))
print('checkin json file has {} records with size of {} mb.'.format(len(checkin_file), sys.getsizeof(checkin_file)/1000000))


business json file has 150346 records with size of 1.28316 mb.
tip json file has 150346 records with size of 1.28316 mb.
user json file has 150346 records with size of 1.28316 mb.
review json file has 150346 records with size of 1.28316 mb.
checkin json file has 150346 records with size of 1.28316 mb.


In [15]:
# df_pandas = business.toPandas()

In [16]:
# html_df = df_pandas.to_html()

In [17]:
# display(html_df)

In [18]:
# ! gcloud auth login --no-launch-browser