In [0]:
%run ./processor

In [0]:
%run ./schema_definition

In [0]:
# from schema_definition import schema, tracking_table_schema
import json
from botocore.exceptions import NoCredentialsError
from pyspark.sql import SparkSession
from datetime import datetime, timezone
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import boto3

# Initialize Spark session
spark = SparkSession.builder.appName("S3_JSON_Loader").getOrCreate()

# Step 1 - Establish the AWS connection used for the S3 calls further down
s3 = establish_aws_connection()

# Step 2 - Load the config and unpack its values into separate variables
config = load_config_txt("config.txt")
bucket = config["bucket"]
prefix = config["prefix"]
files_tracking = config["files_tracking"]
# NOTE: at this point the name holds the *table name* from the config; it is
# rebound to the actual timestamp value once the table has been read below.
last_ingested_times = config["last_ingested_times"]

# Step 3 - Create the tracking table for ingested files (presumably only if it
# does not exist yet - confirm in create_table) and fetch what it contains.
create_table(files_tracking, files_tracking_schema)
ingested_files = get_ingested_files(files_tracking)
print("ingested files is:", ingested_files)

# Step 4 - Table that stores the time of the last ingestion
create_table(last_ingested_times, last_ingested_times_schema)

last_ingested_df = spark.read.table(last_ingested_times)

# Extract the stored timestamp. first() returns None for an empty table, so a
# table with no rows no longer raises IndexError the way collect()[0] did.
last_ingested_row = last_ingested_df.first()
last_ingested_times = (
    last_ingested_row["last_ingested_times"]
    if last_ingested_row is not None
    else None
)

if last_ingested_times is None:
    # No watermark recorded yet (empty table or NULL value): fall back to the
    # Unix epoch so that every file is treated as new.
    last_ingested_times = datetime(1970, 1, 1, tzinfo=timezone.utc)
elif last_ingested_times.tzinfo is None:
    # A naive value is interpreted as UTC so it can be compared with the
    # timezone-aware timestamps used later.
    last_ingested_times = last_ingested_times.replace(tzinfo=timezone.utc)

print(f"Last ingested timestamp: {last_ingested_times}")

# Step 5: List all files in S3 and download the new JSON documents
all_data = []  # parsed content of every newly discovered JSON file
new_ingested_json = []  # (key, ingestion time) tuples for the tracking table
# NOTE(review): ingested_files is fetched earlier but not consulted here; only
# the LastModified watermark decides what counts as new - confirm intended.
try:
    # A single list_objects_v2 call returns at most 1000 keys, so iterate over
    # every result page instead of looking at the first response only.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            last_modified = obj["LastModified"]  # this is a datetime object in UTC

            # Only ingest JSON files modified after last_ingested_times
            if not key.endswith(".json") or last_modified <= last_ingested_times:
                continue

            # Download and parse
            file_obj = s3.get_object(Bucket=bucket, Key=key)
            json_data = file_obj["Body"].read().decode("utf-8")
            all_data.append(json.loads(json_data))

            # Log the file for tracking
            new_ingested_json.append((key, datetime.now(timezone.utc)))

    print(f"New JSON files to ingest: {len(new_ingested_json)}")

except NoCredentialsError:
    print("AWS credentials not found!")
    # Discard anything collected so far so the write step is skipped and the
    # two lists stay consistent with each other.
    all_data = []
    new_ingested_json = []

# Step 6: Build a DataFrame from the downloaded documents and append it to Delta
# NOTE(review): nothing in this section advances the stored last-ingested
# timestamp - confirm that it is updated elsewhere.
if not all_data:
    print("No new files to process.")
else:
    silver_df = spark.createDataFrame(all_data, schema=main_schema)

    # Append new data to the silver table
    (
        silver_df.write
        .format("delta")
        .mode("append")
        .saveAsTable("workspace.silver_schema.silver_delta_table")
    )
    print("New data appended to Delta table.")

    # Step 7: Record the files handled in this run in the tracking table
    print("new files is:", new_ingested_json)
    new_ingested_json_df = spark.createDataFrame(
        new_ingested_json, schema=files_tracking_schema
    )
    (
        new_ingested_json_df.write
        .format("delta")
        .option("mergeSchema", "true")
        .mode("append")
        .saveAsTable(files_tracking)
    )
