In [0]:
%run ./processor

In [0]:
%run ./schema_definition

In [0]:
# from schema_definition import schema
import json
from botocore.exceptions import NoCredentialsError
from pyspark.sql import SparkSession
from datetime import datetime, timezone
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import boto3

# Spark entry point for this job (reuses an active session when one exists).
spark = SparkSession.builder.appName("S3_JSON_Loader").getOrCreate()

# Step 1 - S3 handle, built by the helper brought in via `%run ./processor`.
s3 = establish_aws_connection()

# Step 2 - Read the job configuration and pull out the settings used below.
config = load_config_txt("config.txt")
bucket, prefix, tracking_table = (
    config["bucket"],
    config["prefix"],
    config["tracking_table"],
)

# Step 3 - Ensure the ingestion tracking table is set up, then fetch the
# collection of files already recorded in it so they can be skipped later.
create_ingested_files_log(tracking_table)
ingested_files = get_ingested_files(tracking_table)

# Step 4: List all files in S3 and download every JSON file not yet ingested.
all_data = []       # parsed JSON payloads of the new files
new_files_log = []  # (key, last_modified, ingested_at) rows for the tracking table

try:
    # A single list_objects_v2 call returns only one page of keys (at most
    # 1000), so walk every page to avoid silently missing files.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent from a page when nothing matches the prefix.
        for obj in page.get('Contents', []):
            key = obj['Key']
            last_modified = obj['LastModified']  # datetime in UTC

            # Skip non-JSON objects and files already recorded as ingested.
            if not key.endswith('.json') or key in ingested_files:
                continue

            # Download and parse
            file_obj = s3.get_object(Bucket=bucket, Key=key)
            json_data = file_obj['Body'].read().decode('utf-8')
            all_data.append(json.loads(json_data))

            # Log the file for tracking
            new_files_log.append(
                (
                    key,
                    last_modified.isoformat(),
                    datetime.now(timezone.utc).isoformat(),
                )
            )

    print(f"New JSON files to ingest: {len(new_files_log)}")

except NoCredentialsError:
    print("AWS credentials not found!")
    # Discard any partial results so the data and its tracking rows can never
    # get out of step with each other.
    all_data = []
    new_files_log = []

# Step 5: Build a DataFrame from the collected records and append it to Delta.
if not all_data:
    print("No new files to process.")
else:
    silver_df = spark.createDataFrame(all_data, schema=schema)

    # Append-only write: rows already in the table are left untouched.
    silver_writer = silver_df.write.format("delta").mode("append")
    silver_writer.saveAsTable("workspace.silver_schema.silver_delta_table")
    print("New data appended to Delta table.")

    # Step 6: Record the files just ingested in the tracking table.
    log_columns = ["file_name", "last_modified", "ingested_at"]
    log_df = spark.createDataFrame(new_files_log, log_columns)
    log_writer = log_df.write.format("delta").mode("append")
    log_writer.saveAsTable(tracking_table)
    print("Tracking table updated.")
