In [0]:
from schema_definition import schema
from processors import *
import boto3
import json
from botocore.exceptions import NoCredentialsError
from pyspark.sql import SparkSession
from datetime import datetime, timezone
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Obtain the Spark session used for every DataFrame and table operation below
spark = (
    SparkSession.builder
    .appName("S3_JSON_Loader")
    .getOrCreate()
)

# Step 1: Read the AWS key pair from the "aws-secrets" Databricks secret scope
# (access key first, then secret key)
aws_access_key, aws_secret_key = (
    dbutils.secrets.get(scope="aws-secrets", key=secret_name)
    for secret_name in ("aws-access-key", "aws-secret-key")
)

# Step 2: Build the boto3 S3 client with those explicit credentials
s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)

# S3 bucket and key prefix that are scanned for files
bucket = 'databricks-practice-sk'
prefix = 'raw_data/'

# Delta table that records which S3 keys have already been ingested
tracking_table = "workspace.silver_schema.ingested_files_log"

# Layout of the tracking table; both columns are declared non-nullable
tracking_table_schema = (
    StructType()
    .add("file_name", StringType(), False)
    .add("ingestion_time", TimestampType(), False)
)

# Zero-row DataFrame carrying that schema; used below to create the table
empty_df = spark.createDataFrame([], tracking_table_schema)

# Create the tracking table on first run; otherwise leave it untouched
if spark.catalog.tableExists(tracking_table):
    print("Tracking table already exists.")
else:
    empty_df.write.format("delta").saveAsTable(tracking_table)
    print("Tracking table created.")




# Step 3: Get already ingested file names
# The result is a set so the per-key membership test during the S3 listing
# is O(1).
try:
    # Only the key column is needed, so do not collect the rest of the table.
    ingested_files = {
        row["file_name"]
        for row in spark.table(tracking_table).select("file_name").collect()
    }
except Exception as exc:
    # Deliberate best-effort: fall back to "nothing ingested yet" instead of
    # aborting. `Exception` (rather than a bare `except`) lets
    # KeyboardInterrupt / SystemExit propagate, and the reason is printed so
    # a failure other than a missing table is not silently hidden.
    ingested_files = set()
    print("Tracking table not found. Assuming no files ingested yet.")
    print(f"Reason: {exc}")

# Parsed JSON payloads of the files picked up in this run (one entry per file)
all_data = []
# One (key, last_modified ISO string, ingestion ISO string) tuple per new file
new_files_log = []

try:
    # Step 4: List all files in S3 prefix
    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page with a paginator; otherwise larger prefixes are silently truncated.
    paginator = s3.get_paginator('list_objects_v2')

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent from a page when nothing matches the prefix.
        for obj in page.get('Contents', []):
            key = obj['Key']
            last_modified = obj['LastModified']  # datetime in UTC

            # Skip non-JSON objects and files recorded by a previous run.
            if not key.endswith('.json') or key in ingested_files:
                continue

            # Download and parse
            file_obj = s3.get_object(Bucket=bucket, Key=key)
            json_data = file_obj['Body'].read().decode('utf-8')
            all_data.append(json.loads(json_data))

            # Log the file for tracking
            new_files_log.append(
                (
                    key,
                    last_modified.isoformat(),
                    datetime.now(timezone.utc).isoformat(),
                )
            )

    print(f"New JSON files to ingest: {len(new_files_log)}")

except NoCredentialsError:
    print("AWS credentials not found!")
    # Reset both lists together so no file is logged without its data.
    all_data = []
    new_files_log = []

# Step 5: Create DataFrame and write to Delta
if all_data:
    silver_df = spark.createDataFrame(all_data, schema=schema)

    # Append new data
    silver_df.write.format("delta").mode("append").saveAsTable("workspace.silver_schema.silver_delta_table")
    print("New data appended to Delta table.")

    # Step 6: Update tracking table
    # The tracking table is created with `tracking_table_schema`
    # (file_name STRING, ingestion_time TIMESTAMP, both non-nullable).
    # Appending the raw 3-string tuples under different column names is
    # rejected by Delta schema enforcement, so the log would never be updated
    # and every run would re-ingest the same files. Conform each entry to the
    # declared schema instead: keep the key and parse the ingestion timestamp
    # back into a datetime. The S3 last-modified value has no column in the
    # tracking table and is therefore not persisted.
    log_rows = [
        (file_name, datetime.fromisoformat(ingested_at))
        for file_name, _last_modified, ingested_at in new_files_log
    ]
    log_df = spark.createDataFrame(log_rows, schema=tracking_table_schema)
    log_df.write.format("delta").mode("append").saveAsTable(tracking_table)
    print("Tracking table updated.")
else:
    print("No new files to process.")
