In [0]:

import json
from botocore.exceptions import NoCredentialsError
from pyspark.sql import SparkSession
from datetime import datetime, timezone
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import boto3

# Step 1 - Establish AWS connections
def establish_aws_connection():
    """Build a boto3 S3 client from credentials held in Databricks Secrets.

    Both values are read from the ``aws-secrets`` scope through the
    notebook-provided ``dbutils`` object.

    Returns:
        The client object returned by ``boto3.client('s3', ...)``.
    """
    secret_scope = "aws-secrets"

    # Fetch the access key first, then the secret key, and pass them
    # straight through as the client's keyword arguments.
    credentials = {
        "aws_access_key_id": dbutils.secrets.get(scope=secret_scope, key="aws-access-key"),
        "aws_secret_access_key": dbutils.secrets.get(scope=secret_scope, key="aws-secret-key"),
    }

    return boto3.client("s3", **credentials)

# Step 2 - Load key=value settings from a plain-text config file
def load_config_txt(path):
    """Parse a simple ``key=value`` text file into a dictionary.

    Blank lines, lines starting with ``#``, and lines without an ``=`` are
    skipped. Only the first ``=`` on a line separates key from value, so a
    value may itself contain ``=``. Keys and values have surrounding
    whitespace stripped; if a key appears more than once, the last
    occurrence wins.

    Args:
        path: Location of the config file to read.

    Returns:
        A dict mapping each key (str) to its value (str).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    config = {}
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the notebook.
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue
            # partition() splits on the first "=" only; an empty separator
            # means the line had no "=" and is ignored.
            key, separator, value = line.partition("=")
            if separator:
                config[key.strip()] = value.strip()
    return config


# Step 2 - create tracking tables for files which have been ingested
def create_table(table_name, schema):
    """Create an empty Delta table with the given schema if it is missing.

    Uses the notebook-level ``spark`` session. When the table already
    exists nothing is written and the existing table is left as is.

    Args:
        table_name: Name checked with ``spark.catalog.tableExists`` and
            passed to ``saveAsTable``.
        schema: Schema handed to ``spark.createDataFrame`` for the empty
            DataFrame that seeds the table.
    """
    # Check if table exists
    if spark.catalog.tableExists(table_name):
        print("Tracking table already exists.")
        return

    # Build the empty DataFrame only on the create path; when the table is
    # already there it would be constructed and thrown away.
    empty_df = spark.createDataFrame([], schema)
    empty_df.write.format("delta").saveAsTable(table_name)
    print("Tracking table created.")

# Step 3: Get already ingested file names
def get_ingested_files(tracking_table):
    """Return the set of ``file_name`` values stored in *tracking_table*.

    Reads the table through the notebook-level ``spark`` session. This is
    deliberately best-effort: if the read fails for any reason, a message
    is printed and an empty set is returned.

    Args:
        tracking_table: Name of the table to read.

    Returns:
        A set of the ``file_name`` values found, or an empty set when the
        table could not be read.
    """
    try:
        # Project to the one column that is used so collect() does not pull
        # every column of every row back to the driver.
        rows = spark.table(tracking_table).select("file_name").collect()
        return {row["file_name"] for row in rows}
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit still propagate.
        # NOTE(review): every failure is reported as "table not found",
        # even if the real cause is something else.
        print("Tracking table not found. Assuming no files ingested yet.")
        return set()