In [0]:
import boto3
from botocore.exceptions import NoCredentialsError
import json
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Obtain the Spark session: getOrCreate() hands back the existing session when
# there is one (Databricks supplies it by default) and builds one otherwise.
spark = (
    SparkSession.builder
    .appName("S3_JSON_Loader")
    .getOrCreate()
)

# Step 1: Read the AWS key pair from the "aws-secrets" Databricks secret scope
# (access key first, then secret key).
aws_access_key, aws_secret_key = (
    dbutils.secrets.get(scope="aws-secrets", key=secret_name)
    for secret_name in ("aws-access-key", "aws-secret-key")
)

# Step 2: Create the boto3 S3 client using those explicit credentials
s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)

bucket = 'databricks-practice-sk'
prefix = 'raw_data/'  # Folder containing multiple JSON files

# One parsed JSON document per processed ".json" object
all_data = []

try:
    # Step 3: List all files in S3 prefix.
    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page through the paginator instead of reading only the first response.
    paginator = s3.get_paginator('list_objects_v2')

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent from a page that matched no keys
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.json'):  # Only process JSON files
                continue

            # Download the object, decode it as UTF-8 text, and parse it
            file_obj = s3.get_object(Bucket=bucket, Key=key)
            json_data = file_obj['Body'].read().decode('utf-8')
            data_dict = json.loads(json_data)
            all_data.append(data_dict)

    print(f"✅ Total JSON files processed: {len(all_data)}")

except NoCredentialsError:
    print("AWS credentials not found!")
    # Discard anything collected before the failure so downstream steps see no data
    all_data = []

# Step 4: Schema for one JSON document. Every leaf field is a nullable string;
# the helpers below only remove the repeated StructField boilerplate.
def _string_struct(*field_names):
    """Return a StructType of nullable string fields named *field_names*, in order."""
    return StructType([StructField(name, StringType(), True) for name in field_names])


def _nullable(field_name, data_type):
    """Return a nullable StructField called *field_name* with type *data_type*."""
    return StructField(field_name, data_type, True)


schema = StructType([
    _nullable("time", StringType()),

    _nullable("systems", StructType([
        _nullable("equipment_id", StringType()),
        _nullable("component", StructType([
            _nullable("columns", _string_struct("column_id")),
        ])),
        _nullable("im_id", StringType()),  # Changed from im_equipment_id
    ])),

    # The submit / review / approval entries share the same (id, datetime) shape
    _nullable("users", StructType([
        _nullable("my_submit", _string_struct("id", "datetime")),
        _nullable("my_review", _string_struct("id", "datetime")),
        _nullable("my_approval", _string_struct("id", "datetime")),
        _nullable("my_status", _string_struct("status")),
    ])),

    _nullable("methods", _string_struct(
        "sop",
        "id",
        "meth_id",
        "temp_id",
        "report_template_version",
        "seq_id",
        "seq_version",
    )),

    _nullable("runs", ArrayType(_string_struct("in_num", "in_name", "pk"))),

    _nullable("sst", ArrayType(StructType([
        _nullable("fk", StringType()),
        _nullable("sst_res", _string_struct(
            "number",
            "in_num",
            "in_name",
            "sst_name",
            "peak",
            "eval_result",
            "result",
        )),
    ]))),

    _nullable("results", ArrayType(StructType([
        _nullable("fk", StringType()),
        _nullable("result", _string_struct(
            "samp_id",
            "as_id",
            "Type",
            "comp",
            "unit",
            "det_id",
            "result",
            "number_of_averaged_samples",
        )),
    ]))),
])

# Step 5: With no parsed documents there is nothing to build; report that and stop.
if not all_data:
    print("❌ No data found in S3 bucket!")
else:
    # Build a DataFrame from the parsed documents using the explicit schema
    silver_df = spark.createDataFrame(all_data, schema=schema)
    print("✅ DataFrame created successfully!")

    # Print the schema, then preview rows without truncating cell contents
    silver_df.printSchema()
    silver_df.show(truncate=False)

    # Step 6: Save as a Delta table, overwriting any existing contents
    (
        silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("workspace.silver_schema.silver_delta_table")
    )
    print("✅ Data written to Delta table: workspace.silver_schema.silver_delta_table")
