In [0]:
import boto3
import os
import re
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, IntegerType, DateType
from datetime import timezone
from pyspark.sql.functions import col, regexp_extract, lit, lower, count, sum, when, regexp_extract_all, size, array_join, current_timestamp, split, expr, concat
from delta.tables import DeltaTable
from pyspark.sql import functions as F

In [0]:
# Job parameters arrive as Databricks text widgets; every widget defaults to "".
for _widget_name in (
    "catalog",
    "schema",
    "s3_inventory_table",
    "candidate_table",
    "dataset_mapping_file_location",
    "archive_dataset_mapping_file_location",
    "dataset_mapping_table",
    "job_id",
    "run_id",
    "dataset_mapping_file",
):
    dbutils.widgets.text(_widget_name, "")

# Namespace shared by the three tables used in this notebook.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")

# Locations and name of the dataset-mapping CSV.
dataset_mapping_file_location = dbutils.widgets.get("dataset_mapping_file_location")
archive_dataset_mapping_file_location = dbutils.widgets.get("archive_dataset_mapping_file_location")
dataset_mapping_file = dbutils.widgets.get("dataset_mapping_file")

# Run identity: "<job_id>-<run_id>" is stamped on the rows written by this run.
job_id = dbutils.widgets.get("job_id")
run_id = dbutils.widgets.get("run_id")
execution_id = f"{job_id}-{run_id}"

# Table names are passed unqualified and expanded to catalog.schema.table here.
candidate_table = f"{catalog}.{schema}.{dbutils.widgets.get('candidate_table')}"
s3_inventory_table = f"{catalog}.{schema}.{dbutils.widgets.get('s3_inventory_table')}"
dataset_mapping_table = f"{catalog}.{schema}.{dbutils.widgets.get('dataset_mapping_table')}"


In [0]:
from datetime import datetime

def list_s3_files_recursive(path,s3_bucket_name, bucket_prefix):
    """Walk ``path`` depth-first with ``dbutils.fs.ls`` and describe every file found.

    Entries whose path ends with "/" are treated as directories and descended
    into; every other entry yields one record.

    Args:
        path: Location to list.
        s3_bucket_name: Copied unchanged into every record.
        bucket_prefix: Copied unchanged into every record.

    Returns:
        A list of dicts with the keys ``execution_id`` (read from the
        notebook-level variable), ``s3_bucket_name``, ``bucket_prefix``,
        ``path``, ``file_name``, ``last_modified_time`` (a naive ``datetime``
        built from the entry's millisecond ``modificationTime``), ``size`` and
        ``extension`` (text after the last "." in the name, or ``None`` when
        the name has no dot). A path that cannot be listed is reported with
        ``print`` and contributes no records.
    """
    try:
        entries = dbutils.fs.ls(path)
    except Exception as e:
        print(f"Error reading path {path}: {e}")
        return []

    records = []
    for entry in entries:
        if entry.path.endswith("/"):
            # Directory: splice its files in at this position to keep listing order.
            records += list_s3_files_recursive(entry.path, s3_bucket_name, bucket_prefix)
            continue

        _, dot, suffix = entry.name.rpartition(".")
        records.append(
            dict(
                execution_id=execution_id,
                s3_bucket_name=s3_bucket_name,
                bucket_prefix=bucket_prefix,
                path=entry.path,
                file_name=entry.name,
                last_modified_time=datetime.fromtimestamp(entry.modificationTime / 1000),
                size=entry.size,
                extension=suffix if dot else None,
            )
        )
    return records

In [0]:
# if file_exists_in_volume(f"{dataset_mapping_file_location}/{dataset_mapping_file}"):
#   print("test")

In [0]:
# Schema of the dataset-mapping CSV: five nullable string columns followed by a
# nullable load timestamp.
dataset_mapping_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "dataset_name",
            "s3_bucket_name",
            "bucket_prefix",
            "dbx_catalog",
            "dbx_managed_table_schema",
        )
    ]
    + [StructField("datetime", TimestampType(), True)]
)

# Start from an empty mapping so later cells still run when no mapping file is present.
df_dataset_mapping = spark.createDataFrame([], dataset_mapping_schema)

def file_exists_in_volume(file_path: str) -> bool:
    """Report whether ``file_path`` appears in the listing of its parent folder.

    The path is split at its last "/"; the left part is listed with
    ``dbutils.fs.ls`` and the right part is compared against the entry names.
    Any failure (no "/" in the path, folder cannot be listed, ...) is treated
    as "file not present".

    Args:
        file_path: Full path of the file to look for.

    Returns:
        True when an entry with that exact name is listed, otherwise False.
    """
    try:
        parent_dir, target_name = file_path.rsplit("/", 1)
        listed_names = {entry.name for entry in dbutils.fs.ls(parent_dir)}
    except Exception:
        return False
    return target_name in listed_names


# Load the dataset-mapping CSV (when present) and register mappings that are
# not yet in the mapping table.
dataset_mapping_file_path = f"{dataset_mapping_file_location}/{dataset_mapping_file}"

if file_exists_in_volume(dataset_mapping_file_path):
    # The CSV is read with the fixed schema; its "datetime" column is replaced
    # by the load time.
    df_dataset_mapping = (
        spark.read
        .option("header", "true")
        .schema(dataset_mapping_schema)
        .csv(dataset_mapping_file_path)
        .withColumn("datetime", current_timestamp())
    )

    display(df_dataset_mapping)

    # Keep the DeltaTable handle under its own name. Rebinding
    # `dataset_mapping_table` (the table-name string) to the handle made this
    # cell fail on a second run, because DeltaTable.forName then received a
    # DeltaTable instead of a name.
    dataset_mapping_delta = DeltaTable.forName(spark, dataset_mapping_table)

    dataset_mapping_insert_cols = ["dataset_name", "s3_bucket_name", "bucket_prefix", "dbx_catalog", "dbx_managed_table_schema", "datetime"]

    # Insert-only merge: rows already present (same dataset, bucket and prefix)
    # are left untouched.
    dataset_mapping_delta.alias("t").merge(
        df_dataset_mapping.alias("s"),
        "t.dataset_name = s.dataset_name AND "
        "t.s3_bucket_name = s.s3_bucket_name AND "
        "t.bucket_prefix = s.bucket_prefix",
    ).whenNotMatchedInsert(
        values={column_name: f"s.{column_name}" for column_name in dataset_mapping_insert_cols}
    ).execute()


dataset_name,s3_bucket_name,bucket_prefix,dbx_catalog,dbx_managed_table_schema,datetime
lda_taser_txns_shrt_wndw_daily,app-id-89055-dep-id-109792-uu-id-n6ph64imx36e,trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/,89055_ctg_prod_exp,dbx_89055_trusted_db_mdas_ais_hcd_dora_fdl_prod_exp,2025-12-04T17:42:30.621505Z


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType


# final_file_list =[]
# Prefixes for which new files were appended to the inventory table in this
# run; the candidate-table cell iterates over this list.
prefixes = []
df_dataset_inventory = None

# Explicit schema for the records produced by list_s3_files_recursive. Letting
# Spark infer it fails when the listing is empty or when every "extension" is
# None (the column type cannot be determined).
s3_listing_schema = StructType([
    StructField("execution_id", StringType(), True),
    StructField("s3_bucket_name", StringType(), True),
    StructField("bucket_prefix", StringType(), True),
    StructField("path", StringType(), True),
    StructField("file_name", StringType(), True),
    StructField("last_modified_time", TimestampType(), True),
    StructField("size", LongType(), True),
    StructField("extension", StringType(), True),
])
s3_listing_fields = s3_listing_schema.fieldNames()

for mapping_row in df_dataset_mapping.select("s3_bucket_name", "bucket_prefix").distinct().collect():
    bucket_name = mapping_row["s3_bucket_name"]
    bucket_prefix = mapping_row["bucket_prefix"]

    try:
        print(f"Processing dataset: {bucket_prefix}")
        path = f"s3://{bucket_name}/{bucket_prefix}"
        file_list = list_s3_files_recursive(path, bucket_name, bucket_prefix)

        # Nothing listed (empty or unreadable prefix): skip instead of failing
        # inside createDataFrame.
        if not file_list:
            print(f"No new files to process for  {bucket_prefix}")
            continue

        s3_files_df = spark.createDataFrame(
            [tuple(record[name] for name in s3_listing_fields) for record in file_list],
            schema=s3_listing_schema,
        )

        # Incremental load: keep only files modified after the newest file
        # already recorded for this prefix. The DataFrame API is used so the
        # prefix is never spliced into SQL text.
        max_last_modfied_file_time = (
            spark.table(s3_inventory_table)
            .filter(col("bucket_prefix") == bucket_prefix)
            .agg(F.max("last_modified_time"))
            .first()[0]
        )
        print(max_last_modfied_file_time)

        if max_last_modfied_file_time is not None:
            s3_files_df = s3_files_df.filter(col("last_modified_time") > max_last_modfied_file_time)

        new_file_count = s3_files_df.count()
        print(new_file_count)
        if new_file_count == 0:
            print(f"No new files to process for  {bucket_prefix}")
            continue

        # Object key = path with the "s3://<bucket>/" part removed.
        df_dataset_inventory = s3_files_df.withColumn(
            "key",
            expr("regexp_replace(path, concat('s3://', s3_bucket_name, '/'), '')")
        )

        # Keep only rows whose (bucket, prefix) is present in the mapping.
        # left_semi filters without multiplying rows, whereas an inner join
        # duplicated every file when one (bucket, prefix) pair was mapped to
        # several datasets.
        df_dataset_inventory_filtered = df_dataset_inventory.alias("inventory").join(
            df_dataset_mapping.alias("mapping"),
            (col("inventory.s3_bucket_name") == col("mapping.s3_bucket_name")) &
            (col("inventory.bucket_prefix") == col("mapping.bucket_prefix")),
            "left_semi",
        )

        # Names of all "<name>=" path segments found in the lower-cased key.
        partition_expr = regexp_extract_all(lower(col("key")), lit(r"([a-zA-Z0-9_]+)="))

        # partition_key: comma-separated partition names, or NULL when there are none.
        # edp_run_id / snapshot_date: value following the matching "<name>=" segment.
        df_dataset_inventory_filtered = (
            df_dataset_inventory_filtered
            .withColumn("partition_key", when(size(partition_expr) > 0, array_join(partition_expr, ", ")).otherwise(lit(None)))
            .withColumn("edp_run_id", F.regexp_extract(lower(F.col("key")), r"edp_run_id=([^/]+)", 1))
            .withColumn("snapshot_date", F.regexp_extract(lower(F.col("key")), r"snapshot_date=([^/]+)", 1))
            .select("execution_id", "s3_bucket_name", "bucket_prefix", "key", "extension", "size", "last_modified_time", "partition_key", "edp_run_id", "snapshot_date")
            .withColumn("snapshot_date", col("snapshot_date").cast(DateType()))
        )

        df_dataset_inventory_filtered.write \
            .mode("append") \
            .format("delta") \
            .saveAsTable(s3_inventory_table)

        print(f"Processing dataset completed for: {bucket_prefix}")
        prefixes.append(bucket_prefix)
    except Exception as e:
        # Keep going with the remaining datasets, but report why this one failed.
        print(f"Error Processing dataset for {bucket_prefix}: {e}")

if not prefixes:
    print("No new records for any of the datasets")
else:
    print(f"Inventory table processing completed for : {prefixes}")
    
#prefixes = [row.bucket_prefix for row in df_dataset_inventory_filtered.select("bucket_prefix").distinct().collect()]
#print(prefixes)

Processing dataset: trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/edp_run_id=00429282-c29b-4958-b393-21278f03f198/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/edp_run_id=00429282-c29b-4958-b393-21278f03f198/snapshot_date=2021-10-08/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/edp_run_id=007bf865-f463-42e5-a4e1-7b1a5d9a9c87/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/edp_run_id=007bf865-f463-42e5-a4e1-7b1a5d9a9c87/snapshot_date=2021-03-20/
s3://app-id-89055-dep-id-109792-uu-id-n6ph64imx36e/trusted/analytics/data_dlvr/lda_taser_txns_shrt_wndw_daily/edp_run_id=00

In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
import math

# Initialize Delta table for candidate operations
candidate_table_delta = DeltaTable.forName(spark, candidate_table)

# For every prefix that received new inventory rows, recompute file metrics
# over the rows whose load_status is still NULL, then update the open candidate
# record (candidate_for_managed_table_creation IS NULL) or insert a new one.
for prefix in prefixes:
    print(f"Processing candidate table for prefix: {prefix}")

    try:
        # Read inventory and filter by prefix with load_status IS NULL.
        # The DataFrame API avoids splicing the prefix into SQL text, which
        # broke on prefixes containing a quote.
        df_pending = (
            spark.table(s3_inventory_table)
            .filter((col("bucket_prefix") == prefix) & col("load_status").isNull())
            .select("s3_bucket_name", "bucket_prefix", "key", "extension", "size")
        )

        # Skip if no unprocessed records
        first_row = df_pending.first()
        if first_row is None:
            print(f"No records with null load_status for {prefix}")
            continue

        # "Structured" files are those that have a file extension.
        df_structured = df_pending.filter(col("extension").isNotNull())
        display(df_structured)

        # Extract s3_bucket_name and table name (last path component of the prefix)
        s3_bucket_name = first_row["s3_bucket_name"]
        table_name = os.path.basename(os.path.normpath(prefix))

        # Validate bucket and prefix
        if not s3_bucket_name or not prefix:
            print(f"ERROR: Invalid bucket or prefix")
            continue

        # One aggregation pass instead of two count() actions plus a sum():
        # count("extension") counts the non-null extensions, i.e. the rows of
        # df_structured.
        metrics_row = df_pending.agg(
            F.count(F.lit(1)).alias("total_file_count"),
            F.count("extension").alias("structured_file_count"),
            F.sum("size").alias("total_size_bytes"),
        ).first()

        total_file_count = metrics_row["total_file_count"]
        structured_file_count = metrics_row["structured_file_count"]
        total_size_bytes = metrics_row["total_size_bytes"] or 0
        table_file_size_mb = total_size_bytes / (1024 * 1024)

        # Handle edge cases
        if math.isnan(table_file_size_mb) or math.isinf(table_file_size_mb):
            table_file_size_mb = 0.0
        table_file_size_mb = round(table_file_size_mb, 2)

        print(f"Metrics calculated:")
        print(f"   Total files: {total_file_count}")
        print(f"   Structured files: {structured_file_count}")
        print(f"   Total size: {table_file_size_mb:.2f} MB")

        # Check if an open candidate record exists for this bucket/prefix
        has_open_candidate = (
            spark.table(candidate_table)
            .filter(
                (col("s3_bucket_name") == s3_bucket_name) &
                (col("bucket_prefix") == prefix) &
                (col("candidate_for_managed_table_creation").isNull())
            )
            .limit(1)
            .count()
            > 0
        )

        if has_open_candidate:
            # UPDATE existing record
            print(f"Updating existing candidate record for {prefix}")

            # Single-row source with an explicit schema for the merge
            source_schema = StructType([
                StructField("s3_bucket_name", StringType(), False),
                StructField("bucket_prefix", StringType(), False),
                StructField("new_total_file_count", LongType(), False),
                StructField("new_table_file_size_mb", DoubleType(), False),
                StructField("new_structured_file_count", LongType(), False)
            ])

            source_df = spark.createDataFrame(
                [(
                    s3_bucket_name,
                    prefix,
                    int(total_file_count),
                    float(table_file_size_mb),
                    int(structured_file_count),
                )],
                schema=source_schema,
            )

            candidate_table_delta.alias("t").merge(
                source_df.alias("s"),
                """t.s3_bucket_name = s.s3_bucket_name 
                   AND t.bucket_prefix = s.bucket_prefix 
                   AND t.candidate_for_managed_table_creation IS NULL"""
            ).whenMatchedUpdate(
                set={
                    "total_file_count": "s.new_total_file_count",
                    "table_file_size_mb": "s.new_table_file_size_mb",
                    "structured_file_count": "s.new_structured_file_count"
                }
            ).execute()

            print(f"Successfully updated candidate record for {prefix}")
        else:
            # INSERT new record
            print(f"Inserting new candidate record for {prefix}")

            # Get the exact schema from the candidate table
            candidate_schema = spark.table(candidate_table).schema

            # Values this cell knows how to fill; every other column of the
            # candidate table (including candidate_for_managed_table_creation)
            # is written as NULL.
            candidate_values = {
                "execution_id": execution_id,
                "s3_bucket_name": s3_bucket_name,
                "bucket_prefix": prefix,
                "table_name": table_name,
                "total_file_count": int(total_file_count),
                "table_file_size_mb": float(table_file_size_mb),
                "structured_file_count": int(structured_file_count),
            }
            row_data = {
                field.name: candidate_values.get(field.name)
                for field in candidate_schema.fields
            }

            # Create DataFrame with exact schema
            new_candidate_df = spark.createDataFrame([row_data], schema=candidate_schema)

            new_candidate_df.write \
                .format("delta") \
                .mode("append") \
                .saveAsTable(candidate_table)

            print(f"Successfully inserted candidate record for {prefix}")

        print(f"Completed candidate processing for {prefix}\n")

    except Exception as e:
        # Report the failure and move on to the next prefix.
        print(f"ERROR processing candidate for {prefix}:")
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("Candidate table processing completed!")


Candidate table processing completed!


In [0]:
# Move the processed dataset-mapping file into the archive location, tagging
# the file name with this run's execution id.
try:
    source_path = f"{dataset_mapping_file_location}/{dataset_mapping_file}"
    if file_exists_in_volume(source_path):
        # Insert the execution id before the extension. splitext touches only
        # the final extension, whereas str.replace(".csv", ...) rewrote every
        # ".csv" occurrence and left the name unchanged for other extensions
        # (e.g. ".CSV"), so successive archives collided.
        file_stem, file_ext = os.path.splitext(dataset_mapping_file)
        archive_file_name = f"{file_stem}_{execution_id}{file_ext}"
        archive_path = f"{archive_dataset_mapping_file_location}/{archive_file_name}"
        dbutils.fs.mv(source_path, archive_path)
except Exception as e:
    # Archiving is best-effort; report the failure with enough context to act on.
    print(f"Failed to archive dataset mapping file: {e}")