In [0]:
import uuid
from pyspark.sql.functions import pandas_udf, explode, udf,from_json
from pyspark.sql.types import ArrayType, StringType, DoubleType
import json
from pyspark.sql.functions import struct, flatten, lit, col,row_number, current_timestamp
from pyspark.sql import functions as F
from typing import Iterator, Tuple
import pandas as pd
import os
import hashlib
from pyspark.sql.functions import md5
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType
from datetime import datetime
from pyspark.sql import Window
from urllib.parse import unquote


In [0]:

jpmcEnv= False

if jpmcEnv:
    dbutils.widgets.text("catalog", "102354_c360_ctg_prod_exp")
    dbutils.widgets.text("schema", "ucd_data_migration")
    dbutils.widgets.text("s3_batch_copy_id", "UCD_Chase360_Wave1_S3_Copy", "Batch Copy Id to compare")
else:    
    dbutils.widgets.text("catalog", "users")
    dbutils.widgets.text("schema", "dom_rodrigues")
    dbutils.widgets.text("s3_batch_copy_id", "Test1", "Batch Copy Id to compare")

dbutils.widgets.dropdown("debug", "true", ["true", "false"])
dbutils.widgets.text("runId", "")
dbutils.widgets.text("checksumValidationPercentage", "5") #0.05 #5% of each file type

catalog = dbutils.widgets.get("catalog")
schema =  dbutils.widgets.get("schema")
runId =  dbutils.widgets.get("runId")

dbutils.widgets.dropdown("performChecksumValidation", "true", ["true", "false"])
performChecksumValidation =  True if dbutils.widgets.get("performChecksumValidation") == 'true' else False
SAMPLE_PERCENT_FOR_CHECKSUM_CHECK = float(dbutils.widgets.get("checksumValidationPercentage"))/100.0

runId = str(uuid.uuid4()) if len(runId.strip()) == 0 else runId

s3_batch_copy_id=dbutils.widgets.get("s3_batch_copy_id")

debug =  True if dbutils.widgets.get("debug") == 'true' else False
trace = True
createMetadataTables = False

spark.sql(f"use {catalog}.{schema}")

DataFrame[]

#### Test Data

In [0]:
if createMetadataTables:
  src_inventory_s3_location="s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY"
  src_bucket_name="databricks-e2demofieldengwest"
  src_prefix="dom_rodrigues/folder_1/"
  src_inventory_name="dom_rodrigues_inventory_test"

  dest_inventory_s3_location="s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY"
  dest_bucket_name="databricks-e2demofieldengwest"
  dest_prefix="dom_rodrigues/folder_2/"
  dest_inventory_name="dom_rodrigues_inventory_dest_test"

  data = [{
      "s3_batch_copy_id": s3_batch_copy_id,
      "src_inventory_s3_location": src_inventory_s3_location,
      "src_bucket_name": src_bucket_name,
      "src_prefix": src_prefix,
      "src_inventory_name": src_inventory_name,
      "dest_inventory_s3_location": dest_inventory_s3_location,
      "dest_bucket_name": dest_bucket_name,
      "dest_inventory_name": dest_inventory_name,
      "dest_prefix": dest_prefix,
      "last_recon_timestamp": datetime.now(),
      "runId": "0",
      "status": "",
      "recon_status" : "",
      "enabled": True
  }]

  # Create DataFrame from the list of dictionary
  df_s3_recon = spark.createDataFrame(data)
  df_s3_recon.display()

  df_s3_recon.write.option("mergeSchema", "true").mode("overwrite").format("delta").saveAsTable(f"{catalog}.{schema}.ucd_src_dest_mapping")

In [0]:
df_s3_recon = spark.read.table(f"{catalog}.{schema}.ucd_src_dest_mapping")
df_s3_recon = df_s3_recon.filter(f"s3_batch_copy_id = '{s3_batch_copy_id}' and enabled = 'true'")
df_s3_recon.display() if trace else None


dest_bucket_name,dest_inventory_name,dest_inventory_s3_location,dest_prefix,enabled,last_recon_timestamp,recon_status,runId,s3_batch_copy_id,src_bucket_name,src_inventory_name,src_inventory_s3_location,src_prefix,status
databricks-e2demofieldengwest,dom_rodrigues_inventory_dest_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_2/,True,2025-11-18T13:05:29.49168Z,"[""{""status"":""FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"",""count"":1}""]",50933e4f-de57-43b9-930d-604d898b2ddc,Test1,databricks-e2demofieldengwest,dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_1/,FAILED


#### Build Manifest metadata

In [0]:

to_prepend_1 = [StructField("src_manifest_path", StringType(), True), StructField("dest_manifest_path", StringType(), True)] 

s3_manifest_schema = StructType(df_s3_recon.schema.fields + to_prepend_1)

def build_s3_manifast_path_for_recon(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
   for pdf in iterator:
     #build manifest apth for both - src and destination
     src_inventory_s3_location = pdf["src_inventory_s3_location"][0]
     src_bucket_name = pdf["src_bucket_name"][0]
     src_prefix = pdf["src_prefix"][0]
     src_inventory_name = pdf["src_inventory_name"][0]
     dest_inventory_s3_location = pdf["dest_inventory_s3_location"][0]
     dest_bucket_name = pdf["dest_bucket_name"][0]
     dest_prefix = pdf["dest_prefix"][0]
     dest_inventory_name = pdf["dest_inventory_name"][0]
     pdf["src_manifest_path"] = f"{src_inventory_s3_location}/{src_bucket_name}/{src_inventory_name}"
     pdf["dest_manifest_path"] = f"{dest_inventory_s3_location}/{dest_bucket_name}/{dest_inventory_name}"
   yield pdf

df_s3_recon_with_manifest_path = df_s3_recon.mapInPandas(build_s3_manifast_path_for_recon, s3_manifest_schema)
df_s3_recon_with_manifest_path.display() if trace else None

dest_bucket_name,dest_inventory_name,dest_inventory_s3_location,dest_prefix,enabled,last_recon_timestamp,recon_status,runId,s3_batch_copy_id,src_bucket_name,src_inventory_name,src_inventory_s3_location,src_prefix,status,src_manifest_path,dest_manifest_path
databricks-e2demofieldengwest,dom_rodrigues_inventory_dest_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_2/,True,2025-11-18T13:05:29.49168Z,"[""{""status"":""FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"",""count"":1}""]",50933e4f-de57-43b9-930d-604d898b2ddc,Test1,databricks-e2demofieldengwest,dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_1/,FAILED,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test


In [0]:

from pyspark.sql.functions import col
src_s3_recon_df = df_s3_recon_with_manifest_path.select(col("src_bucket_name").alias("bucket_name"), col("src_prefix").alias("prefix"),col("src_inventory_name").alias("inventory_name"), col("src_inventory_s3_location").alias("inventory_s3_location"), col("src_manifest_path").alias("manifest_path"))

dest_s3_recon_df = df_s3_recon_with_manifest_path.select(col("dest_bucket_name").alias("bucket_name"),col("dest_prefix").alias("prefix"),col("dest_inventory_name").alias("inventory_name"), col("dest_inventory_s3_location").alias("inventory_s3_location"), col("dest_manifest_path").alias("manifest_path")) 

src_s3_recon_df.limit(10).display() if trace else None
dest_s3_recon_df.limit(10).display() if trace else None

bucket_name,prefix,inventory_name,inventory_s3_location,manifest_path
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test


bucket_name,prefix,inventory_name,inventory_s3_location,manifest_path
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,dom_rodrigues_inventory_dest_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test


### Find latest inventory folder for src and dest and save into inventory list table

In [0]:

inventory_reports_schema = StructType([
  StructField("file", StringType(), True),
  StructField("name", StringType(), True),
  StructField("size", DoubleType(), True),
  StructField("modificationTime", TimestampType(), True)])

import re
regex = r"^(.*)/(.*)/"

# path1= "s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/dom-rodrigues-us-east1-dbx-s3-1/dom_dbx_s3_1_notebooks/2025-10-18T01-00Z/"
#print(re.search(regex, path1).group(1))


def get_inventory_list(s3_recon_df):

  def get_list(manifest_path):
    inventory_reports_list = dbutils.fs.ls( manifest_path)

    inventory_reports = [{"file":re.search(regex, file.path).group(1), "name": file.name, "size": file.size, "modificationTime": int(file.modificationTime/1000)} for file in inventory_reports_list if file.name not in ["hive/", "data/"]]
    json_dump =  json.dumps(inventory_reports) #for arrow
    return json_dump


  inventory_df = pd.DataFrame(columns=s3_recon_df.columns)

  for index, row in s3_recon_df.toPandas().iterrows():
    list_of_files = get_list(row["manifest_path"])
    row["list_of_files"] = list_of_files 
    inventory_df = pd.concat([inventory_df, pd.DataFrame([row])], ignore_index=True)

  inventory_df_spark = spark.createDataFrame(inventory_df)
  return inventory_df_spark 

In [0]:
def recon_inventory_files_with_selected_column(s3_recon_df):

  recon_df_with_list = get_inventory_list(s3_recon_df) 
  recon_df_with_list_exploded =  recon_df_with_list.withColumn("inventory_reports", explode(from_json("list_of_files", ArrayType(StringType()))))

  recon_df_with_selected_columns =  recon_df_with_list_exploded\
    .withColumn("inventory_reports_details", from_json(col("inventory_reports"), inventory_reports_schema))\
    .select("bucket_name", "prefix", "inventory_s3_location",  "inventory_reports_details.file", "inventory_reports_details.size", "inventory_reports_details.name", "inventory_reports_details.modificationTime")\
    .withColumnRenamed("file", "path")

  return recon_df_with_selected_columns

In [0]:

# Pick latest after stripping the date from path s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/dom-rodrigues-us-east1-dbx-s3-1/dom_dbx_s3_1_notebooks/2025-10-18T01-00Z/

src_recon_df_with_selected_columns = recon_inventory_files_with_selected_column(src_s3_recon_df)
dest_recon_df_with_selected_columns = recon_inventory_files_with_selected_column(dest_s3_recon_df)

#.withColumn("stripped_path", F.regexp_extract(F.col("path"), "^(.*)/(.*)/", 1))\

recon_df_with_selected_columns = src_recon_df_with_selected_columns.unionByName(dest_recon_df_with_selected_columns)\
  .withColumn("Rank",  row_number().over(Window.partitionBy("path").orderBy(F.desc("name"))))\
  .filter(col("Rank") == 1).drop("Rank")

recon_df_with_selected_columns.display() if trace else None


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z


In [0]:
if createMetadataTables:
  #spark.sql(f"drop table if exists {catalog}.{schema}.ucd_s3_inventory_list")
  recon_df_with_selected_columns.withColumn("processed", lit("false")).withColumn("runId", lit("")).limit(0).write.option("mergeSchema", "true").mode("overwrite").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_list")
spark.sql(f"select *  from {catalog}.{schema}.ucd_s3_inventory_list").limit(2).display() if trace else None

bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-14T01-00Z/,2025-11-14T21:22:54Z,True,da91bd31-9c6d-4b8f-8519-7c4019e2acf2
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-14T01-00Z/,2025-11-14T21:22:54Z,True,da91bd31-9c6d-4b8f-8519-7c4019e2acf2


In [0]:
recon_df_with_selected_columns.display() if trace else None
existing_inventory_list_df = spark.sql(f"select *  from {catalog}.{schema}.ucd_s3_inventory_list")
existing_inventory_list_df.filter(col("processed") == "false").display() if trace else None


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId


In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import lit, concat, col

delta_tgt = DeltaTable.forName(spark, f"{catalog}.{schema}.ucd_s3_inventory_list")
delta_tgt.toDF().filter(col("processed") == "false").display()

delta_tgt.alias("tgt").merge(
   recon_df_with_selected_columns.withColumn("processed", lit("false")).alias("src")\
       .withColumn("runId", F.lit(runId)),
   "tgt.bucket_name = src.bucket_name and tgt.prefix = src.prefix and tgt.path == src.path"
) .whenMatchedUpdate(condition="tgt.processed =='false'", set={"processed": lit("ignored"), "runId": F.concat(col("tgt.runId"), F.lit(":"),lit(runId))}) \
.execute()

delta_tgt.toDF().filter(col("processed") == "false").display()
delta_tgt.alias("tgt").merge(
   recon_df_with_selected_columns.withColumn("processed", lit("false")).alias("src")\
       .withColumn("runId", F.lit(runId)),
   "tgt.bucket_name = src.bucket_name and tgt.prefix = src.prefix and tgt.path == src.path and tgt.processed == false"
).whenNotMatchedInsertAll() \
.execute()

delta_tgt.toDF().filter(col("processed") == "false").display()



bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8


In [0]:
s3_inventory_manifest_df = delta_tgt.toDF()
s3_inventory_manifest_df.filter(col("processed") == "false").limit(2).display() if trace else None

bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8


In [0]:
s3_inventory_files_to_process = s3_inventory_manifest_df.filter(col("processed") == "false")
if (s3_inventory_files_to_process.count() == 0):
  message = f"No new inventory files to process  for s3_batch_copy_id {s3_batch_copy_id} using  runId {runId}"
  raise Exception(json.dumps( {"status": "WARNING", "message": message}))
else:
  s3_inventory_files_to_process.display() if trace else None  

bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8


### For the latest inventory, read gz file and build inventory list_details table

In [0]:
print(f"Starting recon process for {s3_batch_copy_id} using runId {runId}")

Starting recon process for Test1 using runId 2e952ce4-94e4-49f7-83c7-8dc5776331e8


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

def read_manifest_json(manifest_path_json):
  df = spark.read.option("multiline", "true").json(manifest_path_json)
  file_format = df.select("fileFormat").first()["fileFormat"]
  file_schema = df.select("fileSchema").first()["fileSchema"]

  # Explode the files array to get individual file rows
  files_df = df.select(explode("files").alias("file"))
  files_df = files_df.select(
    col("file.key").alias("key"),
    col("file.size").alias("size"),
    col("file.MD5checksum").alias("md5")
  )

  # Collect file paths as a Python list
  file_keys = [row["key"] for row in files_df.collect()]

  print("File Format:", file_format)
  print("File Keys:", file_keys)
  return {"file_key": file_keys[0], "file_format": file_format, "file_schema": file_schema}


#convert to pandas dataframe to use spark context
s3_inventory_manifest_df_result = pd.DataFrame(columns=s3_inventory_files_to_process.columns)

for index, row in s3_inventory_files_to_process.toPandas().iterrows():
    try: 
      row["file_info"] = read_manifest_json(row["path"] + "/" +  row["name"] + "/manifest.json")
      s3_inventory_manifest_df_result = pd.concat([s3_inventory_manifest_df_result, pd.DataFrame([row])], ignore_index=True)
    except Exception as e:
      print(e)  

s3_inventory_manifest_df_result_spark = spark.createDataFrame(s3_inventory_manifest_df_result)
s3_inventory_manifest_df_result_spark.display() if debug else None      

File Format: CSV
File Keys: ['dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test/data/b381e4d9-de6e-4fd5-a82f-d44f2e9563a3.csv.gz']
File Format: CSV
File Keys: ['dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test/data/8edf25fb-1f97-4922-a222-5eab412a754e.csv.gz']


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId,file_info
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8,"List(CSV, dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test/data/b381e4d9-de6e-4fd5-a82f-d44f2e9563a3.csv.gz, Bucket, Key, Size, LastModifiedDate, ETag, StorageClass, IsMultipartUploaded, ReplicationStatus, IntelligentTieringAccessTier, BucketKeyStatus, ChecksumAlgorithm)"
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8,"List(CSV, dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test/data/8edf25fb-1f97-4922-a222-5eab412a754e.csv.gz, Bucket, Key, Size, LastModifiedDate, ETag, StorageClass, IsMultipartUploaded, ReplicationStatus, IntelligentTieringAccessTier, BucketKeyStatus, ChecksumAlgorithm)"


In [0]:
# file_schema = "Bucket, Key, Size, LastModifiedDate, ETag, StorageClass, IsMultipartUploaded, ReplicationStatus, IntelligentTieringAccessTier, BucketKeyStatus, ChecksumAlgorithm"
# customSchema =  StructType([StructField(field.strip(), StringType(), True) for field in file_schema.split(",")])

#   # Path to the gzipped CSV file
# gz_path = "s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test/data/e63fff2f-1b6f-4eaa-8014-5c561205c074.csv.gz"
# dbutils.fs.ls(gz_path)
# df = spark.read.option("header", True).schema(customSchema).csv(gz_path)
# df.printSchema()
# df.display()

In [0]:

def read_gz_csv(row):
  inventory_s3_location = row["inventory_s3_location"] #s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY
  bucket_name = row["bucket_name"]
  prefix = row["prefix"]
              
  gz_csv_path = row["file_info"]["file_key"] #"INVENTORY/dom-rodrigues-us-east1-dbx-s3-1/dom_dbx_s3_1_notebooks/data/1fabdf54-8a55-4e97-af78-69a147d2fc71.csv.gz"
  file_format = row["file_info"]["file_format"]
  file_schema = row["file_info"]["file_schema"]
  customSchema =  StructType([StructField(field.strip(), StringType(), True) for field in file_schema.split(",")])

  # Path to the gzipped CSV file
  if jpmcEnv:
    gz_path = f'{inventory_s3_location}/{gz_csv_path}' #for JPMC
  else:  
    gz_path = f's3://{bucket_name}/{gz_csv_path}' #for DBX
  #
  print(gz_path) if debug else None

  if file_format == "CSV":
      df = spark.read.option("header", True).schema(customSchema).csv(gz_path)
  elif file_format == "Parquet":
      df = spark.read.parquet(*gz_path)
  elif file_format == "ORC":
      df = spark.read.orc(*gz_path)

  # Read gzipped CSV directly into DataFrame
  if df.isEmpty() == False:

    #Update prior records as ignored for this path
    path_ = row["path"]
    name_ = row["name"]
    spark.sql(f"UPDATE {catalog}.{schema}.ucd_s3_inventory_details set processed= 'ignore' \
     where bucket = '{bucket_name}' and inventory_s3_location = '{inventory_s3_location}' and path = '{path_}' and processed = 'false'")    
    
    if "IsLatest" in df.columns:
      df = df.filter((col('IsLatest') == True) | col('IsLatest').isNull() | (col('IsLatest') == ''))

    df_filtered_for_given_prefix = df.filter(col("Key").startswith(prefix))

    df_filtered_for_given_prefix.limit(2).display() if trace else None

    df_filtered_for_given_prefix.withColumn("inventory_s3_location", lit(row["inventory_s3_location"]))\
      .withColumn("path", lit(row["path"]))\
      .withColumn("name", lit(row["name"]))\
      .withColumn("prefix", lit(row["prefix"]))\
      .withColumn("processed", lit("false"))\
      .withColumn("runId", lit(runId))\
      .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_details")
    return True
  else:
    return False



In [0]:

# file="s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/../INVENTORY/dom-rodrigues-us-east1-dbx-s3-1/dom_dbx_s3_1_notebooks/data/1fabdf54-8a55-4e97-af78-69a147d2fc71.csv.gz"
# df = spark.read.option("header", True).csv(file)
# df.count() if debug else None

In [0]:
gz_csv_df_result = pd.DataFrame(columns=s3_inventory_manifest_df_result_spark.columns)

#convert into pandas to use dbutils
for index, row in s3_inventory_manifest_df_result_spark.toPandas().iterrows():
    try: 
      row["result"] = read_gz_csv(row)
      gz_csv_df_result = pd.concat([gz_csv_df_result, pd.DataFrame([row])], ignore_index=True)
    except Exception as e:
      print(e)  

gz_csv_df_result.display() if debug else None  

s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test/data/b381e4d9-de6e-4fd5-a82f-d44f2e9563a3.csv.gz


Bucket,Key,Size,LastModifiedDate,ETag,StorageClass,IsMultipartUploaded,ReplicationStatus,IntelligentTieringAccessTier,BucketKeyStatus,ChecksumAlgorithm
databricks-e2demofieldengwest,dom_rodrigues/folder_1/31932d15-5e95-4787-9224-7763ad8409a1.csv,105841,2025-11-14T22:50:26.000Z,ab89d01f040a09a02a476a9643fcc764,STANDARD,False,,,DISABLED,CRC64NVME
databricks-e2demofieldengwest,dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-17/,0,2025-07-16T21:43:19.000Z,d41d8cd98f00b204e9800998ecf8427e,STANDARD,False,,,DISABLED,CRC64NVME


s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test/data/8edf25fb-1f97-4922-a222-5eab412a754e.csv.gz


Bucket,Key,Size,LastModifiedDate,ETag,StorageClass,IsMultipartUploaded,ReplicationStatus,IntelligentTieringAccessTier,BucketKeyStatus,ChecksumAlgorithm
databricks-e2demofieldengwest,dom_rodrigues/folder_2/31932d15-5e95-4787-9224-7763ad8409a1.csv,105841,2025-11-14T22:47:10.000Z,ab89d01f040a09a02a476a9643fcc764,STANDARD,False,,,DISABLED,CRC64NVME
databricks-e2demofieldengwest,dom_rodrigues/folder_2/XGBoostTrainer_2025-07-16_21-43-17/,0,2025-11-03T20:55:02.000Z,d41d8cd98f00b204e9800998ecf8427e,STANDARD,False,,,DISABLED,CRC64NVME


bucket_name,prefix,inventory_s3_location,path,size,name,modificationTime,processed,runId,file_info,result
databricks-e2demofieldengwest,dom_rodrigues/folder_1/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:52Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8,"List(CSV, dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_test/data/b381e4d9-de6e-4fd5-a82f-d44f2e9563a3.csv.gz, Bucket, Key, Size, LastModifiedDate, ETag, StorageClass, IsMultipartUploaded, ReplicationStatus, IntelligentTieringAccessTier, BucketKeyStatus, ChecksumAlgorithm)",True
databricks-e2demofieldengwest,dom_rodrigues/folder_2/,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test,0.0,2025-11-17T01-00Z/,2025-11-18T13:07:53Z,False,2e952ce4-94e4-49f7-83c7-8dc5776331e8,"List(CSV, dom_rodrigues/INVENTORY/databricks-e2demofieldengwest/dom_rodrigues_inventory_dest_test/data/8edf25fb-1f97-4922-a222-5eab412a754e.csv.gz, Bucket, Key, Size, LastModifiedDate, ETag, StorageClass, IsMultipartUploaded, ReplicationStatus, IntelligentTieringAccessTier, BucketKeyStatus, ChecksumAlgorithm)",True


#### Compare inventory details using batch copy id

### Create recon table using source and destination prefix

In [0]:
src_df_s3_recon = df_s3_recon.select(col("src_bucket_name").alias("bucket_name"), col("src_inventory_name").alias("inventory_name"), col("src_inventory_s3_location").alias("inventory_s3_location"), col("src_prefix").alias("prefix"), col("s3_batch_copy_id"), lit(runId).alias("runId"))
                                     
src_df_s3_recon.display() if trace else None
dest_df_s3_recon = df_s3_recon.select(col("dest_bucket_name").alias("bucket_name"), col("dest_inventory_name").alias("inventory_name"), col("dest_inventory_s3_location").alias("inventory_s3_location"), col("dest_prefix").alias("prefix"), col("s3_batch_copy_id"), lit(runId).alias("runId"))

dest_df_s3_recon.display() if trace else None



bucket_name,inventory_name,inventory_s3_location,prefix,s3_batch_copy_id,runId
databricks-e2demofieldengwest,dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8


bucket_name,inventory_name,inventory_s3_location,prefix,s3_batch_copy_id,runId
databricks-e2demofieldengwest,dom_rodrigues_inventory_dest_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8


In [0]:
ucd_s3_inventory_details_df_all = spark.sql(f"select * from {catalog}.{schema}.ucd_s3_inventory_details where processed = 'false'")
ucd_s3_inventory_details_df_src =  ucd_s3_inventory_details_df_all.alias("all").join(src_df_s3_recon.alias("src"), (col("all.Bucket") == col("src.bucket_name")) & (col("all.prefix") == col("src.prefix")) & (col("all.inventory_s3_location") == col("src.inventory_s3_location"))).select(col("all.Bucket"), F.regexp_replace(col("all.Key"), F.concat(F.lit("^"), col("all.prefix")), "").alias("Key"), col("Size"),  col("ETag"), col('all.prefix'), col("src.s3_batch_copy_id"), col("all.runId"), col("all.lastModifiedDate"))

ucd_s3_inventory_details_df_src.limit(5).display() if trace else None

ucd_s3_inventory_details_df_dest =  ucd_s3_inventory_details_df_all.alias("all").join(dest_df_s3_recon.alias("dest"), (col("all.Bucket") == col("dest.bucket_name")) & (col("all.prefix") == col("dest.prefix")) & (col("all.inventory_s3_location") == col("dest.inventory_s3_location"))).select(col("all.Bucket"), F.regexp_replace(col("all.Key"), F.concat(F.lit("^"), col("all.prefix")), "").alias("Key"), col("Size"),  col("ETag"), col('all.prefix'), col("dest.s3_batch_copy_id"), col("all.runId"), col("all.lastModifiedDate"))

ucd_s3_inventory_details_df_dest.limit(5).display() if trace else None


Bucket,Key,Size,ETag,prefix,s3_batch_copy_id,runId,lastModifiedDate
databricks-e2demofieldengwest,31932d15-5e95-4787-9224-7763ad8409a1.csv,105841,ab89d01f040a09a02a476a9643fcc764,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-14T22:50:26.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-07-16T21:43:19.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/.validate_storage_marker,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-07-16T21:43:19.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/XGBoostTrainer_e32ff_00000_0_2025-07-16_21-43-21/,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-07-16T21:43:22.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/basic-variant-state-2025-07-16_21-43-21.json,6905,893385a5aa2cbabf348c171b4d74989c,dom_rodrigues/folder_1/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-07-16T22:08:20.000Z


Bucket,Key,Size,ETag,prefix,s3_batch_copy_id,runId,lastModifiedDate
databricks-e2demofieldengwest,31932d15-5e95-4787-9224-7763ad8409a1.csv,105841,ab89d01f040a09a02a476a9643fcc764,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-14T22:47:10.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-03T20:55:02.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/.validate_storage_marker,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-05T19:40:25.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/XGBoostTrainer_e32ff_00000_0_2025-07-16_21-43-21/,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-03T22:41:14.000Z
databricks-e2demofieldengwest,XGBoostTrainer_2025-07-16_21-43-17/XGBoostTrainer_e32ff_00000_0_2025-07-16_21-43-21/XGBoostTrainer_e32ff_00000_0_2025-07-16_21-43-21/,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_2/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,2025-11-05T19:40:58.000Z


In [0]:
match_columns = [  'Key','s3_batch_copy_id', 'runId']

existsInSrcAndNotInDestination_files_and_folders = ucd_s3_inventory_details_df_src.join(ucd_s3_inventory_details_df_dest, on=match_columns, how='left_anti')
#depends on how files were copied, folders may not be reported as key objects in destination via inventory service - filter those entries
existsInSrcAndNotInDestination = existsInSrcAndNotInDestination_files_and_folders.filter(~col("Key").endswith("/")).withColumn("status", lit("EXISTS_IN_SRC_AND_BUT_NOT_IN_DEST"))
existsInSrcAndNotInDestination.limit(10).display() if debug else None

existsInDestAndNotInSource_files_and_folders = ucd_s3_inventory_details_df_dest.join(ucd_s3_inventory_details_df_src, on=match_columns, how='left_anti')
#depends on how files were copied, folders may not be reported as key objects in destination via inventory service - filter those entries
existsInDestAndNotInSource = existsInDestAndNotInSource_files_and_folders.filter(~col("Key").endswith("/")).withColumn("status", lit("EXISTS_IN_DEST_AND_BUT_NOT_IN_SRC"))
existsInDestAndNotInSource.limit(10).display() if trace else None

detailed_match_columns = [ 'Key', 's3_batch_copy_id', 'runId']

filesExistsInSrcButNewerThanDestination = ucd_s3_inventory_details_df_src.alias("src").join(ucd_s3_inventory_details_df_dest.alias("dest"), on=detailed_match_columns, how='inner').filter((col("src.LastModifiedDate") > col("dest.LastModifiedDate"))).select("src.*").withColumn("status", lit("FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"))
filesExistsInSrcButNewerThanDestination.limit(10).display() if trace else None

existsInSrcAndDestinationAndButNotSame = ucd_s3_inventory_details_df_src.alias("src").join(ucd_s3_inventory_details_df_dest.alias("dest"), on=detailed_match_columns, how='inner').filter((col("dest.LastModifiedDate") >= col("src.LastModifiedDate")) & (col("src.size") != col("dest.size")) ).select("src.*").withColumn("status", lit("FILES_IN_DEST_ARE_NEWER_THAN_SRC_AND_NOT_OF_SAME_SIZE"))
# We can't use ETag at JPMC due to KMS | (col("src.ETag") != col("dest.ETag")), using calculated checksum instead
existsInSrcAndDestinationAndButNotSame.limit(10).display() if trace else None

existsInSrcAndDestinationAndSame = ucd_s3_inventory_details_df_src.alias("src").join(ucd_s3_inventory_details_df_dest.alias("dest"), on=detailed_match_columns, how='inner').filter((col("dest.LastModifiedDate") >= col("src.LastModifiedDate")) & (col("src.size") == col("dest.size"))).select("src.*", col("dest.Bucket").alias("dest_Bucket"), col("dest.Prefix").alias("dest_prefix")).withColumn("status", lit("EXISTS_IN_SRC_AND_DEST_SAME"))
existsInSrcAndDestinationAndSame.limit(10).display() if trace else None




Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,status


Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,status


Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,status
31932d15-5e95-4787-9224-7763ad8409a1.csv,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,105841,ab89d01f040a09a02a476a9643fcc764,dom_rodrigues/folder_1/,2025-11-14T22:50:26.000Z,FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION


Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,status


Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,dest_Bucket,dest_prefix,status
XGBoostTrainer_2025-07-16_21-43-17/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:19.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/.validate_storage_marker,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:19.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/XGBoostTrainer_e32ff_00000_0_2025-07-16_21-43-21/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:22.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/basic-variant-state-2025-07-16_21-43-21.json,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,6905,893385a5aa2cbabf348c171b4d74989c,dom_rodrigues/folder_1/,2025-07-16T22:08:20.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/experiment_state-2025-07-16_21-43-21.json,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,5538,facb8e70f8d4f70f01adbbcb897b48d7,dom_rodrigues/folder_1/,2025-07-16T22:08:20.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/trainer.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,2200,66edcb94f0a2af6190c965b06635b2ec,dom_rodrigues/folder_1/,2025-07-16T21:43:19.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-17/tuner.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,1242,7b6ffec15856aa77fd72914d5b89467b,dom_rodrigues/folder_1/,2025-07-16T21:43:19.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-18/.validate_storage_marker,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:24.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-18/XGBoostTrainer_e5f11_00000_0_2025-07-16_21-43-28/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:29.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME
XGBoostTrainer_2025-07-16_21-43-18/XGBoostTrainer_e5f18_00000_0_2025-07-16_21-43-28/,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,0,d41d8cd98f00b204e9800998ecf8427e,dom_rodrigues/folder_1/,2025-07-16T21:43:29.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME


In [0]:
if createMetadataTables:
  spark.sql(f"drop table if exists {catalog}.{schema}.ucd_s3_inventory_recon")
  existsInSrcAndNotInDestination.withColumnRenamed("bucket", "src_bucket").withColumnRenamed("prefix", "src_prefix").withColumn("time_stamp", current_timestamp()).withColumn("dest_bucket", F.lit("")).withColumn("dest_prefix", F.lit("")).limit(0).write.option("mergeSchema", "true").mode("overwrite").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")
spark.sql(f"select *  from {catalog}.{schema}.ucd_s3_inventory_recon")

DataFrame[Key: string, s3_batch_copy_id: string, runId: string, src_bucket: string, Size: string, ETag: string, src_prefix: string, status: string, time_stamp: timestamp, dest_bucket: string, dest_prefix: string, dest_bucket_name: string, lastModifiedDate: string, sync_to_dest_bucket_status: string]

In [0]:

if performChecksumValidation:
    @pandas_udf('String')
    def getFilePath(file_key: pd.Series) -> pd.Series:
        return file_key.apply(lambda x: os.path.splitext(x)[-1] if '.' in x else None)

    existsInSrcAndDestinationWithFileType = existsInSrcAndDestinationAndSame.withColumn("file_type", getFilePath(existsInSrcAndDestinationAndSame.Key )).filter((col("file_type").isNotNull()) & (col("file_type") != "validate_storage_marker")).filter(col("Size") > 0)

    file_types = [row.file_type for row in existsInSrcAndDestinationWithFileType.select('file_type').distinct().collect()]
    fractions = {'csv': SAMPLE_PERCENT_FOR_CHECKSUM_CHECK, 'json': SAMPLE_PERCENT_FOR_CHECKSUM_CHECK, 'parquet': SAMPLE_PERCENT_FOR_CHECKSUM_CHECK}
    default_fraction = SAMPLE_PERCENT_FOR_CHECKSUM_CHECK
    fractions_full = {ft: fractions.get(ft, default_fraction) for ft in file_types}

    existsInSrcAndDestinationWithFileType_sample = existsInSrcAndDestinationWithFileType.sampleBy("file_type", fractions_full, seed=42)
    existsInSrcAndDestinationWithFileType_sample.limit(10).display() if trace else None

Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,dest_Bucket,dest_prefix,status,file_type
XGBoostTrainer_2025-07-16_21-46-02/trainer.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,2200,6697789a2e8707f5087b201a5bc84afe,dom_rodrigues/folder_1/,2025-07-16T21:46:03.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.pkl
XGBoostTrainer_2025-07-16_21-48-54/tuner.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,1242,f913dd4db0f29e338b50df789a4ede3f,dom_rodrigues/folder_1/,2025-07-16T21:48:55.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.pkl
XGBoostTrainer_2025-07-17_21-58-17/trainer.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,2200,d60d8b57cd541d7d03a5b6c3e08ba5e8,dom_rodrigues/folder_1/,2025-07-17T21:58:18.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.pkl
XGBoostTrainer_2025-07-24_13-00-39/experiment_state-2025-07-24_13-00-42.json,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,5538,3c93529ff6f4eeaca92e5251709e4f7d,dom_rodrigues/folder_1/,2025-07-24T13:32:53.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.json
XGBoostTrainer_2025-07-24_15-41-40/tuner.pkl,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,1242,bbffc89f04b606f37e805a6ca9b62dfd,dom_rodrigues/folder_1/,2025-07-24T15:41:42.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.pkl
bigfiles/5gb_dummy_file.parquet/part-00000-tid-6768014599873935333-67a5341b-9215-4e29-88da-37eab66a11fa-112198-1.c000.snappy.parquet,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,34023959,b03c9827c6fa353bdac276bbe0ebf79b-4,dom_rodrigues/folder_1/,2025-10-21T12:53:35.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.parquet
bigfiles/5gb_dummy_file.parquet/part-00025-tid-6768014599873935333-67a5341b-9215-4e29-88da-37eab66a11fa-112223-1.c000.snappy.parquet,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,34024055,37cbb96339eb8b38fa7dbb78f0a99f75-4,dom_rodrigues/folder_1/,2025-10-21T12:53:36.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.parquet
bigfiles/5gb_dummy_file.parquet/part-00029-tid-6768014599873935333-67a5341b-9215-4e29-88da-37eab66a11fa-112227-1.c000.snappy.parquet,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,34023896,5c0f81384a6d5994dd723f2e20821327-4,dom_rodrigues/folder_1/,2025-10-21T12:53:36.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.parquet
bigfiles/subfolder1/subsubfolder1/PyTorchWithHyperOpt.py,Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,6435,5eeee3ff0736abd4764211ef00a4540e,dom_rodrigues/folder_1/,2025-10-31T22:51:30.000Z,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,EXISTS_IN_SRC_AND_DEST_SAME,.py


In [0]:

if performChecksumValidation:
  import warnings
  warnings.filterwarnings("ignore", 
                         message=".*object-dtype columns with all-bool values.*", 
                         category=FutureWarning)
  def validate_checksum(row):
    def calculate_checksum(src_path, dest_path) :
      src_df = spark.read.format("binaryFile").load(unquote(src_path))
      src_checksum_df = src_df.withColumn("md5_checksum", md5(col("content")))
      src_checksum =  src_checksum_df.first()["md5_checksum"]
      dest_df = spark.read.format("binaryFile").load(unquote(dest_path))
      dest_checksum_df = dest_df.withColumn("md5_checksum", md5(col("content")))
      dest_checksum =  dest_checksum_df.first()["md5_checksum"]
      return True if src_checksum == dest_checksum else False
    

    src_path = "s3://" + row["Bucket"] + "/" + row["prefix"] +  row["Key"]
    dest_path = "s3://" + row["dest_Bucket"] + "/" + row["dest_prefix"] +  row["Key"]

    return calculate_checksum(src_path, dest_path )

  existsInSrcAndDestinationWithFileType_sample_checksums = pd.DataFrame(columns=existsInSrcAndDestinationWithFileType_sample.columns)

  #convert into pandas to use spark.read
  for index, row in existsInSrcAndDestinationWithFileType_sample.toPandas().iterrows():
      try: 
        row["checksum_valid"] = validate_checksum(row)
        existsInSrcAndDestinationWithFileType_sample_checksums = pd.concat([existsInSrcAndDestinationWithFileType_sample_checksums, pd.DataFrame([row])], ignore_index=True)
      except Exception as e:
        print(e)  

  existsInSrcAndDestination_with_different_checksums = spark.createDataFrame(existsInSrcAndDestinationWithFileType_sample_checksums[existsInSrcAndDestinationWithFileType_sample_checksums["checksum_valid"] == False].drop(["checksum_valid", "file_type"], axis=1), existsInSrcAndDestinationAndSame.schema).withColumn("status", lit("EXISTS_IN_SRC_AND_DESTINATION_BUT_DIFFERENT_CHECKSUM"))

  existsInSrcAndDestination_with_different_checksums.limit(10).display() if debug else None
else:
  existsInSrcAndDestination_with_different_checksums = spark.createDataFrame([], existsInSrcAndDestinationAndSame.schema)  

'NoneType' object is not subscriptable


Key,s3_batch_copy_id,runId,Bucket,Size,ETag,prefix,lastModifiedDate,dest_Bucket,dest_prefix,status


In [0]:
current_datetime = datetime.now()
recon_failed: bool = False
recon_warning: bool = False

if (existsInSrcAndDestination_with_different_checksums.isEmpty() == False):
    recon_failed = True
    existsInSrcAndDestination_with_different_checksums.alias("inventory").join(df_s3_recon.alias("recon"), (col("inventory.Bucket") == col("recon.src_bucket_name")) & (col("inventory.s3_batch_copy_id") == col("recon.s3_batch_copy_id"))).select(col("inventory.*"), col("recon.dest_bucket_name")).withColumn("time_stamp", lit(current_datetime)).withColumnRenamed("Bucket", "src_bucket").withColumnRenamed("prefix", "src_prefix").withColumn("sync_to_dest_bucket_status", lit("Review"))\
    .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")

if (existsInSrcAndNotInDestination.isEmpty() == False):
    recon_warning = True
    existsInSrcAndNotInDestination.alias("inventory").join(df_s3_recon.alias("recon"), (col("inventory.Bucket") == col("recon.src_bucket_name")) & (col("inventory.s3_batch_copy_id") == col("recon.s3_batch_copy_id"))).select(col("inventory.*"), col("recon.dest_bucket_name"), col("recon.dest_prefix")).withColumn("time_stamp", lit(current_datetime)).withColumnRenamed("Bucket", "src_bucket").withColumnRenamed("prefix", "src_prefix").withColumn("sync_to_dest_bucket_status", lit("Copy"))\
    .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")

if (existsInDestAndNotInSource.isEmpty() == False):
    recon_warning = True
    existsInDestAndNotInSource.alias("inventory").join(df_s3_recon.alias("recon"), (col("inventory.Bucket") == col("recon.src_bucket_name")) & (col("inventory.s3_batch_copy_id") == col("recon.s3_batch_copy_id"))).select(col("inventory.*"), col("recon.src_bucket_name"), col("recon.src_prefix")).withColumn("time_stamp", lit(current_datetime)).withColumnRenamed("Bucket", "dest_bucket").withColumnRenamed("prefix", "dest_prefix").withColumn("sync_to_dest_bucket_status", lit("Review"))\
    .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")


if (existsInSrcAndDestinationAndButNotSame.isEmpty() == False):
    recon_warning = True
    existsInSrcAndDestinationAndButNotSame.alias("inventory").join(df_s3_recon.alias("recon"), (col("inventory.Bucket") == col("recon.src_bucket_name")) & (col("inventory.s3_batch_copy_id") == col("recon.s3_batch_copy_id"))).select(col("inventory.*"), col("recon.dest_bucket_name"), col("recon.dest_prefix")).withColumn("time_stamp", lit(current_datetime)).withColumnRenamed("Bucket", "src_bucket").withColumnRenamed("prefix", "src_prefix").withColumn("sync_to_dest_bucket_status", lit("Review"))\
    .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")

if (filesExistsInSrcButNewerThanDestination.isEmpty() == False):
    recon_warning = True
    filesExistsInSrcButNewerThanDestination.alias("inventory").join(df_s3_recon.alias("recon"), (col("inventory.Bucket") == col("recon.src_bucket_name")) & (col("inventory.s3_batch_copy_id") == col("recon.s3_batch_copy_id"))).select(col("inventory.*"), col("recon.dest_bucket_name"), col("recon.dest_prefix")).withColumn("time_stamp", lit(current_datetime)).withColumnRenamed("Bucket", "src_bucket").withColumnRenamed("prefix", "src_prefix").withColumn("sync_to_dest_bucket_status", lit("Review"))\
    .write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(f"{catalog}.{schema}.ucd_s3_inventory_recon")


In [0]:
recon_df = spark.sql(f"select  s3_batch_copy_id, runId, src_bucket,  src_prefix, dest_bucket_name, dest_prefix, status, time_stamp, count(*) as count from {catalog}.{schema}.ucd_s3_inventory_recon where s3_batch_copy_id = '{s3_batch_copy_id}' and runId = '{runId}' and time_stamp = '{current_datetime}' group by s3_batch_copy_id, runId, src_bucket,  src_prefix, dest_bucket_name, dest_prefix, status, time_stamp").withColumn("status_json", F.to_json(F.struct("status", "count")))

recon_result_df = recon_df.groupBy("s3_batch_copy_id",  "runId", "src_bucket",
                    "src_prefix", "dest_bucket_name", "dest_prefix", "time_stamp").agg(
          F.collect_list("status_json").alias("status_json") 
      ) 

recon_result_df.display() if trace else None

recon_status = {"status": "SUCCESS"} if recon_result_df.select("status_json").first() == None  else recon_result_df.select("status_json").first()[0]
status = 'FAILED' if recon_failed else 'WARNING' if recon_warning else 'SUCCESS'

print(f"Recon status for s3_batch_copy_id {s3_batch_copy_id} for runId {runId} is  {status}, details: {recon_status}")

s3_batch_copy_id,runId,src_bucket,src_prefix,dest_bucket_name,dest_prefix,time_stamp,status_json
Test1,2e952ce4-94e4-49f7-83c7-8dc5776331e8,databricks-e2demofieldengwest,dom_rodrigues/folder_1/,databricks-e2demofieldengwest,dom_rodrigues/folder_2/,2025-11-18T13:08:33.575691Z,"List({""status"":""FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"",""count"":1})"




In [0]:
spark.sql(f"select * from {catalog}.{schema}.ucd_src_dest_mapping").display()

dest_bucket_name,dest_inventory_name,dest_inventory_s3_location,dest_prefix,enabled,last_recon_timestamp,recon_status,runId,s3_batch_copy_id,src_bucket_name,src_inventory_name,src_inventory_s3_location,src_prefix,status
databricks-e2demofieldengwest,dom_rodrigues_inventory_dest_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_2/,True,2025-11-18T13:05:29.49168Z,"[""{""status"":""FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"",""count"":1}""]",50933e4f-de57-43b9-930d-604d898b2ddc,Test1,databricks-e2demofieldengwest,dom_rodrigues_inventory_test,s3://databricks-e2demofieldengwest/dom_rodrigues/INVENTORY,dom_rodrigues/folder_1/,FAILED


In [0]:
spark.sql(f"""UPDATE {catalog}.{schema}.ucd_src_dest_mapping set last_recon_timestamp = '{current_datetime}', runId = '{runId}',status = '{status}', recon_status = '{json.dumps(recon_status)}' where s3_batch_copy_id = '{s3_batch_copy_id}' and enabled=True""")

DataFrame[num_affected_rows: bigint]

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import lit, concat, col

delta_tgt = DeltaTable.forName(spark, f"{catalog}.{schema}.ucd_s3_inventory_list")

delta_tgt.alias("tgt").merge(
    s3_inventory_files_to_process.alias("src"),
    "tgt.bucket_name = src.bucket_name and tgt.path == src.path and tgt.processed =='false'"
) .whenMatchedUpdate(set={"processed": lit("true"), "runId":  lit(runId)}) \
 .execute()


In [0]:
spark.sql(f"""select runId, s3_batch_copy_id, src_bucket_name, dest_bucket_name, src_prefix, dest_prefix, status, recon_status, last_recon_timestamp  from {catalog}.{schema}.ucd_src_dest_mapping  where s3_batch_copy_id = '{s3_batch_copy_id}' and enabled=True""").display()

runId,s3_batch_copy_id,src_bucket_name,dest_bucket_name,src_prefix,dest_prefix,status,recon_status,last_recon_timestamp
2e952ce4-94e4-49f7-83c7-8dc5776331e8,Test1,databricks-e2demofieldengwest,databricks-e2demofieldengwest,dom_rodrigues/folder_1/,dom_rodrigues/folder_2/,WARNING,"[""{""status"":""FILES_IN_SRC_ARE_NEWER_THAN_DESTINATION"",""count"":1}""]",2025-11-18T13:08:33.575691Z


In [0]:
dbutils.notebook.exit(json.dumps( {"status": f'{status}', "message": f'{recon_status}', 'runId': runId}))