In [0]:
import uuid
from pyspark.sql.functions import pandas_udf, explode, udf,from_json
from pyspark.sql.types import ArrayType, StringType, DoubleType
import json
from pyspark.sql.functions import struct, flatten, lit, col,row_number, current_timestamp
from pyspark.sql import functions as F
from typing import Iterator, Tuple
import pandas as pd

from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType
from datetime import datetime
from pyspark.sql import Window
import time
import threading
from concurrent.futures import ProcessPoolExecutor, as_completed,ThreadPoolExecutor
from multiprocessing import cpu_count
import concurrent.futures


In [0]:

# ---------------------------------------------------------------------------
# Notebook parameters: declare the widgets first, then read them into globals.
# ---------------------------------------------------------------------------
dbutils.widgets.text("catalog", "users")
dbutils.widgets.text("schema", "dom_rodrigues")
dbutils.widgets.text("max_workers", "4", "Max Workers")
dbutils.widgets.dropdown("debug", "true", ["true", "false"])
dbutils.widgets.text("runId", "")
dbutils.widgets.text("s3_batch_copy_id", "Test1", "Batch Copy Id to sync")
dbutils.widgets.text("dbutils_thread_pool_size", "80", "DBUtils Thread Pool Size")

# Target catalog / schema; every table referenced below lives under these.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")

# A blank (or whitespace-only) runId widget means "start a new run": mint a UUID.
# A non-blank value is kept exactly as entered (it is not stripped).
runId = dbutils.widgets.get("runId")
if not runId.strip():
    runId = str(uuid.uuid4())

s3_batch_copy_id = dbutils.widgets.get("s3_batch_copy_id")

# Numeric widgets arrive as strings and are converted here.
max_workers = int(dbutils.widgets.get("max_workers"))
dbutils_thread_pool_size = int(dbutils.widgets.get("dbutils_thread_pool_size"))

# Logging switches: `debug` follows the dropdown, the other two are fixed in code.
debug = dbutils.widgets.get("debug") == 'true'
trace = False
createMetadataTables = False

# Make the chosen catalog.schema the session default.
spark.sql(f"use {catalog}.{schema}")

DataFrame[]

In [0]:
# Reconciliation rows for enabled mappings whose object exists in the source but
# not in the destination, restricted to the batch copy id chosen in the widget.
#
# The batch copy id is user input, so it is NOT spliced into the SQL text (a value
# containing a single quote would break or alter the statement). It is applied
# afterwards as a literal filter on recon.s3_batch_copy_id instead; because the
# inner join already requires mapping.s3_batch_copy_id = recon.s3_batch_copy_id,
# this selects the same rows as filtering on the mapping side.
# NOTE(review): catalog/schema are identifiers and are still interpolated into the
# statement; they cannot be bound as values here.
missing_files_df = spark.sql(f"""
    select recon.*
    from {catalog}.{schema}.ucd_src_dest_mapping mapping
    join {catalog}.{schema}.ucd_s3_inventory_recon recon 
    on mapping.s3_batch_copy_id = recon.s3_batch_copy_id and mapping.runId = recon.runId
    and mapping.enabled= True
    and recon.status = 'EXISTS_IN_SRC_AND_BUT_NOT_IN_DEST'
""").where(F.col("s3_batch_copy_id") == F.lit(s3_batch_copy_id))

# Optional preview, only when tracing is switched on.
if trace:
    missing_files_df.limit(10).display()

In [0]:
# Turn on parallel dbutils.fs operations. Per the original author's note, the
# operation is parallelized by first recursively listing the objects in the source
# directory using a thread pool on the driver and subsequently launching a Spark
# job to perform the individual operations.
_parallel_fs_settings = (
    ("spark.databricks.service.dbutils.fs.parallel.enabled", True),
    # Listing thread pool size comes from the widget (default 20 when unset).
    ("spark.databricks.service.dbutils.fs.parallel.ls.threadPoolSize", dbutils_thread_pool_size),
)
for _conf_key, _conf_value in _parallel_fs_settings:
    spark.conf.set(_conf_key, _conf_value)
# Left disabled: a listing timeout override.
#spark.conf.set("spark.databricks.service.dbutils.fs.parallel.ls.timeoutSeconds", 7200)

In [0]:
s3prefix = "s3://"

# Full source URI: s3://<src_bucket>/<src_prefix><Key>
src_key_col = F.concat(
    F.lit(s3prefix), F.col("src_bucket"), F.lit("/"), F.col("src_prefix"), F.col("Key")
).alias("src_key")

# Full destination URI: s3://<dest_bucket_name>/<dest_prefix><Key>
dest_key_col = F.concat(
    F.lit(s3prefix), F.col("dest_bucket_name"), F.lit("/"), F.col("dest_prefix"), F.col("Key")
).alias("dest_key")

# Keys ending in "/" are labelled "dir"; everything else is labelled "file".
type_col = (
    F.when(F.col("Key").endswith("/"), F.lit("dir"))
    .otherwise(F.lit("file"))
    .alias("type")
)

missing_files_df_copy = missing_files_df.select(src_key_col, dest_key_col, type_col)

# Optional preview, only when tracing is switched on.
if trace:
    missing_files_df_copy.limit(10).display()

In [0]:
# Collect the (src_key, dest_key, type) rows onto the driver as a pandas frame.
new_paths_to_copy_pd_df_all = missing_files_df_copy.toPandas()

# One work item per row: (src_key, dest_key, type, 1-based position, total rows).
# The position/total pair is only used for progress messages by the copy step.
indexed_files = [
    (src_key, dest_key, path_type, position, len(new_paths_to_copy_pd_df_all))
    for position, (src_key, dest_key, path_type) in enumerate(
        zip(
            new_paths_to_copy_pd_df_all["src_key"],
            new_paths_to_copy_pd_df_all["dest_key"],
            new_paths_to_copy_pd_df_all["type"],
        ),
        start=1,
    )
]

In [0]:
# Announce which batch copy id is being synced and the run id in effect
# (either taken from the runId widget or generated when the widget was blank).
print(f"Starting sync process for {s3_batch_copy_id} using runId {runId}")

Starting sync process for Test1 using runId c79863a1-6f67-44dc-a31f-f99f1c001ed2


In [0]:
def copy_files(args):
    """Copy one source path to its destination with ``dbutils.fs.cp``.

    The copy is best-effort: any exception raised by the copy is caught and
    printed so that a caller iterating over many items keeps going. The outcome
    is reported through the return value.

    Args:
        args: 5-tuple ``(src_path, dest_path, file_type, file_index,
            total_files_len)``. ``file_type == 'dir'`` makes the copy recursive;
            any other value copies non-recursively. ``file_index`` and
            ``total_files_len`` are used only in progress messages.

    Returns:
        bool: ``True`` if the copy call returned without raising, ``False`` if
        it raised.

    Raises:
        ValueError / TypeError: if ``args`` cannot be unpacked into five values
        (unpacking happens outside the guarded section).
    """
    start_time = time.time()
    src_path, dest_path, file_type, file_index, total_files_len = args

    try:
        if debug:
            print(f"\nCopying {src_path} to {dest_path} ")
        dbutils.fs.cp(src_path, dest_path, recurse=(file_type == 'dir'))
    except Exception as e:
        print(f"Exception {e}")
        # Always name the paths: with debug off this is the only line that
        # identifies which item failed.
        print(f"🚨 FAILED - {src_path} not copied to {dest_path}, {file_index} /{total_files_len} processed.  Duration: {time.time() -start_time}")
        return False
    else:
        if debug:
            print(f"SUCCESS -  {src_path} copied, {file_index} /{total_files_len} completed.  Duration: {time.time() -start_time}")
        return True


In [0]:
# Copy the items one at a time, in list order, to avoid potential data overwrites
# (duplicate writes are acceptable for now). Running this in parallel would need
# extra logic and is deliberately not attempted here.
# copy_files catches and prints its own exceptions, so one failed item does not
# stop the loop.
start_time = time.time()
for args in indexed_files:
    copy_files(args)
    
# Wall-clock duration of the whole loop, reported in seconds and minutes.
total_time = time.time() - start_time
print(f"📊 Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")


Copying s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-18/ to s3://databricks-e2demofieldengwest/dom_rodrigues/folder_2/XGBoostTrainer_2025-07-16_21-43-18/ 
SUCCESS -  s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-18/ copied, 1 /44 completed.  Duration: 1.790578842163086

Copying s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-19/ to s3://databricks-e2demofieldengwest/dom_rodrigues/folder_2/XGBoostTrainer_2025-07-16_21-43-19/ 
SUCCESS -  s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-19/ copied, 2 /44 completed.  Duration: 1.122368574142456

Copying s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTrainer_2025-07-16_21-43-42/ to s3://databricks-e2demofieldengwest/dom_rodrigues/folder_2/XGBoostTrainer_2025-07-16_21-43-42/ 
SUCCESS -  s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/XGBoostTra