In [0]:
import pandas as pd
import time
import concurrent.futures
import os
import time
import concurrent.futures
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed,ThreadPoolExecutor
from multiprocessing import cpu_count
import threading
import uuid

In [0]:
# Free-text widgets, declared as (name, default value, label).
for widget_name, widget_default, widget_label in (
    # dbfs:/tmp/dom3
    ("root_src_folder", "s3://databricks-e2demofieldengwest/dom_rodrigues/folder_1/", "Source folder"),
    # dbfs:/tmp/dom4
    ("root_dest_folder", "s3://databricks-e2demofieldengwest/dom_rodrigues/folder_2/", "Destination Folder"),
    ("max_workers", "4", "Max Workers"),
    ("dbutils_thread_pool_size", "80", "DBUtils Thread Pool Size"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

# Drop-down widgets, declared as (name, default value, choices, label).
for widget_name, widget_default, widget_choices, widget_label in (
    ("fileCopyDepthLevel", "3", ["2", "3"], "Starting Folder Copy Level"),
    ("use_parallel_notebooks", "False", ["True", "False"], "Use Parallel Notebooks instead of threads"),
    ("debug", "True", ["True", "False"], "Debug"),
):
    dbutils.widgets.dropdown(widget_name, widget_default, widget_choices, widget_label)

# Run identifier: taken from the widget, or a fresh random UUID when the widget is blank.
dbutils.widgets.text("runId", "")
runId = dbutils.widgets.get("runId")
if not runId.strip():
    runId = str(uuid.uuid4())

In [0]:
# Resolve the widget values into typed notebook-level settings.
root_src_folder = dbutils.widgets.get("root_src_folder")
root_dest_folder = dbutils.widgets.get("root_dest_folder")
debug = dbutils.widgets.get("debug") == "True"
use_parallel_notebooks = dbutils.widgets.get("use_parallel_notebooks") == "True"

fileCopyDepthLevel = int(dbutils.widgets.get("fileCopyDepthLevel"))  # (root, folderLevel1, folderLevel11)
max_workers = int(dbutils.widgets.get("max_workers"))
dbutils_thread_pool_size = int(dbutils.widgets.get("dbutils_thread_pool_size"))

# Switches that are edited in code rather than exposed as widgets.
trace = False
generateTestData = False

# Compare without trailing slashes so that "s3://bucket/a" and "s3://bucket/a/"
# are recognised as the same folder.
if root_src_folder.rstrip("/") == root_dest_folder.rstrip("/"):
    raise Exception("Source and Destination folders cannot be the same")


In [0]:
# The operation is parallelized by first recursively listing the objects in the source
# directory using a thread pool on the driver and subsequently launching a Spark job to
# perform the individual operations.
for conf_key, conf_value in (
    ("spark.databricks.service.dbutils.fs.parallel.enabled", True),
    ("spark.databricks.service.dbutils.fs.parallel.ls.threadPoolSize", dbutils_thread_pool_size),  # default 20
):
    spark.conf.set(conf_key, conf_value)
#spark.conf.set("spark.databricks.service.dbutils.fs.parallel.ls.timeoutSeconds", 7200)

### Generate Test Data to Validate file and folder list generation

In [0]:

if generateTestData:
    dbutils.fs.mkdirs(root_src_folder)

    # CSV output folder directly under the root folder
    columns = ["Fruit", "Quantity"]
    data = [("Apple", 10), ("Banana", 20), ("Orange", 30)]
    df = spark.createDataFrame(data, columns)
    df.write.mode("overwrite").csv(f"{root_src_folder}/a.csv")

    # Parquet output folder directly under the root folder (written from one partition)
    single_partition_df = df.coalesce(1)
    output_path = "output_single.parquet"  # NOTE(review): not referenced anywhere in the visible notebook
    single_partition_df.write.mode("overwrite").parquet(f"{root_src_folder}/p.parquet")

    # Empty sub-folders nested two, three and four levels deep
    for nested_folder in (
        "folder1/folder11",
        "folder1/folder11/folder111",
        "folder1/folder11/folder111/folder1111",
    ):
        dbutils.fs.mkdirs(f"{root_src_folder}/{nested_folder}")

    # Parquet output folder inside folder2
    single_partition_df.write.mode("overwrite").parquet(f"{root_src_folder}/folder2/p.parquet")

    # Empty folder31 plus a parquet output folder inside folder3
    # NOTE(review): the empty folder is created under folder1/folder3 while the parquet
    # data goes to folder3 - confirm this difference is intended.
    dbutils.fs.mkdirs(f"{root_src_folder}/folder1/folder3/folder31")
    single_partition_df.write.mode("overwrite").parquet(f"{root_src_folder}/folder3/p.parquet")

    # Parquet output folder inside folder4/folder41
    single_partition_df.write.mode("overwrite").parquet(f"{root_src_folder}/folder4/folder41/p.parquet")



In [0]:
# Kept as a bare expression so the notebook renders the listing when test data is enabled.
None if not generateTestData else dbutils.fs.ls(f"{root_src_folder}/p.parquet")

In [0]:
if generateTestData:
    # The part-file name embeds ids generated by the write (tid / uuid), so a hard-coded
    # name only matches the run that produced it. Look the file up instead.
    parquet_part_files = [
        entry.path
        for entry in dbutils.fs.ls(f"{root_src_folder}/p.parquet")
        if entry.name.startswith("part-") and entry.name.endswith(".parquet")
    ]
    if not parquet_part_files:
        raise Exception(f"No parquet part file found under {root_src_folder}/p.parquet")
    # The folder was written from a single partition, so the first match is used.
    parquet_single_file = parquet_part_files[0]


In [0]:
if generateTestData:
    # Place copies of the single parquet file at the root level and in two nested folders.
    for relative_target in (
        "level1.parquet",                   # root level
        "folder5/level2.parquet",           # one folder below the root
        "folder6/folder61/level3.parquet",  # two folders below the root
    ):
        dbutils.fs.cp(parquet_single_file, f"{root_src_folder}/{relative_target}")



### File and Folder List Generator

In [0]:
# In debug mode, list the source folder up front so a missing source fails early.
if debug:
    dbutils.fs.ls(root_src_folder)

# Always make sure the destination exists; previously this check (and therefore the
# folder creation) only ran when debug was enabled.
try:
    dbutils.fs.ls(root_dest_folder)
except Exception as e:
    print(e)
    print(f"creating new destination folder {root_dest_folder}")
    dbutils.fs.mkdirs(root_dest_folder)



In [0]:
def collect_paths_at_second_level(base_path):
    """List the copy units found up to two levels below ``base_path``.

    Each returned record is a dict with the keys ``path``, ``type`` (``'file'`` or
    ``'dir'``) and ``size``:

    * a file directly under ``base_path`` yields one ``'file'`` record;
    * a non-empty sub-folder yields one record per direct child (file or folder);
    * an empty sub-folder yields a single ``'dir'`` record for itself;
    * an empty ``base_path`` yields a single ``'dir'`` record with size 0.

    Listings are obtained through ``dbutils.fs.ls``.
    """
    top_level = dbutils.fs.ls(base_path)
    if not top_level:
        return [{'path': base_path, 'type': 'dir', 'size': 0}]

    collected = []
    for entry in top_level:
        if not entry.isDir():
            collected.append({'path': entry.path, 'type': 'file', 'size': entry.size})
            continue

        children = dbutils.fs.ls(entry.path)
        if not children:
            # Keep empty folders so they are still recreated at the destination.
            collected.append({'path': entry.path, 'type': 'dir', 'size': entry.size})
            continue

        # Only the direct children (second level) are recorded, not the folder itself.
        for child in children:
            child_type = 'dir' if child.isDir() else 'file'
            collected.append({'path': child.path, 'type': child_type, 'size': child.size})
    return collected

In [0]:
# First pass: everything found up to two levels below the source root.
paths_to_copy = collect_paths_at_second_level(root_src_folder)

# Split the listing into file entries and directory entries.
paths_to_copy_pd_df = pd.DataFrame(paths_to_copy)
paths_to_copy_pd_df_files = paths_to_copy_pd_df.loc[paths_to_copy_pd_df['type'] == 'file']
paths_to_copy_pd_df_dirs = paths_to_copy_pd_df.loc[paths_to_copy_pd_df['type'] == 'dir']

if trace:
    print(paths_to_copy_pd_df_files.sort_values(by='path'))
    print(paths_to_copy_pd_df_dirs.sort_values(by='path'))


In [0]:

if fileCopyDepthLevel == 3:
    # Re-list every directory found in the first pass. The records are gathered in one
    # list and the frame is built once, instead of growing DataFrames with pd.concat
    # inside the loop (quadratic in the number of directories).
    expanded_records = [
        record
        for dir_row in paths_to_copy_pd_df_dirs.itertuples(index=False)
        for record in collect_paths_at_second_level(dir_row.path)
    ]
    expanded_paths_pd_df = pd.DataFrame(expanded_records, columns=paths_to_copy_pd_df_dirs.columns)

    # Files: the first-pass files followed by the files discovered while expanding.
    new_paths_to_copy_pd_df_files = pd.concat(
        [paths_to_copy_pd_df_files, expanded_paths_pd_df[expanded_paths_pd_df['type'] == 'file']],
        ignore_index=True,
        axis=0,
    )
    # Dirs: only those reported by the expansion; the first-pass dirs are replaced
    # by their contents (or by themselves when they are empty).
    new_paths_to_copy_pd_df_dirs = expanded_paths_pd_df[expanded_paths_pd_df['type'] == 'dir'].reset_index(drop=True)
else:
    new_paths_to_copy_pd_df_files = paths_to_copy_pd_df_files
    new_paths_to_copy_pd_df_dirs = paths_to_copy_pd_df_dirs


In [0]:
# Trace output for the first-pass listing.
if trace:
    print(f"file count {len(paths_to_copy_pd_df_files)}")
    print(f"dir count {len(paths_to_copy_pd_df_dirs)}")

    print(paths_to_copy_pd_df_files.sort_values(by='path'))
    print(paths_to_copy_pd_df_dirs.sort_values(by='path'))

In [0]:
# Trace output for the listing that will actually be copied.
if trace:
    print(f"file count {len(new_paths_to_copy_pd_df_files)}")
    print(f"dir count {len(new_paths_to_copy_pd_df_dirs)}")

    print(new_paths_to_copy_pd_df_files.sort_values(by='path'))
    print(new_paths_to_copy_pd_df_dirs.sort_values(by='path'))

In [0]:
# Combine files and dirs, then derive each destination path by substituting the source
# root with the destination root (plain string replacement, not a regex).
new_paths_to_copy_pd_df_all = pd.concat(
    [new_paths_to_copy_pd_df_files, new_paths_to_copy_pd_df_dirs]
).assign(
    dest_path=lambda frame: frame['path'].str.replace(root_src_folder, root_dest_folder, regex=False),
    # (source, destination) pair kept alongside the individual columns
    src_dest_tuple=lambda frame: frame[['path', 'dest_path']].apply(tuple, axis=1),
)



In [0]:
# Build the work list. Every entry is a 5-tuple in the order the copy workers unpack:
# (src_path, dest_path, file_type, file_index, total_files_len).
if generateTestData:
    # NOTE(review): these paths are hard-coded to dbfs:/tmp/dom3 -> dbfs:/tmp/dom4 and do
    # not follow the root_src_folder / root_dest_folder widgets; the third entry repeats
    # the first. Confirm both are intended.
    all_files = []
    all_files.append(("dbfs:/tmp/dom3/folder5/level2.parquet", "dbfs:/tmp/dom4/folder5/level2.parquet"))
    all_files.append(("dbfs:/tmp/dom3/folder6/folder61/level3.parquet", "dbfs:/tmp/dom4/folder6/folder61/level3.parquet"))
    all_files.append(("dbfs:/tmp/dom3/folder5/level2.parquet", "dbfs:/tmp/dom4/folder5/level2.parquet"))
    # Add file indices for better tracking. The entries are single files, so the type is
    # 'file'; the previous version put the index in the type slot and the Python builtin
    # `type` in the index slot.
    indexed_files = [
        (src_path, dest_path, 'file', i + 1, len(all_files))
        for i, (src_path, dest_path) in enumerate(all_files)
    ]

else:
    total_paths_to_copy = new_paths_to_copy_pd_df_all.shape[0]
    indexed_files = [
        (row['path'], row['dest_path'], row['type'], i + 1, total_paths_to_copy)
        for i, row in enumerate(new_paths_to_copy_pd_df_all.to_records(index=False))
    ]



In [0]:
#indexed_files = indexed_files[:2]
#print(indexed_files[:1]) if debug else None
#dbutils.fs.cp(indexed_files[0][0], indexed_files[0][1], recurse=True)

### Actual copy process

In [0]:

# Announce how many entries are about to be copied and how progress will be reported.
items_to_copy = len(all_files) if generateTestData else new_paths_to_copy_pd_df_all.shape[0]
print(f"\n🚨 FILE COPY : {items_to_copy} files using {max_workers} cores...")
print("📊 Progress will be reported every 1 minute...")
print("🔔 Each file or folder completion will be reported immediately with timing...")

def progress_monitor(futures_dict, total_files, max_workers, interval_seconds=60):
    """Periodically print copy progress until every future has finished.

    Args:
        futures_dict: Mapping whose keys are the submitted futures.
        total_files: Number of submitted tasks.
        max_workers: Size of the executor pool; used to estimate how many tasks are
            currently running versus still queued.
        interval_seconds: Seconds to sleep between reports (default 60).

    Returns:
        None. The loop ends after the first report in which all tasks are done.
    """
    start_time = time.time()

    while True:
        time.sleep(interval_seconds)

        completed = sum(1 for f in futures_dict if f.done())
        remaining = total_files - completed

        # Estimate only: assumes the pool is saturated while work remains.
        actually_running = min(max_workers, remaining)
        pending = remaining - actually_running

        elapsed = time.time() - start_time

        if completed > 0:
            # elapsed / completed is wall-clock time per finished item and therefore
            # already reflects the pool's parallelism; dividing by max_workers again
            # would underestimate the remaining time by that factor.
            avg_time = elapsed / completed
            remaining_estimate = remaining * avg_time / 60
            print(f"📊 Progress: {completed}/{total_files} completed, {actually_running} actively running, {pending} pending, ~{remaining_estimate:.1f}min remaining")
        else:
            print(f"📊 Progress: {completed}/{total_files} completed, {actually_running} actively running, {pending} pending, {elapsed/60:.1f}min elapsed")
            # If nothing completed after 5+ minutes, likely hanging
            if elapsed > 300:
                print("⚠️  WARNING: No files completed in 5+ minutes - processes may be hanging!")

        if completed == total_files:
            break


🚨 FILE COPY : 284 files using 4 cores...
📊 Progress will be reported every 1 minute...
🔔 Each file or folder completion will be reported immediately with timing...


In [0]:
def copy_files(args):
    """Copy one work item with ``dbutils.fs.cp`` and report the outcome.

    Args:
        args: 5-tuple ``(src_path, dest_path, file_type, file_index, total_files_len)``.
            Only the first three values are used; the copy is recursive when
            ``file_type`` equals ``'dir'``.

    Returns:
        dict with ``status`` (``"SUCCESS"`` or ``"FAILED"``) and ``duration`` in seconds.
        Exceptions raised by the copy are printed and reported as ``"FAILED"``.
    """
    began_at = time.time()
    src_path, dest_path, file_type, _file_index, _total_files_len = args

    if trace:
        print(f"ThreadId {threading.get_native_id()}: copying src {src_path}\n")

    copy_recursively = bool(file_type == 'dir')
    try:
        dbutils.fs.cp(src_path, dest_path, recurse=copy_recursively)
    except Exception as copy_error:
        print(f"Exception {copy_error}")
        outcome = "FAILED"
    else:
        outcome = "SUCCESS"
    return {"status": outcome, "duration": time.time() - began_at}

In [0]:
# Path of the currently running notebook, taken from the notebook context.
cwd = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
# The worker notebook is addressed relative to this notebook: "/.." steps from the
# notebook's own path up to its folder.
notebook_path = f"/Workspace{cwd}/../s3_copy_worker_notebook"

def copy_files_using_parallel_notebooks(args):
    """Copy one work item by running the worker notebook via ``dbutils.notebook.run``.

    Args:
        args: 5-tuple ``(src_path, dest_path, file_type, file_index, total_files_len)``.

    Returns:
        dict with ``status`` (``"SUCCESS"`` or ``"FAILED"``) and ``duration`` in seconds.
        Exceptions raised by the notebook run are printed and reported as ``"FAILED"``.
    """
    start_time = time.time()
    src_path, dest_path, file_type, file_index, total_files_len = args
    raw_parameters = {
        "src_path": src_path,
        "dest_path": dest_path,
        "file_type": file_type,
        "file_index": file_index,
        "total_files_len": total_files_len,
        "dbutils_thread_pool_size": dbutils_thread_pool_size,
    }
    # Notebook arguments are delivered to the child notebook as widget values and must
    # be strings, so the integer values are converted explicitly.
    notebook_parameters = {key: str(value) for key, value in raw_parameters.items()}

    try:
        # Timeout of 0 is passed through unchanged from the original call.
        returned_value = dbutils.notebook.run(notebook_path, 0, notebook_parameters)
        if debug:
            print(f"notebook return value {returned_value }")
        return {"status": "SUCCESS", "duration": time.time() - start_time}
    except Exception as e:
        print(f"Exception {e}")
        return {"status": "FAILED", "duration": time.time() - start_time}


In [0]:
start_time = time.time()

#with ProcessPoolExecutor(max_workers=max_workers) as executor:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Pick the worker once, then submit every work item to the pool.
    copy_task = copy_files_using_parallel_notebooks if use_parallel_notebooks else copy_files
    futures_dict = {executor.submit(copy_task, file_data): file_data for file_data in indexed_files}

    # Background reporter; a daemon thread so it never blocks notebook shutdown.
    progress_thread = threading.Thread(
        target=progress_monitor,
        args=(futures_dict, len(indexed_files), max_workers),
        daemon=True,
    )
    progress_thread.start()

    completed = 0
    converted = 0
    failed = 0

    # Handle each task as soon as it finishes.
    # NOTE(review): as_completed only yields futures that are already done, so the
    # 600 s timeout on result() cannot fire here; it does not bound the copy itself.
    for future in as_completed(futures_dict, timeout=None):
        completed += 1

        try:
            result = future.result(timeout=600)

            file_data = futures_dict[future]
            src_path, dest_path, file_type, file_index, all_files_len = file_data
            status = result["status"]
            if status == "SUCCESS":
                converted += 1
                # Duration in seconds as reported by the worker
                time_part = result["duration"]
                if trace:
                    print(f"✅ COMPLETED [{completed}/{all_files_len}]: {src_path} - {time_part}")
            elif status == "FAILED":
                failed += 1
                print(f"❌ FAILED [{completed}/{all_files_len}]: {src_path}")
            elif status == "SKIPPED":
                print(f"⏭️ SKIPPED [{completed}/{all_files_len}]: {src_path} (already exists)")

        except concurrent.futures.TimeoutError as e1:
            print(f"TimeoutError Exception: {e1}")
            failed += 1
            file_data = futures_dict[future]
            src_path, dest_path, file_type, file_index, all_files_len = file_data
            print(f"❌ TIMEOUT [{completed}/{all_files_len}]: {src_path} - took >10min")

        except Exception as e:
            print(f"Exception: {e}")
            failed += 1
            file_data = futures_dict[future]
            src_path, dest_path, file_type, file_index, all_files_len = file_data
            print(f"❌ ERROR [{completed}/{all_files_len}]: {src_path} - {str(e)[:50]}...")

total_time = time.time() - start_time

📊 Progress: 282/284 completed, 2 actively running, 0 pending, ~0.0min remaining


In [0]:
# Final summary: wall-clock time, per-item average, throughput and outcome counts.
total_minutes = total_time / 60
print(f"📊 Total time: {total_time:.1f} seconds ({total_minutes:.1f} minutes)")
if converted > 0:
    seconds_per_item = total_time / converted
    items_per_minute = converted / total_minutes
    print(f"📊 Average per copied file (or folder): {seconds_per_item:.1f} seconds")
    print(f"📊 Copy rate: {items_per_minute:.1f} files (or folders)/minute")
print(f"📊 Results: {converted} copied, {failed} failed")

📊 Total time: 117.9 seconds (2.0 minutes)
📊 Average per copied file (or folder): 0.4 seconds
📊 Copy rate: 144.6 files (or folders)/minute
📊 Results: 284 copied, 0 failed


In [0]:
# Kept as a bare expression so the notebook renders the destination listing when tracing.
None if not trace else dbutils.fs.ls(root_dest_folder)