In [0]:
from databricks.sdk.service.jobs import JobSettings as Job
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs, compute
import requests
import json
import os
import uuid

In [0]:
# Notebook input widgets.
# Text widgets are declared from a small table of (name, default value, label).
for widget_name, widget_default, widget_label in (
    ("ucd_s3_copy_job_name", "UCD S3 Copy Using DbUtils", "Job Name"),
    ("ucd_s3_copy_task_key", "ucd_s3_copy_using_dbutils_task_name", "Task Key Name"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

# Dropdown whose value is read back later as the string "True" or "False".
dbutils.widgets.dropdown("debug", "True", ["True", "False"], "Debug")


In [0]:
# ---- Settings for the job definition (edit before running) ----

# The job is only created when this flag is True.
createJob = True

# Source and destination roots handed to the copy notebook as parameters.
root_src_folder = "dbfs:/tmp/folder_1/"
root_dest_folder = "dbfs:/tmp/folder_2/"

# Parallelism and run limits.
max_workers = 4  # no of cores
job_timeout = 12 * 60 * 60  # 12 hours, expressed in seconds
max_concurrent_runs = 2
queueEnabled = True

# Cluster settings. Empty strings for instance_profile_arn / policy_id are
# turned into None when the cluster spec is built.
node_type = "i3.xlarge"  # alternative: "rd-fleet.xlarge"
spark_version = "16.4.x-scala2.12"
instance_profile_arn = ""
policy_id = ""
cluster_tags = {"ResourceClass": "SingleNode"}

# Addresses used for the job's failure notifications.
email_ids = ["dom.rodrigues@databricks.com"]

# Name of the notebook the task runs; it is resolved relative to this
# notebook's own folder.
notebook_name = "aws_s3_copy_using_dbutils"

In [0]:
# Resolve widget inputs and derive the values the job definition needs.

# The "debug" dropdown yields the string "True" or "False"; the comparison
# itself already produces the boolean.
debug = dbutils.widgets.get("debug") == "True"

ucd_s3_copy_job_name = dbutils.widgets.get("ucd_s3_copy_job_name")
ucd_s3_copy_task_key = dbutils.widgets.get("ucd_s3_copy_task_key")

# Locate the task notebook next to this one: take this notebook's path, keep
# its folder part, and append the configured notebook name under /Workspace.
cwd = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
folder_path, file_name = os.path.split(cwd)
notebook_path = f"/Workspace{folder_path}/{notebook_name}"

# An empty instance_profile_arn means "none configured", so pass None instead
# of an empty string.
aws_attributes = compute.AwsAttributes(
    instance_profile_arn=instance_profile_arn or None
)

# Parameters forwarded to the copy notebook. NotebookTask.base_parameters is a
# string-to-string map, so the integer max_workers is converted explicitly
# rather than relying on the API to coerce or reject it.
s3_cp_task_parameters = {
    "root_src_folder": root_src_folder,
    "root_dest_folder": root_dest_folder,
    "max_workers": str(max_workers),
    # Passed through verbatim as a "{{...}}" placeholder; expected to be
    # resolved by the Jobs service at run time — TODO confirm.
    "runId": "{{job.run_id}}",
}

In [0]:


# Specification of the new cluster the copy task runs on.
# NOTE(review): custom_tags carries ResourceClass=SingleNode while num_workers
# is 1 — confirm whether a single-node cluster (0 workers) was intended.
s3_cp_task_cluster = compute.ClusterSpec(
    spark_version=spark_version,
    node_type_id=str(node_type),
    num_workers=1,
    spark_conf={},
    data_security_mode=compute.DataSecurityMode.SINGLE_USER,
    aws_attributes=aws_attributes,
    custom_tags=cluster_tags,
    # An empty policy_id means "no policy": send None rather than "".
    policy_id=policy_id if len(policy_id) > 0 else None,
)

In [0]:

# The single task of the job: run the copy notebook on its own new cluster.
# (An existing cluster id or a shared job cluster key could be used instead of
# new_cluster; neither is configured here.)
s3_cp_task = jobs.Task(
    task_key=str(ucd_s3_copy_task_key),
    description="Copy files from S3 to S3 using dbutils",
    notebook_task=jobs.NotebookTask(
        notebook_path=str(notebook_path),
        base_parameters=s3_cp_task_parameters,
    ),
    new_cluster=s3_cp_task_cluster,
    timeout_seconds=job_timeout,
)

# Notify the configured addresses on failure only; the success list is empty.
email_notifications = jobs.JobEmailNotifications(
    on_failure=email_ids,
    on_success=[],
)

In [0]:
# Create the job in the workspace; nothing happens when createJob is False.
if createJob:
    w = WorkspaceClient()

    created_job = w.jobs.create(
        name=str(ucd_s3_copy_job_name),
        tasks=[s3_cp_task],
        max_concurrent_runs=max_concurrent_runs,
        queue=jobs.QueueSettings(enabled=queueEnabled),
        email_notifications=email_notifications,
    )
    job_id = created_job.job_id

    # Print a link to the new job built from the client's configured host.
    print(f"View the ucb job {ucd_s3_copy_job_name} at {w.config.host}/#job/{job_id}")

    # Exit the notebook, handing the job id back as a string inside a JSON
    # payload.
    dbutils.notebook.exit(json.dumps({"jobId": str(job_id)}))