Download incremental data using Databricks, as an alternative to ADF

In [0]:
import sys
import os
import urllib.request
import shutil

# Project root: two directory levels above the current working directory.
root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Make the root importable; append only when it is missing so that
# re-running this cell does not pile up duplicate sys.path entries.
sys.path.extend([] if root in sys.path else [root])

from modules.utils.date_utils import get_target_yyyymm
from modules.data_loader.file_downloader import download_file

In [0]:
# Month to fetch: get_target_yyyymm(3) looks three months back from today.
# The returned string is interpolated as-is into the landing path, the file
# name and the download URL.
formatted_date = get_target_yyyymm(3)

# Month-specific landing directory under /Volumes
dir_path = f"/Volumes/nyc-taxi/00_landing/data_sources/nyctaxi_green/{formatted_date}"

# Full path of the parquet file for that month
local_path = f"{dir_path}/green_tripdata_{formatted_date}.parquet"

# Probe for an existing file. Only the probe lives inside the `try`, so a
# failure while setting the task value or printing is never mistaken for
# "file missing". `except Exception` (instead of a bare `except:`) lets
# KeyboardInterrupt / SystemExit propagate, so cancelling the run does not
# fall through into a download.
try:
    dbutils.fs.ls(local_path)
except Exception:
    file_already_present = False
else:
    file_already_present = True

if file_already_present:
    # The file is already in the landing zone: tell downstream tasks to stop.
    # `continue_downstream` is read by the job configuration when ADF is not
    # used to fetch the data.
    dbutils.jobs.taskValues.set(key="continue_downstream", value="no")
    print("File already downloaded, aborting downstream tasks")
else:
    try:
        # Source URL of the monthly green-taxi parquet file
        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_{formatted_date}.parquet"

        download_file(url, dir_path, local_path)

        # Download succeeded: allow downstream tasks to run
        dbutils.jobs.taskValues.set(key="continue_downstream", value="yes")
        print("File succesfully uploaded in current run")
    except Exception as e:
        # Best-effort by design: a failed download does not fail this task,
        # it only blocks the downstream tasks and reports the error.
        dbutils.jobs.taskValues.set(key="continue_downstream", value="no")
        print(f"File download failed: {e}")