In [0]:
from datetime import datetime, timezone
from pytz import timezone as pytz_timezone
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lit,col
import json

In [0]:
# Declare the notebook's input widgets. "environment" is the only widget
# created with an explicit (empty) label; the others are created with just a
# name and a default value.
dbutils.widgets.text("environment", "", "")
for _widget_name, _widget_default in (
    ("job_name", ""),
    ("back_fill", "N"),
    ("proc_time", ""),
    ("cron_pattern", ""),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values into notebook-level variables used by the
# cells below. Every value is read as-is from the widget.
environment, job_name, back_fill, proc_time_str, cron_pattern = (
    dbutils.widgets.get("environment"),
    dbutils.widgets.get("job_name"),
    dbutils.widgets.get("back_fill"),
    dbutils.widgets.get("proc_time"),
    dbutils.widgets.get("cron_pattern"),
)

In [0]:
# Load the per-environment settings from the shared JSON config file.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() previously left it open for the life of the kernel).
# The handle is still bound to the name `config`, so the name remains defined
# afterwards, now referring to a closed file.
with open("../configs/config.json", encoding="utf-8") as config:
    settings = json.load(config)

In [0]:
# Resolve the catalog for the selected environment.
# The "environment" widget defaults to an empty string, so fail with a message
# that names the offending value and the configured choices instead of a bare
# `KeyError: ''`. The exception type is still KeyError.
if environment not in settings:
    raise KeyError(
        f"Unknown environment {environment!r}; expected one of {sorted(settings)}"
    )
catalog_name = settings[environment]["catalog_name"]
catalog_name

In [0]:
# Status values written to the `status` column of the job-run tracking table.
IN_PROGRESS, COMPLETED, FAILED = "IN_PROGRESS", "COMPLETED", "FAILED"

In [0]:
# Fully qualified name (catalog.schema.table) of the job-run tracking table.
TBL_META_JOB_RUNS = "{}.default.meta_job_runs".format(catalog_name)
TBL_META_JOB_RUNS

In [0]:
%python
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.default.meta_job_runs ( 
    inserted_at TIMESTAMP,
    updated_at TIMESTAMP,
    job_name STRING,
    proc_time TIMESTAMP,
    cron_pattern STRING,
    status STRING
  )
""")

In [0]:
# Parse the "proc_time" widget value into a datetime.
# Expected layout: YYYY-MM-DDTHH:MM:SS.<fraction>; the fractional-seconds part
# is required by the format string. Any other layout aborts the cell.
proc_time = None  # remains None if parsing fails below
try:
    proc_time = datetime.strptime(proc_time_str, "%Y-%m-%dT%H:%M:%S.%f")
except ValueError:
    raise ValueError("Invalid datetime format: {}".format(proc_time_str))
proc_time

In [0]:
# Date-only (YYYY-MM-DD) rendering of the run timestamp. Not consumed in this
# cell; kept as a notebook-level variable.
fmtd_proc_date = proc_time.strftime("%Y-%m-%d")

if back_fill == "Y":
    # Back-fill runs skip the meta_job_runs bookkeeping: publish the processing
    # time for downstream tasks and stop the notebook here.
    dbutils.jobs.taskValues.set("proc_date", proc_time_str)
    # dbutils.notebook.exit expects the notebook's exit value as its argument;
    # the previous call passed none.
    dbutils.notebook.exit(
        f"Back-fill run for job '{job_name}': meta_job_runs bookkeeping skipped"
    )

# Incremental Runs
# All tracking rows for this job, whatever their status.
df = spark.table(TBL_META_JOB_RUNS).filter(col("job_name") == job_name)

if df.count() == 0:
    # First run of this job: seed the tracking table with an IN_PROGRESS row.
    print(f"No entry for job '{job_name}', inserting with status '{IN_PROGRESS}'")
    record = {
        "inserted_at": proc_time,
        "updated_at": proc_time,
        "job_name": job_name,
        "proc_time": proc_time,
        "cron_pattern": cron_pattern,
        "status": IN_PROGRESS,
    }
    spark.createDataFrame([record]).write.mode("append").format("delta").saveAsTable(TBL_META_JOB_RUNS)
else:
    # Refuse to start while a row for this job is still IN_PROGRESS.
    # The check reuses the DataFrame already filtered by job name, so job_name
    # is compared as a value rather than spliced into a SQL string.
    df_in_progress = df.filter(col("status") == IN_PROGRESS)

    if df_in_progress.count() > 0:
        raise ValueError(f"Job {job_name} already has an {IN_PROGRESS} state. Skipping run")
    else:
        # The UPDATE has no status filter: every existing row for this job is
        # switched to IN_PROGRESS and stamped with the new processing time.
        # NOTE(review): earlier wording said only COMPLETED rows are updated;
        # confirm whether FAILED rows should be re-armed as well.
        # NOTE(review): job_name is interpolated into the SQL text below and
        # must not contain quote characters.
        print(f"Updating all records for job '{job_name}' to '{IN_PROGRESS}'")
        query = f"""
        UPDATE {TBL_META_JOB_RUNS}
        SET
            updated_at = '{proc_time}',
            proc_time = '{proc_time}',
            status = '{IN_PROGRESS}'
        WHERE job_name = '{job_name}'
        """
        print(query)
        spark.sql(query)
        print(f"Job {job_name} is marked '{IN_PROGRESS}'")

# Pass the updated proc_date to the next notebook
dbutils.jobs.taskValues.set("proc_date", proc_time_str)

In [0]:
# layer = dbutils.widgets.get("layer")
# dbutils.jobs.taskValues.set(key="layer", value=layer)