
## Notebook to cross check jobs vs workspace
This notebook validates that every notebook in the workspace is referenced by a job. It can also detect job tasks that still point to a notebook that has since been deleted from the workspace.

This notebook creates two tables:
1. dbx_user_notebook
2. dbx_user_job_task

In [None]:
# Declare the notebook parameters as text widgets, each with its default value.
for _widget_name, _widget_default in (
    ('schema_name', 'trsales_dm'),
    ('user_notebook_table', 'dbx_user_notebook'),
    ('user_job_task_table', 'dbx_user_job_task'),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values into the variables used by the cells below.
_read_widget = dbutils.widgets.get
in_schema_name = _read_widget('schema_name')
in_user_notebook_table = _read_widget('user_notebook_table')
in_user_job_task_table = _read_widget('user_job_task_table')

In [None]:
import requests
import os

In [None]:
# Connection settings, read from the environment.
# DBX_HOST is used as the URL prefix of every REST call below and DBX_TOKEN is
# sent as the bearer token. A missing variable raises KeyError right here,
# before any request is made.
DBX_HOST = os.environ['DBX_HOST']
DBX_TOKEN = os.environ['DBX_TOKEN']

REST API calls

In [None]:
class BearerAuth(requests.auth.AuthBase):
    """
    Requests authentication hook that adds a bearer token to outgoing requests.

    Parameters
    ----------
    token : str, optional
        token to send; when omitted the module-level ``DBX_TOKEN`` is used,
        so ``BearerAuth()`` keeps working exactly as before
    """

    def __init__(self, token=None):
        # Fall back to the global only when no explicit token is supplied,
        # which lets callers authenticate with a different token if needed.
        self.token = DBX_TOKEN if token is None else token

    def __call__(self, r):
        """Sets the ``authorization`` header on the request ``r`` and returns it."""
        r.headers["authorization"] = "Bearer " + self.token
        return r

In [None]:
def get_job_name(job_id: int, timeout: float = 60) -> dict:
    """
    Fetches the definition of a single job from the Jobs API.

    Despite its name, the function returns the whole decoded response
    (callers read ``job_id`` and ``settings`` from it), not only the job name.

    Parameters
    ----------
    job_id : int
        identifier of the job to fetch (a string holding the id works as well,
        because the value is sent as a query parameter)
    timeout : float
        number of seconds to wait for the server before giving up

    Returns
    -------
    dict
        decoded JSON body of the ``jobs/get`` response

    Raises
    ------
    requests.HTTPError
        if the API answers with an error status code
    """
    response = requests.get(
        f"{DBX_HOST}/api/2.1/jobs/get",
        # Let requests build (and URL-encode) the query string.
        params={"job_id": job_id},
        auth=BearerAuth(),
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of a KeyError later on,
    # when the caller indexes into an error payload.
    response.raise_for_status()
    return response.json()

In [None]:
def list_jobs(page_size: int = 100, timeout: float = 60) -> dict:
    """
    Lists all jobs of the workspace, following pagination until the last page.

    Parameters
    ----------
    page_size : int
        number of jobs requested per API call
    timeout : float
        number of seconds to wait for the server on each call

    Returns
    -------
    dict
        a dictionary shaped like a single ``jobs/list`` response: the ``jobs``
        key holds the jobs of all pages (an empty list when there are none)
        and ``has_more`` is always ``False``

    Raises
    ------
    requests.HTTPError
        if the API answers with an error status code
    """
    url = f"{DBX_HOST}/api/2.1/jobs/list"
    all_jobs = []
    # GET parameters belong in the query string, not in a JSON body.
    params = {"limit": page_size}

    while True:
        response = requests.get(url, params=params, auth=BearerAuth(), timeout=timeout)
        response.raise_for_status()
        page = response.json()

        # The 'jobs' key is absent when a page contains no jobs.
        page_jobs = page.get("jobs", [])
        all_jobs.extend(page_jobs)

        # Stop on the last page; the empty-page check guards against looping
        # forever if the server keeps reporting has_more without returning data.
        if not page.get("has_more") or not page_jobs:
            break

        next_page_token = page.get("next_page_token")
        if next_page_token:
            params = {"limit": page_size, "page_token": next_page_token}
        else:
            # Responses without page tokens are paged by offset instead.
            params = {"limit": page_size, "offset": len(all_jobs)}

    return {"jobs": all_jobs, "has_more": False}

In [None]:
def list_workspace(path: str = '/Shared/', timeout: float = 60) -> list:
    """
    Lists the objects located directly under a workspace path.

    Parameters
    ----------
    path : str
        workspace path to list; defaults to '/Shared/'
    timeout : float
        number of seconds to wait for the server before giving up

    Returns
    -------
    list
        the entries of the ``objects`` key of the response, or an empty list
        when the response has no such key (an empty folder)

    Raises
    ------
    requests.HTTPError
        if the API answers with an error status code
    """
    response = requests.get(
        f"{DBX_HOST}/api/2.0/workspace/list",
        # GET parameters belong in the query string, not in a JSON body.
        params={"path": path},
        auth=BearerAuth(),
        timeout=timeout,
    )
    response.raise_for_status()
    # An empty folder yields a response without 'objects'; return [] so that
    # callers can iterate over the result without a special case.
    return response.json().get('objects', [])

Utility functions
- listing notebooks in jobs
- listing notebooks in workspace
- saving results to tables

In [None]:
def list_notebook_in_jobs() -> list:
    """
    For all jobs, lists the notebook tasks assigned to the job.

    Tasks that do not run a notebook (they have no ``notebook_task`` entry)
    are skipped, because they have no notebook path to compare.

    Returns
    -------
    list
        one ``[job_id, job_name, task_key, notebook_path]`` entry per
        notebook task
    """
    # The job list is only used for the ids; the tasks are read from the
    # per-job call below.
    job_ids = [job['job_id'] for job in list_jobs().get('jobs', [])]

    tasks_in_jobs = []
    for job_id in job_ids:
        job = get_job_name(job_id)
        settings = job['settings']
        for task in settings.get('tasks', []):
            notebook_task = task.get('notebook_task')
            if notebook_task is None:
                # Not a notebook task (e.g. a JAR or Python script task).
                continue
            tasks_in_jobs.append([
                job['job_id'],
                settings['name'],
                task['task_key'],
                notebook_task['notebook_path'],
            ])

    return tasks_in_jobs

In [None]:
def list_notebooks_in_workspace(workspace_path) -> []:
    """
    Collects the notebooks stored under a workspace path.

    Every directory found on the way is listed as well, so the whole
    directory tree below ``workspace_path`` is walked recursively.

    Parameters
    ----------
    workspace_path : str
        workspace path where the search starts

    Returns
    -------
    list
        one ``[notebook_path, notebook_name]`` entry per notebook, where the
        name is the last segment of the path
    """
    found = []
    for entry in list_workspace(workspace_path):
        entry_type = entry['object_type']

        if entry_type == 'NOTEBOOK':
            full_path = entry['path']
            found.append([full_path, full_path.rsplit('/', 1)[-1]])
            continue

        if entry_type == 'DIRECTORY':
            # Descend into the sub-folder and keep its notebooks in place.
            found += list_notebooks_in_workspace(entry['path'])

    return found

In [None]:
def store_dbx_job_notebooks(tasks_in_jobs: list, schema_name: str, user_job_task_table: str):
    """
    Saves the notebook tasks of the jobs to the destination defined in the parameters.

    The target table is overwritten on every call.

    Parameters
    ----------
    tasks_in_jobs : list
        rows to store, each one ``[job_id, job_name, task_name, task_path]``
    schema_name : str
        target schema
    user_job_task_table : str
        target table where results will be stored
    """
    # An explicit schema (same column types that would be inferred from
    # int / str values) keeps the call working for an empty list, where
    # schema inference has nothing to look at and raises.
    table_schema = 'job_id long, job_name string, task_name string, task_path string'
    df = spark.createDataFrame(tasks_in_jobs, table_schema)
    df.write.mode("overwrite").saveAsTable(f'{schema_name}.{user_job_task_table}')


In [None]:
def store_dbx_workspace_notebooks(workspace_notebooks: list, schema_name: str, user_notebook_table: str):
    """
    Saves the notebooks found in the workspace to the destination defined in the parameters.

    The target table is overwritten on every call.

    Parameters
    ----------
    workspace_notebooks : list
        rows to store, each one ``[notebook_path, notebook_name]``
    schema_name : str
        target schema
    user_notebook_table : str
        target table where results will be stored
    """
    # An explicit schema keeps the call working for an empty list, where
    # schema inference has nothing to look at and raises.
    table_schema = 'notebook_path string, notebook_name string'
    df = spark.createDataFrame(workspace_notebooks, table_schema)
    df.write.mode('overwrite').saveAsTable(f'{schema_name}.{user_notebook_table}')

Querying API, saving results and final comparison

In [None]:
# Root workspace folder whose notebooks are compared against the jobs.
workspace_name = '/Shared/'
# Collect the notebook tasks of the jobs and store them in the job/task table.
job_notebooks = list_notebook_in_jobs()
store_dbx_job_notebooks(job_notebooks,in_schema_name, in_user_job_task_table)
# Collect the notebooks below the root folder and store them in the notebook table.
workspace_notebooks = list_notebooks_in_workspace(workspace_name)
store_dbx_workspace_notebooks(workspace_notebooks, in_schema_name, in_user_notebook_table)

In [None]:
%sql

-- Compare the stored workspace notebooks with the stored job tasks and
-- return only the rows that have no counterpart on the other side:
--   * notebook columns NULL -> a job task points to a path that is not in the notebook list
--   * job task columns NULL -> a notebook is not used by any stored job task
-- Notebooks whose path contains '90 debug' are removed before the join, so a
-- job task that points to such a notebook is reported as having no notebook.
with user_notebooks as (
select * from ${schema_name}.${user_notebook_table} un
where notebook_path not like '%90 debug%'
)
select * from user_notebooks un
full outer join ${schema_name}.${user_job_task_table} ujn on (un.notebook_path = ujn.task_path)
where un.notebook_path is null or ujn.task_path is null