In [None]:
from pathlib import Path
# Create the local "src" directory (no error if it already exists); the
# %%writefile cells below save their scripts into it.
Path("src").mkdir(exist_ok=True)

In [None]:
import digitalhub as dh

In [None]:
# Name of the project that every later cell works in.
PROJECT = "project-etl-ci"
# `project` is the handle used below to register functions, data items and
# the workflow.
project = dh.get_or_create_project(PROJECT)

In [None]:
%%writefile "src/download-data.py"

from digitalhub_runtime_python import handler

@handler(outputs=["dataset"])
def downloader(url):
    """Load the semicolon-separated CSV behind *url* as a dataframe.

    Args:
        url: Input object exposing ``as_df``; the notebook binds it to the
            URL data item.

    Returns:
        The result of ``url.as_df`` read as CSV with ``;`` as separator. The
        ``@handler`` decorator declares it under the output name "dataset".
    """
    table = url.as_df(file_format='csv', sep=";")
    return table

In [None]:
# Register src/download-data.py as the "download-data" Python function;
# `handler` names the entry point defined in that file.
func = project.new_function(name="download-data",
                            kind="python",
                            python_version="PYTHON3_10",
                            code_src="src/download-data.py",
                            handler="downloader")

In [None]:
# CSV export from the Bologna open-data portal; the `delimiter=%3B` query
# parameter requests ";" as the field separator.
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/rilevazione-flusso-veicoli-tramite-spire-anno-2023/exports/csv?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B"
# Register the URL as a "table" data item so its key can be passed to the
# download function as an input.
di = project.new_dataitem(name="url_data_item",
                          kind="table",
                          path=URL)

In [None]:
# Run the function with the "job" action, binding its `url` parameter to the
# key of the URL data item.
# NOTE(review): wait=True presumably blocks until the run finishes - confirm
# against the SDK documentation.
run = func.run("job",
               inputs={'url': di.key},
               wait=True)

Wait until the 'run' job is completed. You can view the state of the job from the DigitalHub application console or with a DigitalHub SDK API call.

In [None]:
# Refresh the run object and display its current state.
run.refresh().status.state

Once the state is 'Completed', retrieve the newly created dataset.

In [None]:
# "dataset" is the output name declared by the downloader's @handler decorator.
dataset_di = project.get_dataitem('dataset')
# Preview the first rows of the downloaded table.
dataset_di.as_df().head()

In [None]:
%%writefile "src/process-spire.py"

from digitalhub_runtime_python import handler

# Columns kept in the output, in this order; `process` below selects them
# after grouping by 'codice spira'.
KEYS=['codice spira','longitudine','latitudine',
      'Livello','tipologia','codice','codice arco',
      'codice via','Nome via', 'stato','direzione',
      'angolo','geopoint']

@handler(outputs=["dataset-spire"])
def process(project, di):
    """Reduce the input table to one row per 'codice spira'.

    Args:
        project: Not used by this function.
        di: Input object exposing ``as_df()``.

    Returns:
        A dataframe holding, for each 'codice spira' group, the values given
        by ``GroupBy.first()``, restricted to the ``KEYS`` columns. The
        ``@handler`` decorator declares it under the output name
        "dataset-spire".
    """
    frame = di.as_df()
    first_per_spira = frame.groupby(['codice spira']).first()
    # reset_index turns the grouping key back into a regular column so that
    # it can be selected through KEYS.
    return first_per_spira.reset_index()[KEYS]

In [None]:
# Register src/process-spire.py as the "process-spire" Python function;
# `handler` names the entry point defined in that file.
process_func = project.new_function(name="process-spire",
                                    kind="python",
                                    python_version="PYTHON3_10",
                                    code_src="src/process-spire.py",
                                    handler="process")

In [None]:
# Run the function with the "job" action, binding its `di` parameter to the
# key of the downloaded dataset.
process_run = process_func.run("job",
                               inputs={'di':dataset_di.key},
                               wait=True)

Wait until the 'run' job is completed. Check the state of the run from the application console or with a DigitalHub SDK API call.

In [None]:
# Refresh the run object and display its current state.
process_run.refresh().status.state

Once it is completed, proceed with the next steps of the scenario.

In [None]:
# Preview the run output named "dataset-spire" (the name declared by the
# function's @handler decorator).
process_run.output('dataset-spire').as_df().head()

In [None]:
%%writefile "src/process-measures.py"

from digitalhub_runtime_python import handler
import pandas as pd

# Hourly slot columns of the source table; `process` below turns each of them
# into its own set of rows.
KEYS = ['00:00-01:00', '01:00-02:00', '02:00-03:00', '03:00-04:00',
        '04:00-05:00', '05:00-06:00', '06:00-07:00', '07:00-08:00',
        '08:00-09:00', '09:00-10:00', '10:00-11:00', '11:00-12:00',
        '12:00-13:00', '13:00-14:00', '14:00-15:00', '15:00-16:00',
        '16:00-17:00', '17:00-18:00', '18:00-19:00', '19:00-20:00',
        '20:00-21:00', '21:00-22:00', '22:00-23:00', '23:00-24:00']
# Identifying columns read alongside every hourly value.
COLUMNS=['data','codice spira']

@handler(outputs=["dataset-measures"])
def process(project, di):
    """Reshape the hourly slot columns into one row per (slot, source row).

    For every column listed in ``KEYS`` the function emits the rows
    ``(time, 'codice spira', value)``, where ``time`` is the row's ``data``
    value followed by a space and the slot's start (the part of the column
    name before ``-``) and ``value`` is the content of that slot column.

    Args:
        project: Not used by this function.
        di: Input object exposing ``as_df()``; the resulting table must
            contain every column in ``COLUMNS`` and ``KEYS``.

    Returns:
        The per-slot frames concatenated in ``KEYS`` order. Each frame keeps
        the source row index, so index labels repeat once per slot. The
        ``@handler`` decorator declares it under the output name
        "dataset-measures".

    Raises:
        KeyError: If a required column is missing from the input table.
    """
    df = di.as_df()
    # Select up front so that a missing column fails before any work is done.
    rdf = df[COLUMNS + KEYS]
    frames = []
    for key in KEYS:
        slot_start = key.split("-")[0]
        # Build a fresh frame for each slot instead of adding columns to a
        # slice of `rdf`: assigning onto a slice triggers pandas'
        # SettingWithCopyWarning.
        frames.append(
            pd.DataFrame(
                {
                    'time': rdf['data'].apply(lambda day: day + ' ' + slot_start),
                    'codice spira': rdf['codice spira'],
                    'value': rdf[key],
                }
            )
        )
    return pd.concat(frames)

In [None]:
# Register src/process-measures.py as the "process-measures" Python function;
# `handler` names the entry point defined in that file.
process_measures_func = project.new_function(name="process-measures",
                                             kind="python",
                                             python_version="PYTHON3_10",
                                             code_src="src/process-measures.py",
                                             handler="process")

In [None]:
# Run the function with the "job" action, binding its `di` parameter to the
# key of the downloaded dataset.
process_measures_run = process_measures_func.run("job",
                                                 inputs={'di':dataset_di.key},
                                                 wait=True)

Wait until the 'run' job is completed. You can see its state from the application console or by using the DigitalHub SDK API.

In [None]:
# Refresh the run object and display its current state.
process_measures_run.refresh().status.state

Once the state is 'Completed', proceed with the next steps of the scenario.

In [None]:
# Preview the run output named "dataset-measures" (the name declared by the
# function's @handler decorator).
process_measures_run.output('dataset-measures').as_df().head()

## Serve

In [None]:
%%writefile 'src/api.py'

def init_context(context):
    """Load the 'dataset-measures' table and attach it to *context* as ``df``.

    Args:
        context: Object exposing ``project.get_dataitem``; it receives the
            dataframe as its ``df`` attribute, which ``handler`` reads.
    """
    measures = context.project.get_dataitem('dataset-measures')
    context.df = measures.as_df()

def _int_field(fields, name, default):
    """Return ``fields[name]`` as an int, or *default* if absent or malformed."""
    try:
        return int(fields[name])
    except (KeyError, TypeError, ValueError):
        return default


def handler(context, event):
    """Serve one page of the dataframe stored on *context* as JSON records.

    Pagination is driven by the ``page`` (zero-based, default 0) and ``size``
    (default 50) entries of ``event.fields``. Negative pages are treated as
    page 0 and the size is clamped to the range 1..100. Missing or
    non-numeric values fall back to the defaults instead of raising.

    Args:
        context: Object whose ``df`` attribute holds the dataframe (set by
            ``init_context``) or ``None``.
        event: Request object; only ``event.fields`` is read.

    Returns:
        ``""`` when no dataframe is available, otherwise a dict with:
        ``data`` (the page rows as a JSON string, ``"[]"`` when the page lies
        past the end), ``page``, ``size`` (the effective values) and
        ``total`` (the number of rows in the dataframe).
    """
    df = context.df

    if df is None:
        return ""

    fields = event.fields

    # pagination: clamp the request into a sane window
    page = max(_int_field(fields, "page", 0), 0)
    page_size = min(max(_int_field(fields, "size", 50), 1), 100)

    total = len(df)
    start = page * page_size
    end = min(start + page_size, total)

    # A page past the end gives start >= end, i.e. an empty slice.
    records = df.iloc[start:end].to_json(orient="records")

    return {"data": records, "page": page, "size": page_size, "total": total}

In [None]:
# Register src/api.py as the "api" Python function. `handler` and
# `init_function` name the two functions defined in that file.
api_func = project.new_function(name="api",
                                kind="python",
                                python_version="PYTHON3_10",
                                code_src="src/api.py",
                                handler="handler",
                                init_function="init_context")

In [None]:
# Start the function with the "serve" action.
run_serve_model = api_func.run("serve", wait=True)

Wait until the 'serve' job is completed. Check the application console to view its state, or query it via a DigitalHub SDK API call.

In [None]:
# Build the request URL from the service address reported on the run status,
# asking for page 5 with 10 rows per page.
svc_url = f"http://{run_serve_model.status.service['url']}/?page=5&size=10"
# Call the service and decode the JSON body of the response.
res = run_serve_model.invoke(url=svc_url).json()

In [None]:
from io import StringIO

import pandas as pd

# The service returns the page rows as a JSON string under "data". Wrap it in
# a file-like buffer: passing a literal JSON string to read_json is
# deprecated since pandas 2.1.
rdf = pd.read_json(StringIO(res['data']), orient='records')
rdf.head()

## Pipeline

In [None]:
%%writefile "src/pipeline.py"
from hera.workflows import Workflow, DAG, Parameter
from digitalhub_runtime_hera.dsl import step


def pipeline():
    """Build the workflow chaining download, both processing jobs and serving.

    The workflow takes a single ``url`` parameter and defines a DAG of four
    steps: ``download-data`` (A), ``process-spire`` (B), ``process-measures``
    (C) and the ``api`` serve step (D). B and C both consume A's "dataset"
    output; D consumes C's "dataset-measures" output.

    Returns:
        The ``Workflow`` object built inside the context manager.
    """
    with Workflow(entrypoint="dag", arguments=Parameter(name="url")) as w:

        with DAG(name="dag"):
            # A: download job fed by the workflow-level `url` parameter.
            A = step(template={"action":"job", "inputs": {"url": "{{workflow.parameters.url}}"}},
                     function="download-data",
                     outputs=["dataset"])
            # B: per-spira reduction of A's "dataset" output.
            B = step(template={"action":"job", "inputs": {"di": "{{inputs.parameters.di}}"}},
                     function="process-spire",
                     inputs={"di": A.get_parameter("dataset")})
            # C: hourly measures derived from A's "dataset" output.
            C = step(template={"action":"job", "inputs": {"di": "{{inputs.parameters.di}}"}},
                     function="process-measures",
                     inputs={"di": A.get_parameter("dataset")},
                     outputs=["dataset-measures"])
            # D: serve the API on top of C's "dataset-measures" output.
            # NOTE(review): `init_parameters` passes "dataitem", but the
            # `init_context` written to src/api.py accepts only `context` and
            # looks up 'dataset-measures' by name - confirm the runtime
            # tolerates the extra init parameter.
            D = step(template={"action": "serve", "init_parameters": {"dataitem": "{{inputs.parameters.dataitem}}"}},
                     function="api",
                     inputs={"dataitem": C.get_parameter("dataset-measures")})
            # Ordering: B and C come after A; D comes after C.
            A >> [B, C]
            C >> D
    return w

In [None]:
# Register src/pipeline.py as the "pipeline" workflow of kind "hera";
# `handler` names the function in that file that builds the workflow.
workflow = project.new_workflow(name="pipeline",
                                kind="hera",
                                code_src="src/pipeline.py",
                                handler="pipeline")

In [None]:
# Run the workflow's "build" action first.
# NOTE(review): presumably this prepares the pipeline definition before it
# can be executed - confirm against the SDK documentation.
wf_run = workflow.run("build", wait=True)

In [None]:
# Execute the pipeline, passing the key of the URL data item as the
# workflow's "url" parameter. This rebinds `wf_run` from the build run.
wf_run = workflow.run("pipeline", parameters={"url": di.key}, wait=True)