## Data Manipulation

In [1]:
import io
import os

import digitalhub as dh
import pandas as pd
import requests

In [2]:
# Name of the digitalhub project used throughout this notebook.
PROJECT = "project-etl-ci"
# Obtain the project handle (get_or_create_project: reuse or create, per its name).
project = dh.get_or_create_project(PROJECT)

In [3]:
# Display the project as a plain dictionary (the last expression of a cell is rendered).
project.to_dict()

{'kind': 'project',
 'metadata': {'project': 'project-etl-ci',
  'name': 'project-etl-ci',
  'created': '2024-10-30T09:41:31.887Z',
  'updated': '2024-10-30T09:41:31.887Z',
  'created_by': 'khurshid@fbk.eu',
  'updated_by': 'khurshid@fbk.eu'},
 'spec': {'context': './',
  'functions': [],
  'artifacts': [],
  'workflows': [],
  'dataitems': [],
  'models': []},
 'status': {'state': 'CREATED'},
 'user': 'khurshid@fbk.eu',
 'id': 'project-etl-ci',
 'name': 'project-etl-ci',
 'key': 'store://project-etl-ci'}

In [4]:
# CSV export endpoint of the Bologna open-data portal for the 2023 traffic-loop
# ("spire") dataset; the query string asks for labels and ';' as delimiter (%3B).
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/rilevazione-flusso-veicoli-tramite-spire-anno-2023/exports/csv?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B"
# Local file the download cell writes the CSV to.
filename = "rilevazione-flusso-veicoli-tramite-spire-anno-2023.csv"

In [5]:
# Fetch the CSV export and store it locally.
# raise_for_status() keeps an HTTP error page from being saved as if it were data;
# the (connect, read) timeout keeps the cell from hanging on an unresponsive server.
with requests.get(URL, timeout=(10, 300)) as r:
    r.raise_for_status()
    with open(filename, "wb") as f:
        f.write(r.content)

In [6]:
# Load the downloaded file; the export uses ';' as the field separator.
df = pd.read_csv(filename, sep=";")

In [7]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,data,codice spira,00:00-01:00,01:00-02:00,02:00-03:00,03:00-04:00,04:00-05:00,05:00-06:00,06:00-07:00,07:00-08:00,...,ordinanza,stato,codimpsem,direzione,angolo,longitudine,latitudine,geopoint,ID_univoco_stazione_spira,giorno settimana
0,2023-03-25,0.127 3.89 6 1,97,55,31,16,15,38,77,194,...,4000/343434,A,156,N,348.0,11.373242,44.499532,"44.4995320603747, 11.3732419756135",207.0,Sabato
1,2023-03-25,0.127 3.89 8 1,52,25,12,14,5,13,24,69,...,4000/343434,A,156,E,242.0,11.372867,44.499893,"44.4998932893995, 11.3728670312021",208.0,Sabato
2,2023-03-25,0.127 3.90 6 1,167,94,69,37,42,62,85,151,...,4000/343434,A,232,NE,308.0,11.410369,44.524032,"44.52403243227343, 11.410369186317224",211.0,Sabato
3,2023-03-25,0.127 3.93 4 1,5,3,0,5,5,7,10,26,...,4000/343434,A,106,SE,217.0,11.381753,44.518132,"44.5181320170998, 11.381752851058",217.0,Sabato
4,2023-03-25,0.127 3.93 6 1,234,136,95,85,55,103,141,287,...,4000/343434,A,106,NE,321.0,11.381681,44.517642,"44.5176422952597, 11.3816808772175",218.0,Sabato


In [8]:
# Folder that receives the function sources written by the %%writefile cells.
new_folder = 'src'
# exist_ok=True makes the cell idempotent and removes the check-then-create race.
os.makedirs(new_folder, exist_ok=True)

In [9]:
%%writefile "src/download-data.py"

from digitalhub_runtime_python import handler

@handler(outputs=["dataset"])
def downloader(url):
    """Read the ';'-separated CSV behind the ``url`` input and return the frame.

    Reading and returning the data normalizes it and exports it under the
    ``dataset`` output declared on the handler.
    """
    return url.as_df(file_format='csv', sep=";")

Overwriting src/download-data.py


In [10]:
# Register src/download-data.py as a project function whose entry point is `downloader`.
func = project.new_function(
    name="download-data",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/download-data.py",
    handler="downloader",
)

In [11]:
# Same export endpoint as defined at the top of the notebook, repeated here
# so this cell can be read on its own.
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/rilevazione-flusso-veicoli-tramite-spire-anno-2023/exports/csv?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B"
# Table data item whose path is the remote CSV URL.
di = project.new_dataitem(
    name="url_data_item",
    kind="table",
    path=URL,
)

In [12]:
# Run the download function as a non-local job: the URL data item key is the
# `url` input and the result is published under the name "dataset".
run = func.run(
    action="job",
    inputs={'url': di.key},
    outputs={"dataset": "dataset"},
    local_execution=False,
)

Wait until the run is completed. The state of the job can be checked in the digitalhub application console or through the digitalhub SDK, as in the next cell.

In [20]:
# Refresh the run object and display its current state.
run.refresh().status.state

'COMPLETED'

Once the state is 'COMPLETED', retrieve the newly created dataset.

In [21]:
# Look up the data item named 'dataset' (the output name given to the download run).
dataset_di = project.get_dataitem('dataset')

In [22]:
# Materialize the data item as a frame for local inspection.
dataset_df = dataset_di.as_df()

In [23]:
# Preview the first rows of the stored dataset.
dataset_df.head()

Unnamed: 0,data,codice spira,00:00-01:00,01:00-02:00,02:00-03:00,03:00-04:00,04:00-05:00,05:00-06:00,06:00-07:00,07:00-08:00,...,ordinanza,stato,codimpsem,direzione,angolo,longitudine,latitudine,geopoint,ID_univoco_stazione_spira,giorno settimana
0,2023-03-25,0.127 3.89 6 1,97,55,31,16,15,38,77,194,...,4000/343434,A,156,N,348.0,11.373242,44.499532,"44.4995320603747, 11.3732419756135",207.0,Sabato
1,2023-03-25,0.127 3.89 8 1,52,25,12,14,5,13,24,69,...,4000/343434,A,156,E,242.0,11.372867,44.499893,"44.4998932893995, 11.3728670312021",208.0,Sabato
2,2023-03-25,0.127 3.90 6 1,167,94,69,37,42,62,85,151,...,4000/343434,A,232,NE,308.0,11.410369,44.524032,"44.52403243227343, 11.410369186317224",211.0,Sabato
3,2023-03-25,0.127 3.93 4 1,5,3,0,5,5,7,10,26,...,4000/343434,A,106,SE,217.0,11.381753,44.518132,"44.5181320170998, 11.381752851058",217.0,Sabato
4,2023-03-25,0.127 3.93 6 1,234,136,95,85,55,103,141,287,...,4000/343434,A,106,NE,321.0,11.381681,44.517642,"44.5176422952597, 11.3816808772175",218.0,Sabato


In [24]:
%%writefile "src/process-spire.py"

from digitalhub_runtime_python import handler

# Columns kept in the per-sensor ("spira") table.
KEYS = [
    'codice spira',
    'longitudine',
    'latitudine',
    'Livello',
    'tipologia',
    'codice',
    'codice arco',
    'codice via',
    'Nome via',
    'stato',
    'direzione',
    'angolo',
    'geopoint',
]


@handler(outputs=["dataset-spire"])
def process(project, di):
    """Build one row per 'codice spira' holding the columns listed in KEYS.

    The input frame is grouped by 'codice spira' and reduced with
    ``GroupBy.first()`` (first non-null value of each column per group);
    the group key is then restored as a regular column.
    """
    frame = di.as_df()
    first_per_spira = frame.groupby(['codice spira']).first()
    return first_per_spira.reset_index()[KEYS]

Overwriting src/process-spire.py


In [25]:
# Register src/process-spire.py as a project function whose entry point is `process`.
process_func = project.new_function(
    name="process-spire",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/process-spire.py",
    handler="process",
)

In [26]:
# Run the spire extraction as a non-local job on the downloaded dataset;
# the result is published under the name 'dataset-spire'.
process_run = process_func.run(
    action="job",
    inputs={'di': dataset_di.key},
    outputs={'dataset-spire': 'dataset-spire'},
    local_execution=False,
)

Wait until the run is completed. Check its state in the application console or through the digitalhub SDK, as in the next cell.

In [32]:
# Refresh the run object and display its current state.
process_run.refresh().status.state

'COMPLETED'

Once the run is completed, proceed with the next steps of the scenario.

In [33]:
# Fetch the 'dataset-spire' data item produced by the job above ...
spire_di = project.get_dataitem('dataset-spire')
# ... and load it as a frame for inspection.
spire_df = spire_di.as_df()

In [34]:
# Preview the per-sensor table.
spire_df.head()

Unnamed: 0,codice spira,longitudine,latitudine,Livello,tipologia,codice,codice arco,codice via,Nome via,stato,direzione,angolo,geopoint
0,0.127 1.1 6 1,11.354166,44.498535,1,spira,498.0,3312,19900,VIA G.BATTISTA DE ROLANDIS,A,N,342.0,"44.4985349106485, 11.3541657967424"
1,0.127 1.12 8 1,11.33897,44.495251,1,spira,1045.0,1016,5900,VIA CESARE BATTISTI,A,N,350.0,"44.4952505129043, 11.338970003537"
2,0.127 1.13 6 1,11.34642,44.491648,1,spira,130.0,1169,14700,VIA CASTIGLIONE,A,S,198.0,"44.4916483847646, 11.3464200565732"
3,0.127 1.14 4 1,11.339836,44.490116,1,spira,521.0,1050,59900,VIA URBANA,A,E,264.0,"44.4901162203284, 11.3398356513878"
4,0.127 1.15 2 1,11.343358,44.489507,1,spira,132.0,1064,25800,VIA GARIBALDI,A,N,347.0,"44.4895074220971, 11.3433581064329"


In [35]:
%%writefile "src/process-measures.py"

from digitalhub_runtime_python import handler
import pandas as pd

# Hourly slot labels '00:00-01:00' ... '23:00-24:00', one per count column.
KEYS = [f"{hour:02d}:00-{hour + 1:02d}:00" for hour in range(24)]
# Identifying columns carried along with every measure.
COLUMNS = ['data', 'codice spira']

@handler(outputs=["dataset-measures"])
def process(project, di):
    """Reshape the wide hourly counts into a long (time, sensor, value) table.

    For every hourly column in KEYS a frame is built with:
      * ``time``: the row's 'data' value, a space, and the slot's start hour
        (the part of the column label before '-'),
      * ``codice spira``: the sensor code,
      * ``value``: the count stored in that hourly column.
    The per-hour frames are concatenated in KEYS order and keep the
    original row index.

    Each frame is constructed explicitly instead of assigning new columns on a
    slice of the input, which triggered pandas' SettingWithCopyWarning.
    """
    df = di.as_df()
    # Select up front so a missing column fails immediately with a KeyError.
    rdf = df[COLUMNS + KEYS]
    frames = []
    for key in KEYS:
        start_hour = key.split("-")[0]
        frames.append(
            pd.DataFrame(
                {
                    # Vectorized concatenation replaces the row-wise apply.
                    'time': rdf['data'] + ' ' + start_hour,
                    'codice spira': rdf['codice spira'],
                    'value': rdf[key],
                }
            )
        )
    return pd.concat(frames)

Overwriting src/process-measures.py


In [36]:
# Register src/process-measures.py as a project function whose entry point is `process`.
process_measures_func = project.new_function(
    name="process-measures",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/process-measures.py",
    handler="process",
)

In [37]:
# Run the measures reshaping as a non-local job on the downloaded dataset;
# the result is published under the name 'dataset-measures'.
process_measures_run = process_measures_func.run(
    action="job",
    inputs={'di': dataset_di.key},
    outputs={'dataset-measures': 'dataset-measures'},
    local_execution=False,
)

Wait until the run is completed. The state of the job can be checked in the application console or through the digitalhub SDK, as in the next cell.

In [45]:
# Refresh the run object and display its current state.
process_measures_run.refresh().status.state

'COMPLETED'

Once the state is 'COMPLETED', proceed with the next steps of the scenario.

In [46]:
# Fetch the 'dataset-measures' data item produced by the job above,
measures_di = project.get_dataitem('dataset-measures')
# load it as a frame,
measures_df = measures_di.as_df()
# and preview the long-format rows.
measures_df.head()

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,0.127 3.89 6 1,97
1,2023-03-25 00:00,0.127 3.89 8 1,52
2,2023-03-25 00:00,0.127 3.90 6 1,167
3,2023-03-25 00:00,0.127 3.93 4 1,5
4,2023-03-25 00:00,0.127 3.93 6 1,234


## Pipeline

In [47]:
%%writefile "src/pipeline.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def pipeline(url):
    """Declare the three-step workflow: download, then two processing steps.

    The "download-data" step receives ``url`` and exposes a "dataset" output.
    The "process-spire" and "process-measures" steps each take that output
    as their ``di`` input.
    """
    with pipeline_context() as pc:
        downloader = pc.step(
            name="download-data",
            function="download-data",
            action="job",
            inputs={"url": url},
            outputs={"dataset": "dataset"},
        )

        # Step name and function name coincide for both downstream steps.
        for step_name in ("process-spire", "process-measures"):
            pc.step(
                name=step_name,
                function=step_name,
                action="job",
                inputs={"di": downloader.outputs["dataset"]},
            )

Overwriting src/pipeline.py


In [48]:
# Register src/pipeline.py as a KFP workflow whose entry point is `pipeline`.
workflow = project.new_workflow(
    name="pipeline",
    kind="kfp",
    code_src="src/pipeline.py",
    handler="pipeline",
)

In [72]:
# Start the workflow, passing the key of the URL data item as the pipeline's `url` parameter.
wf_run = workflow.run(parameters={"url": di.key})

In [79]:
# Refresh the workflow run and display its current state.
wf_run.refresh().status.state

'RUNNING'

## Serve

In [80]:
%%writefile 'src/api.py'

def init_context(context):
    """Load the 'dataset-measures' data item and attach its frame to the context as ``df``."""
    measures_item = context.project.get_dataitem('dataset-measures')
    context.df = measures_item.as_df()

def _int_field(fields, name, default):
    """Return ``fields[name]`` parsed as an int, or ``default`` if absent or not an integer."""
    if name not in fields:
        return default
    try:
        return int(fields[name])
    except (TypeError, ValueError):
        return default


def handler(context, event):
    """Serve one page of the frame stored on ``context.df`` (mock REST API).

    Query fields read from ``event.fields``:
        page: zero-based page index (default 0; negative values become 0).
        size: rows per page (default 50; clamped to the range 1..100).
    Values that cannot be parsed as integers fall back to the defaults
    instead of failing the request.

    Returns:
        "" when ``context.df`` is None, otherwise a dict with ``data`` (the
        page rows as a JSON string, ``orient="records"``), ``page``, ``size``
        and ``total`` (row count of the whole frame).
    """
    df = context.df

    if df is None:
        return ""

    fields = event.fields

    # pagination
    page = max(_int_field(fields, "page", 0), 0)
    page_size = min(max(_int_field(fields, "size", 50), 1), 100)

    total = len(df)
    start = page * page_size
    end = min(start + page_size, total)

    # Kept as a JSON *string* (not a parsed list): clients decode it themselves.
    payload = df.iloc[start:end].to_json(orient="records")

    return {"data": payload, "page": page, "size": page_size, "total": total}

Overwriting src/api.py


In [81]:
# Register src/api.py as a project function: `handler` serves requests and
# `init_context` is declared as the init function.
api_func = project.new_function(
    name="api",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/api.py",
    handler="handler",
    init_function="init_context",
)

In [82]:
# Launch the API function with the "serve" action; the returned run is polled in the cells below.
run_serve_model = api_func.run(action="serve")

Wait until the serve run reaches the 'RUNNING' state (a service keeps running rather than completing). Check its state in the application console or query it through the digitalhub SDK, as in the next cell.

In [83]:
# Refresh the serve run and display its current state.
run_serve_model.refresh().status.state

'RUNNING'

In [91]:
# Refresh the serve run and read the service description from its status.
service = run_serve_model.refresh().status.service
# Display it; the 'url' entry is used to build the request URL in the next cell.
service

{'name': 's-pythonserve-103fe8c2-1c80-4d6a-82dc-dbb8d8a5de22',
 'namespace': 'digitalhub-test',
 'type': 'NodePort',
 'clusterIP': '10.98.214.182',
 'ports': [{'name': 'port8080',
   'nodePort': 31624,
   'port': 8080,
   'protocol': 'TCP',
   'targetPort': 8080}],
 'url': 's-pythonserve-103fe8c2-1c80-4d6a-82dc-dbb8d8a5de22.digitalhub-test:8080'}

Once the 'service' object is available, proceed by calling the deployed service.

In [92]:
# Base URL of the deployed service, built from the 'url' entry of the run's service status.
# NOTE(review): this reads run_serve_model.status without calling refresh() here;
# it presumably relies on the refresh in the previous cell. Confirm.
SERVICE_URL = f"http://{run_serve_model.status.to_dict()['service']['url']}"
SERVICE_URL

'http://s-pythonserve-103fe8c2-1c80-4d6a-82dc-dbb8d8a5de22.digitalhub-test:8080'

In [93]:
# Request page 5 with 10 rows per page from the deployed service.
with requests.get(f'{SERVICE_URL}/?page=5&size=10', timeout=30) as r:
    # Surface HTTP errors directly instead of a confusing JSON decode failure.
    r.raise_for_status()
    res = r.json()
print(res)

{'data': '[{"time":"2023-03-25 00:00","codice spira":"1.8 1.11 6 1","value":100},{"time":"2023-03-25 00:00","codice spira":"1.8 1.12 2 1","value":140},{"time":"2023-03-25 00:00","codice spira":"1.8 1.18 6 1","value":106},{"time":"2023-03-25 00:00","codice spira":"1.9 1.8 2 1","value":146},{"time":"2023-03-25 00:00","codice spira":"1.11 1.8 2 1","value":155},{"time":"2023-03-25 00:00","codice spira":"1.12 1.8 6 1","value":98},{"time":"2023-03-25 00:00","codice spira":"1.13 1.19 8 1","value":0},{"time":"2023-03-25 00:00","codice spira":"1.16 1.17 8 1","value":28},{"time":"2023-03-25 00:00","codice spira":"1.17 4.19 6 1","value":151},{"time":"2023-03-25 00:00","codice spira":"1.18 1.10 4 1","value":78}]', 'page': 5, 'size': 10, 'total': 7513608}


In [94]:
# The service returns the page rows as a JSON string under 'data'. Wrap it in
# StringIO because passing literal JSON text to pd.read_json is deprecated.
rdf = pd.read_json(io.StringIO(res['data']), orient='records')
rdf.head()

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,1.8 1.11 6 1,100
1,2023-03-25 00:00,1.8 1.12 2 1,140
2,2023-03-25 00:00,1.8 1.18 6 1,106
3,2023-03-25 00:00,1.9 1.8 2 1,146
4,2023-03-25 00:00,1.11 1.8 2 1,155
