In [1]:
import digitalhub as dh
import pandas as pd
import requests
import os

# Project

In [2]:
PROJECT = "demo-etl"
project = dh.get_or_create_project(PROJECT)

In [3]:
print(project)

{'id': 'demo-etl', 'name': 'demo-etl', 'kind': 'project', 'key': 'store://demo-etl', 'metadata': {'project': 'demo-etl', 'name': 'demo-etl', 'created': '2024-07-12T09:11:54.785Z', 'updated': '2024-07-12T09:11:54.785Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'context': './', 'functions': [], 'artifacts': [], 'workflows': [], 'dataitems': [], 'models': []}, 'status': {'state': 'CREATED'}, 'user': 'tenant1userid'}


# Explore

In [4]:
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/rilevazione-flusso-veicoli-tramite-spire-anno-2023/exports/csv?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B"
filename = "rilevazione-flusso-veicoli-tramite-spire-anno-2023.csv"

In [5]:
with requests.get(URL) as r:
    with open(filename, "wb") as f:
        f.write(r.content)


In [6]:
df = pd.read_csv(filename, sep=";")

In [7]:
df.head()

Unnamed: 0,data,codice spira,00:00-01:00,01:00-02:00,02:00-03:00,03:00-04:00,04:00-05:00,05:00-06:00,06:00-07:00,07:00-08:00,...,ordinanza,stato,codimpsem,direzione,angolo,longitudine,latitudine,geopoint,ID_univoco_stazione_spira,giorno settimana
0,2023-03-25,0.127 3.89 6 1,97,55,31,16,15,38,77,194,...,4000/343434,A,156,N,348.0,11.373242,44.499532,"44.4995320603747, 11.3732419756135",207.0,Sabato
1,2023-03-25,0.127 3.89 8 1,52,25,12,14,5,13,24,69,...,4000/343434,A,156,E,242.0,11.372867,44.499893,"44.4998932893995, 11.3728670312021",208.0,Sabato
2,2023-03-25,0.127 3.90 6 1,167,94,69,37,42,62,85,151,...,4000/343434,A,232,NE,308.0,11.410369,44.524032,"44.52403243227343, 11.410369186317224",211.0,Sabato
3,2023-03-25,0.127 3.93 4 1,5,3,0,5,5,7,10,26,...,4000/343434,A,106,SE,217.0,11.381753,44.518132,"44.5181320170998, 11.381752851058",217.0,Sabato
4,2023-03-25,0.127 3.93 6 1,234,136,95,85,55,103,141,287,...,4000/343434,A,106,NE,321.0,11.381681,44.517642,"44.5176422952597, 11.3816808772175",218.0,Sabato


In [8]:
df.dtypes

data                          object
codice spira                  object
00:00-01:00                    int64
01:00-02:00                    int64
02:00-03:00                    int64
03:00-04:00                    int64
04:00-05:00                    int64
05:00-06:00                    int64
06:00-07:00                    int64
07:00-08:00                    int64
08:00-09:00                    int64
09:00-10:00                    int64
10:00-11:00                    int64
11:00-12:00                    int64
12:00-13:00                    int64
13:00-14:00                    int64
14:00-15:00                    int64
15:00-16:00                    int64
16:00-17:00                    int64
17:00-18:00                    int64
18:00-19:00                    int64
19:00-20:00                    int64
20:00-21:00                    int64
21:00-22:00                    int64
22:00-23:00                    int64
23:00-24:00                    int64
id_uni                         int64
L

In [9]:
df.size

14088015

## 1. Collect the data

Create a new folder to store the functions in:

In [10]:
new_folder = 'src'
if not os.path.exists(new_folder):
    os.makedirs(new_folder)

Define and export a function for downloading data and persisting into repository

In [11]:
%%writefile "src/download-data.py"

from digitalhub_runtime_python import handler
import pandas as pd
import requests

@handler(outputs=["dataset"])
def downloader(project, url):
    # read and rewrite to normalize and export as data
    df = url.as_df(file_format='csv',sep=";")
    return df

Writing src/download-data.py


register the function

In [12]:
func = project.new_function(
                         name="download-data",
                         kind="python",
                         python_version="PYTHON3_9",
                         source={"source": "src/download-data.py", "handler": "downloader"})

For the function to be executed, we need to pass it a reference to the data item. Let us create and register the corresponding data item:

In [13]:
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/rilevazione-flusso-veicoli-tramite-spire-anno-2023/exports/csv?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B"
di = project.new_dataitem(name="url_data_item",kind="table",path=URL)

Then, execute the function (locally) as a single job. Note that it may take a few minutes.

In [14]:
run = func.run(action="job", inputs={'url':di.key}, outputs={"dataset": "dataset"}, local_execution=True)

2024-07-12 09:15:28,650 - INFO - Validating task.
2024-07-12 09:15:28,651 - INFO - Validating run.
2024-07-12 09:15:28,651 - INFO - Starting task.
2024-07-12 09:15:28,652 - INFO - Configuring execution.
2024-07-12 09:15:28,653 - INFO - Composing function arguments.
2024-07-12 09:15:28,669 - INFO - Executing run.
2024-07-12 09:17:08,263 - INFO - Task completed, returning run status.


The result will be saved as an artifact in the data store, versioned and addressable with a unique key. The name of the artifact will be defined according to the mapping specified in outputs map: it maps the handler outputs (see the @handler annotation and its output definition) to the expected name.

In [15]:
dataset_di = project.get_dataitem('dataset')

In [16]:
dataset_df = dataset_di.as_df()

In [17]:
dataset_df.head()

Unnamed: 0,data,codice spira,00:00-01:00,01:00-02:00,02:00-03:00,03:00-04:00,04:00-05:00,05:00-06:00,06:00-07:00,07:00-08:00,...,ordinanza,stato,codimpsem,direzione,angolo,longitudine,latitudine,geopoint,ID_univoco_stazione_spira,giorno settimana
0,2023-03-25,0.127 3.89 6 1,97,55,31,16,15,38,77,194,...,4000/343434,A,156,N,348.0,11.373242,44.499532,"44.4995320603747, 11.3732419756135",207.0,Sabato
1,2023-03-25,0.127 3.89 8 1,52,25,12,14,5,13,24,69,...,4000/343434,A,156,E,242.0,11.372867,44.499893,"44.4998932893995, 11.3728670312021",208.0,Sabato
2,2023-03-25,0.127 3.90 6 1,167,94,69,37,42,62,85,151,...,4000/343434,A,232,NE,308.0,11.410369,44.524032,"44.52403243227343, 11.410369186317224",211.0,Sabato
3,2023-03-25,0.127 3.93 4 1,5,3,0,5,5,7,10,26,...,4000/343434,A,106,SE,217.0,11.381753,44.518132,"44.5181320170998, 11.381752851058",217.0,Sabato
4,2023-03-25,0.127 3.93 6 1,234,136,95,85,55,103,141,287,...,4000/343434,A,106,NE,321.0,11.381681,44.517642,"44.5176422952597, 11.3816808772175",218.0,Sabato


# 2. Process the data

Raw data (as ingested from remote API) is usually not suitable for consumption. We'll define a set of functions to derive data as required by the scenario.

### Extract *spire* information
extract information about the _spire_ (for example `id`,`geolocation`,`address`, `name`...)

In [18]:
sdf= dataset_df.groupby(['codice spira']).first().reset_index()[['codice spira','longitudine','latitudine','Livello','tipologia','codice','codice arco','codice via','Nome via', 'stato','direzione','angolo','geopoint']]

In [19]:
sdf.head()

Unnamed: 0,codice spira,longitudine,latitudine,Livello,tipologia,codice,codice arco,codice via,Nome via,stato,direzione,angolo,geopoint
0,0.127 1.1 6 1,11.354166,44.498535,1,spira,498.0,3312,19900,VIA G.BATTISTA DE ROLANDIS,A,N,342.0,"44.4985349106485, 11.3541657967424"
1,0.127 1.12 8 1,11.33897,44.495251,1,spira,1045.0,1016,5900,VIA CESARE BATTISTI,A,N,350.0,"44.4952505129043, 11.338970003537"
2,0.127 1.13 6 1,11.34642,44.491648,1,spira,130.0,1169,14700,VIA CASTIGLIONE,A,S,198.0,"44.4916483847646, 11.3464200565732"
3,0.127 1.14 4 1,11.339836,44.490116,1,spira,521.0,1050,59900,VIA URBANA,A,E,264.0,"44.4901162203284, 11.3398356513878"
4,0.127 1.15 2 1,11.343358,44.489507,1,spira,132.0,1064,25800,VIA GARIBALDI,A,N,347.0,"44.4895074220971, 11.3433581064329"


In [20]:
sdf['tipologia'].unique()

array(['spira', 'telecamera'], dtype=object)

define a function to derive the dataset and save in the store

In [21]:
%%writefile "src/process-spire.py"

from digitalhub_runtime_python import handler
import pandas as pd

KEYS=['codice spira','longitudine','latitudine','Livello','tipologia','codice','codice arco','codice via','Nome via', 'stato','direzione','angolo','geopoint']

@handler(outputs=["dataset-spire"])
def process(project, di):
    df = di.as_df()
    sdf= df.groupby(['codice spira']).first().reset_index()[KEYS]

    return sdf

Writing src/process-spire.py


register the function

In [22]:
process_func = project.new_function(
                         name="process-spire",
                         kind="python",
                         python_version="PYTHON3_9",
                         source={"source": "src/process-spire.py", "handler": "process"})


and execute (locally)

In [23]:
process_run = process_func.run(action="job", inputs={'di': dataset_di.key}, outputs={'dataset-spire': 'dataset-spire'}, local_execution=True)

2024-07-12 09:21:19,052 - INFO - Validating task.
2024-07-12 09:21:19,052 - INFO - Validating run.
2024-07-12 09:21:19,053 - INFO - Starting task.
2024-07-12 09:21:19,053 - INFO - Configuring execution.
2024-07-12 09:21:19,054 - INFO - Composing function arguments.
2024-07-12 09:21:19,075 - INFO - Executing run.
2024-07-12 09:21:19,452 - INFO - Task completed, returning run status.


The result of the execution will be saved as an artifact in the data store, with a unique key.

In [25]:
spire_di = project.get_dataitem('dataset-spire')
spire_df = spire_di.as_df()

In [26]:
spire_df.head()

Unnamed: 0,codice spira,longitudine,latitudine,Livello,tipologia,codice,codice arco,codice via,Nome via,stato,direzione,angolo,geopoint
0,0.127 1.1 6 1,11.354166,44.498535,1,spira,498.0,3312,19900,VIA G.BATTISTA DE ROLANDIS,A,N,342.0,"44.4985349106485, 11.3541657967424"
1,0.127 1.12 8 1,11.33897,44.495251,1,spira,1045.0,1016,5900,VIA CESARE BATTISTI,A,N,350.0,"44.4952505129043, 11.338970003537"
2,0.127 1.13 6 1,11.34642,44.491648,1,spira,130.0,1169,14700,VIA CASTIGLIONE,A,S,198.0,"44.4916483847646, 11.3464200565732"
3,0.127 1.14 4 1,11.339836,44.490116,1,spira,521.0,1050,59900,VIA URBANA,A,E,264.0,"44.4901162203284, 11.3398356513878"
4,0.127 1.15 2 1,11.343358,44.489507,1,spira,132.0,1064,25800,VIA GARIBALDI,A,N,347.0,"44.4895074220971, 11.3433581064329"


### Extract measures
extract measures for traffic as recorded by _spire_ (e.g. `time`,`value`)

In [27]:
keys = ['00:00-01:00',
         '01:00-02:00',
         '02:00-03:00',
         '03:00-04:00',
         '04:00-05:00',
         '05:00-06:00',
         '06:00-07:00',
         '07:00-08:00',
         '08:00-09:00',
         '09:00-10:00',
         '10:00-11:00',
         '11:00-12:00',
         '12:00-13:00',
         '13:00-14:00',
         '14:00-15:00',
         '15:00-16:00',
         '16:00-17:00',
         '17:00-18:00',
         '18:00-19:00',
         '19:00-20:00',
         '20:00-21:00',
         '21:00-22:00',
         '22:00-23:00',
         '23:00-24:00']
columns=['data','codice spira'] + keys

In [28]:
rdf = dataset_df[columns]

In [29]:
rdf.head()

Unnamed: 0,data,codice spira,00:00-01:00,01:00-02:00,02:00-03:00,03:00-04:00,04:00-05:00,05:00-06:00,06:00-07:00,07:00-08:00,...,14:00-15:00,15:00-16:00,16:00-17:00,17:00-18:00,18:00-19:00,19:00-20:00,20:00-21:00,21:00-22:00,22:00-23:00,23:00-24:00
0,2023-03-25,0.127 3.89 6 1,97,55,31,16,15,38,77,194,...,311,384,373,452,668,535,326,177,162,122
1,2023-03-25,0.127 3.89 8 1,52,25,12,14,5,13,24,69,...,123,113,118,119,135,138,72,63,68,64
2,2023-03-25,0.127 3.90 6 1,167,94,69,37,42,62,85,151,...,376,398,431,441,487,451,354,193,193,174
3,2023-03-25,0.127 3.93 4 1,5,3,0,5,5,7,10,26,...,31,27,32,44,125,90,33,13,15,13
4,2023-03-25,0.127 3.93 6 1,234,136,95,85,55,103,141,287,...,607,643,738,766,854,667,504,309,295,310


In [30]:
tdf = rdf.head()

In [31]:
key = '00:00-01:00'

In [32]:
k = key.split("-")[0]

In [33]:
xdf = tdf[['data','codice spira',key]]

In [34]:
xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
xdf['value'] = xdf[key]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['value'] = xdf[key]


In [35]:
xdf

Unnamed: 0,data,codice spira,00:00-01:00,time,value
0,2023-03-25,0.127 3.89 6 1,97,2023-03-25 00:00,97
1,2023-03-25,0.127 3.89 8 1,52,2023-03-25 00:00,52
2,2023-03-25,0.127 3.90 6 1,167,2023-03-25 00:00,167
3,2023-03-25,0.127 3.93 4 1,5,2023-03-25 00:00,5
4,2023-03-25,0.127 3.93 6 1,234,2023-03-25 00:00,234


In [36]:
vdf = xdf[['time','codice spira','value']]

In [37]:
vdf

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,0.127 3.89 6 1,97
1,2023-03-25 00:00,0.127 3.89 8 1,52
2,2023-03-25 00:00,0.127 3.90 6 1,167
3,2023-03-25 00:00,0.127 3.93 4 1,5
4,2023-03-25 00:00,0.127 3.93 6 1,234


In [38]:
ls = []
for key in keys:
    k = key.split("-")[0]
    xdf = rdf[['data','codice spira',key]]
    xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
    xdf['value'] = xdf[key]
    vdf = xdf[['time','codice spira','value']]
    ls.append(vdf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['value'] = xdf[key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [39]:
edf = pd.concat(ls)

In [40]:
edf

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,0.127 3.89 6 1,97
1,2023-03-25 00:00,0.127 3.89 8 1,52
2,2023-03-25 00:00,0.127 3.90 6 1,167
3,2023-03-25 00:00,0.127 3.93 4 1,5
4,2023-03-25 00:00,0.127 3.93 6 1,234
...,...,...,...
313062,2023-11-24 23:00,0.127 3.56 2 2,25
313063,2023-11-24 23:00,0.127 3.56 4 1,260
313064,2023-11-24 23:00,0.127 3.59 4 1,179
313065,2023-11-24 23:00,0.127 3.64 6 1,608


write function to process and save in store

In [41]:
%%writefile "src/process-measures.py"

from digitalhub_runtime_python import handler
import pandas as pd

KEYS = ['00:00-01:00', '01:00-02:00', '02:00-03:00', '03:00-04:00', '04:00-05:00', '05:00-06:00', '06:00-07:00', '07:00-08:00', '08:00-09:00', '09:00-10:00', '10:00-11:00', '11:00-12:00', '12:00-13:00', '13:00-14:00', '14:00-15:00', '15:00-16:00', '16:00-17:00', '17:00-18:00', '18:00-19:00', '19:00-20:00', '20:00-21:00', '21:00-22:00', '22:00-23:00', '23:00-24:00']
COLUMNS=['data','codice spira']

@handler(outputs=["dataset-measures"])
def process(project, di):
    df = di.as_df()
    rdf = df[COLUMNS+KEYS]
    ls = []
    for key in KEYS:
        k = key.split("-")[0]
        xdf = rdf[COLUMNS + [key]]
        xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
        xdf['value'] = xdf[key]
        ls.append(xdf[['time','codice spira','value']])
    edf = pd.concat(ls)
    return edf


Writing src/process-measures.py


register the function

In [42]:
process_measures_func = project.new_function(
                         name="process-measures",
                         kind="python",
                         python_version="PYTHON3_9",
                         source={"source": "src/process-measures.py", "handler": "process"})

and execute (locally)

In [44]:
process_measures_run = process_measures_func.run(action="job", inputs={'di': dataset_di.key}, outputs={'dataset-measures': 'dataset-measures'}, local_execution=True)

2024-07-12 09:32:26,257 - INFO - Validating task.
2024-07-12 09:32:26,258 - INFO - Validating run.
2024-07-12 09:32:26,258 - INFO - Starting task.
2024-07-12 09:32:26,259 - INFO - Configuring execution.
2024-07-12 09:32:26,260 - INFO - Composing function arguments.
2024-07-12 09:32:26,310 - INFO - Executing run.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['time'] = xdf.data.apply(lambda x: x+' ' +k)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xdf['value'] = xdf[key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

inspect the resulting data artifact 

In [45]:
measures_di = project.get_dataitem('dataset-measures')
measures_df = measures_di.as_df()
measures_df.head()

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,0.127 3.89 6 1,97
1,2023-03-25 00:00,0.127 3.89 8 1,52
2,2023-03-25 00:00,0.127 3.90 6 1,167
3,2023-03-25 00:00,0.127 3.93 4 1,5
4,2023-03-25 00:00,0.127 3.93 6 1,234


# Workflow
Define a simple workflow which will execute all the ETL steps by composing functions

In [46]:
%%writefile "src/pipeline.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def pipeline(url):
    with pipeline_context() as pc:
        downloader = pc.step(
            name="download-data",
            function="download-data",
            action="job",
            inputs={"url": url},
            outputs={"dataset": "dataset"},
        )

        process_spire = pc.step(
            name="process-spire",
            function="process-spire",
            action="job",
            inputs={"di": downloader.outputs["dataset"]}
        )

        process_measures = pc.step(
            name="process-measures",
            function="process-measures",
            action="job",
            inputs={"di": downloader.outputs["dataset"]}
        )


Writing src/pipeline.py


register the workflow 

In [49]:
workflow = project.new_workflow(name="pipeline", kind="kfp", source={"source": "src/pipeline.py"}, handler="pipeline")

and run (remote)

In [50]:
workflow.run(parameters={"url": di.key})

{'project': 'demo-etl', 'id': '9c20debc-d44e-48ec-bb0a-0cc9f764dfe7', 'kind': 'kfp+run', 'key': 'store://demo-etl/runs/kfp+run/9c20debc-d44e-48ec-bb0a-0cc9f764dfe7', 'metadata': {'project': 'demo-etl', 'name': '9c20debc-d44e-48ec-bb0a-0cc9f764dfe7', 'created': '2024-07-12T09:36:06.993Z', 'updated': '2024-07-12T09:36:07.001Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'task': 'kfp+pipeline://demo-etl/pipeline:3cb359bd-7208-4618-9532-2e21345ac2a1', 'local_execution': False, 'source': {'source': 'src/pipeline.py', 'handler': 'pipeline', 'base64': 'CmZyb20gZGlnaXRhbGh1Yl9ydW50aW1lX2tmcC5kc2wgaW1wb3J0IHBpcGVsaW5lX2NvbnRleHQKCmRlZiBwaXBlbGluZSh1cmwpOgogICAgd2l0aCBwaXBlbGluZV9jb250ZXh0KCkgYXMgcGM6CiAgICAgICAgZG93bmxvYWRlciA9IHBjLnN0ZXAoCiAgICAgICAgICAgIG5hbWU9ImRvd25sb2FkLWRhdGEiLAogICAgICAgICAgICBmdW5jdGlvbj0iZG93bmxvYWQtZGF0YSIsCiAgICAgICAgICAgIGFjdGlvbj0iam9iIiwKICAgICAgICAgICAgaW5wdXRzPXsidXJsIjogdXJsfSwKICAgICAgICAgICAgb3V0cHV0cz17ImRhdGFzZXQiOiAiZGF0YXNldCJ

# 3. Expose datasets as API
Define a simple api to expose data as REST.


In [75]:
%%writefile 'src/api.py'

import pandas as pd
import os


def init_context(context):
    di = context.project.get_dataitem('dataset-measures')
    df = di.as_df()
    setattr(context, "df", df)

def handler(context, event):
    df = context.df
    
    if df is None:
        return ""

    # mock REST api
    method = event.method
    path = event.path
    fields = event.fields

    id = False

    # pagination
    page = 0
    pageSize = 50

    if "page" in fields:
        page = int(fields['page'])

    if "size" in fields:
        pageSize = int(fields['size'])

    if page < 0:
        page = 0

    if pageSize < 1:
        pageSize = 1

    if pageSize > 100:
        pageSize = 100

    start = page * pageSize
    end = start + pageSize
    total = len(df)

    if end > total:
        end = total

    ds = df.iloc[start:end]
    json = ds.to_json(orient="records")

    res = {"data": json, "page": page, "size": pageSize, "total": total}

    return res

Overwriting src/api.py


register the function

In [79]:
api_func = project.new_function(
                         name="api",
                         kind="python",
                         python_version="PYTHON3_9",
                         source={"source": "src/api.py", "handler": "handler", "init_function": "init_context"})

Deploy the function (perform ``serve`` action):

In [80]:
run_serve_model = api_func.run(action="serve")

Wait for the function to complete the deploymed:

In [107]:
run_serve_model.refresh()

{'project': 'demo-etl', 'id': 'd2d1a67c-3814-45f1-b9d6-5790ea82be79', 'kind': 'python+run', 'key': 'store://demo-etl/runs/python+run/d2d1a67c-3814-45f1-b9d6-5790ea82be79', 'metadata': {'project': 'demo-etl', 'name': 'd2d1a67c-3814-45f1-b9d6-5790ea82be79', 'created': '2024-07-12T11:30:01.542Z', 'updated': '2024-07-12T11:47:57.568Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'task': 'python+serve://demo-etl/api:acab3cf7-8e4a-4d66-85a5-e9d9d46462df', 'local_execution': False, 'source': {'source': 'src/api.py', 'handler': 'handler', 'base64': 'CmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIGRpID0gY29udGV4dC5wcm9qZWN0LmdldF9kYXRhaXRlbShlbnRpdHlfbmFtZT0nZGF0YXNldC1tZWFzdXJlcycpCiAgICBkZiA9IGRpLmFzX2RmKCkKICAgIHNldGF0dHIoY29udGV4dCwgImRmIiwgZGYpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCdSZWFkaW5nIGRhdGFzZXQtbWVhc3VyZXMnLCBkZi5oZWFkKCkpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICBkZiA9IGNvbnRleHQuZGYKICAgIAogICAgaWYgZGYgaXMgTm9uZToK

Get the service instance name:

In [108]:
SERVICE_URL = f"http://{run_serve_model.status.to_dict()['service']['url']}"
SERVICE_URL

'http://s-python-python-serve-d2d1a67c-3814-45f1-b9d6-5790ea82be79:8080'

Test the function exposed:

In [110]:
with requests.get(f'{SERVICE_URL}/?page=5&size=10') as r:
    res = r.json()

In [111]:
rdf = pd.read_json(res['data'], orient='records')

  rdf = pd.read_json(res['data'], orient='records')


In [112]:
rdf.head()

Unnamed: 0,time,codice spira,value
0,2023-03-25 00:00,1.8 1.11 6 1,100
1,2023-03-25 00:00,1.8 1.12 2 1,140
2,2023-03-25 00:00,1.8 1.18 6 1,106
3,2023-03-25 00:00,1.9 1.8 2 1,146
4,2023-03-25 00:00,1.11 1.8 2 1,155


# 4. Visualize data with Streamlit
We can use Streamlit, a library to build web apps via Python scripts, to visualize data in a graph.

Write data to file:

In [None]:
with open("result.json", "w") as file:
    file.write(res['data'])

Write the script for Streamlit:

In [None]:
%%writefile 'streamlit-app.py'

import pandas as pd
import streamlit as st

rdf = pd.read_json("result.json", orient="records")

# Replace colons in column names as they can cause issues with Streamlit
rdf.columns = rdf.columns.str.replace(":", "")

st.write("""My data""")
st.line_chart(rdf, x="codice spira", y="1200-1300")

The following will install Streamlit in the workspace. It's actually not code: the `!` at the beginning tells Jupyter to run the contents as a shell command.

In [None]:
!pip install streamlit

The following will start hosting the Streamlit web app, so the cell will remain running. The `browser.gatherUsageStats` flag is set to `false` because, otherwise, Streamlit will automatically gather usage stats and print a warning about it.

In [None]:
!streamlit run streamlit-app.py --browser.gatherUsageStats false

Next, go to your Coder instance and access the Jupyter workspace you've been using. Click on *Ports*, type `8501`, then click the button next to it. It will open a tab to the Streamlit app, where you can visualize data!

Don't forget to stop the above code cell, to stop the app.