# Inserting data into the database and estimating execution time

In [1]:
import os
import time
from pathlib import Path

import xarray as xr

from onehealth_db import postgresql_database as db

### Set up necessary variables

In [2]:
# PostgreSQL database URL
# Taken from the ONEHEALTH_DB_URL environment variable when it is set, so that
# real credentials never have to be written into the notebook. The fallback is
# the local development URL this notebook has always used.
db_url = os.environ.get(
    "ONEHEALTH_DB_URL",
    "postgresql+psycopg2://postgres:postgres@localhost:5432/postgres",
)
# initialize the database
# WARNING: with replace=True the recorded output of this cell reports
# "All tables dropped." before the tables are re-created, so only point db_url
# at a database whose contents are disposable.
engine = db.initialize_database(db_url, replace=True)

PostGIS extension installed.
All tables dropped.
All tables created.
Database initialized successfully.


In [3]:
# per-step execution times (seconds); filled in after all inserts have run
run_time = dict()

# variable types to register, written as (name, unit, description) and expanded
# into the list-of-dicts layout that is handed to db.insert_var_types
var_types = [
    {"name": name, "unit": unit, "description": description}
    for name, unit, description in (
        ("t2m", "Celsius", "2m temperature"),
        ("tp", "mm", "Total precipitation"),
        ("total-population", "1", "Total population"),
    )
]

### Add data into the database

In [4]:
# start recording time
# t0 is the wall-clock reference (seconds, from time.time()) that the timing
# cell subtracts from the first checkpoint and from t_end.
# Run this cell immediately before the insertion cell: any pause between the two
# is counted towards the NUTS definition step and the total execution time.
t0 = time.time()

In [5]:
# Every step below is followed by a time.time() checkpoint; the timing cell turns
# consecutive checkpoints into per-step durations, so the statement order here
# must not be changed.

# NUTS region definitions, read from the shapefile in the input folder
data_in = Path("../data/in")
shapefile_path = data_in / "NUTS_RG_20M_2024_4326.shp"
db.insert_nuts_def(engine, shapefile_path)
t_nuts_def = time.time()

# variable types declared earlier in the notebook
db.insert_var_types(engine, var_types)
t_var_type = time.time()

# input datasets: processed ERA5-Land file (output folder) and
# ISIMIP population file (input folder)
data_out = Path("../data/out")
era5_land_path = data_out / "era5_data_2020_to_2025_all_2t_tp_monthly_celsius_mm_resampled_05degree_trim.nc"
isimip_path = data_in / "population_histsoc_30arcmin_annual_1901_2021_renamed.nc"

# chunk sizes applied to both datasets: 1 along time, 360 x 720 along
# latitude / longitude
chunk_sizes = {"time": 1, "latitude": 360, "longitude": 720}

with xr.open_dataset(era5_land_path, chunks={}) as era5_ds:
    # rechunk the dataset
    era5_ds = era5_ds.chunk(chunk_sizes)

    # grid points come from the ERA5-Land latitude / longitude coordinates
    db.insert_grid_points(
        engine,
        latitudes=era5_ds.latitude.values,
        longitudes=era5_ds.longitude.values,
    )
    t_grid_point = time.time()

    # time points come from the ERA5-Land time coordinate
    db.insert_time_points(engine, time_point_data=era5_ds.time.values)
    t_time_point = time.time()

    # ID lookups for grid points, time points and variable types
    grid_id_map, time_id_map, var_type_id_map = db.get_id_maps(engine)
    t_get_id_map = time.time()
    id_maps = (grid_id_map, time_id_map, var_type_id_map)

    # The value returned by insert_var_values is used by the timing cell as the
    # moment the actual insert began (it splits "prepare" from "inserted").
    # Keep time.time() for the checkpoints so both share one clock
    # (assumes the helper uses time.time() as well - TODO confirm).
    t_start_insert_t2m = db.insert_var_values(engine, era5_ds, "t2m", *id_maps)
    t_inserted_t2m = time.time()
    t_start_insert_tp = db.insert_var_values(engine, era5_ds, "tp", *id_maps)
    t_inserted_tp = time.time()

with xr.open_dataset(isimip_path, chunks={}) as isimip_ds:
    # rechunk the dataset
    isimip_ds = isimip_ds.chunk(chunk_sizes)

    # population data
    # NOTE(review): only the ERA5-Land time points were inserted above and the
    # ID maps were fetched before this dataset was opened, so the population
    # time axis is never registered here. The recorded run shows this insert
    # taking ~0.01 s versus ~370 s for each ERA5 variable - verify that
    # population rows are actually being written.
    t_start_insert_popu = db.insert_var_values(engine, isimip_ds, "total-population", *id_maps)
    t_inserted_popu = time.time()

t_end = time.time()

Variable types inserted.
Grid points inserted.
Time points inserted.
Start inserting t2m values...
Values of t2m inserted.
Start inserting tp values...
Values of tp inserted.
Start inserting total-population values...
Values of total-population inserted.


In [6]:
# calculate execution time
# Ordered checkpoints: (step name, timestamp taken when that step finished).
checkpoints = [
    ("nuts_def", t_nuts_def),
    ("var_type", t_var_type),
    ("grid_point", t_grid_point),
    ("time_point", t_time_point),
    ("get_id_map", t_get_id_map),
    ("prepare_insert_t2m", t_start_insert_t2m),
    ("inserted_t2m", t_inserted_t2m),
    ("prepare_insert_tp", t_start_insert_tp),
    ("inserted_tp", t_inserted_tp),
    ("prepare_insert_popu", t_start_insert_popu),
    ("inserted_popu", t_inserted_popu),
]

# Each step starts where the previous one finished; the first one starts at t0.
step_starts = [t0] + [finished for _step, finished in checkpoints[:-1]]
run_time.update(
    (step, finished - started)
    for (step, finished), started in zip(checkpoints, step_starts)
)

# overall wall-clock time from the start marker to the end of the insertion cell
total_time = t_end - t0


In [7]:
# Build the whole timing report first, then write it out in one go.
report_lines = [
    f"NUTS definition data inserted in {run_time['nuts_def']} seconds.",
    f"Variable types inserted in {run_time['var_type']} seconds.",
    f"Grid points inserted in {run_time['grid_point']} seconds.",
    f"Time points inserted in {run_time['time_point']} seconds.",
    f"ID maps retrieved in {run_time['get_id_map']} seconds.",
    f"t2m variable values prepared in {run_time['prepare_insert_t2m']} seconds.",
    f"t2m variable values inserted in {run_time['inserted_t2m']} seconds.",
    f"tp variable values prepared in {run_time['prepare_insert_tp']} seconds.",
    f"tp variable values inserted in {run_time['inserted_tp']} seconds.",
    f"Population variable values prepared in {run_time['prepare_insert_popu']} seconds.",
    f"Population variable values inserted in {run_time['inserted_popu']} seconds.",
    f"Total execution time: {total_time} seconds.",
]
print("\n".join(report_lines))

NUTS definition data inserted in 0.1398172378540039 seconds.
Variable types inserted in 0.004373073577880859 seconds.
Grid points inserted in 5.645850896835327 seconds.
Time points inserted in 0.0037517547607421875 seconds.
ID maps retrieved in 0.5988433361053467 seconds.
t2m variable values prepared in 11.091571807861328 seconds.
t2m variable values inserted in 375.8198022842407 seconds.
tp variable values prepared in 10.657374143600464 seconds.
tp variable values inserted in 367.9908113479614 seconds.
Population variable values prepared in 7.319183349609375 seconds.
Population variable values inserted in 0.01121830940246582 seconds.
Total execution time: 779.2830321788788 seconds.
