# Notebook to explore DuckDB
DuckDB files are created from dataframes obtained by reading netCDF files. These netCDF files can be found under `data/in`.

By default, the DuckDB files are stored in `data/`.

In [None]:
from pathlib import Path
import xarray as xr
from onehealth_db import database
import duckdb

In [None]:
# All sample netCDF inputs sit in one folder, given relative to this notebook.
# Each variable name mirrors the suffix of the file it points to.
folder_path = Path("../data/in")

# single-month files (2025-03)
f_no_area_before_celsius = folder_path.joinpath("era5_data_2025_03_monthly.nc")
f_no_area_after_celsius = folder_path.joinpath(
    "era5_data_2025_03_monthly_celsius.nc"
)
f_area_before_celsius = folder_path.joinpath("era5_data_2025_03_monthly_area.nc")
f_area_after_celsius = folder_path.joinpath(
    "era5_data_2025_03_monthly_area_celsius.nc"
)

# multi-year files (2022-2024)
f_multi_before_celsicus = folder_path.joinpath(
    "era5_data_2022_2023_2024_all_monthly.nc"
)
f_multi_after_celsicus = folder_path.joinpath(
    "era5_data_2022_2023_2024_all_monthly_celsius.nc"
)

# two-month file (2024-01 and 2024-02)
f_2024_area_after_celsius = folder_path.joinpath(
    "era5_data_2024_01_02_monthly_area_celsius.nc"
)

The following cells explore the DuckDB database created from dataframes that were built from netCDF files.

In [None]:
# Convert one netCDF file to a dataframe using the project's database helper.
# NOTE(review): columns="all" presumably keeps every variable in the file --
# confirm against database.file_to_dataframe.
df = database.file_to_dataframe(f_2024_area_after_celsius, columns="all")
df  # bare expression: rendered as the cell output

In [None]:
# keep only the time stamp, the coordinates and the t2m column
selected_columns = ["valid_time", "latitude", "longitude", "t2m"]
out_data = df[selected_columns]
out_data

In [None]:
# Save the dataframe to a DuckDB file through the project's import helper.
# By default, database file is data/onehealth.db, table name is onehealth.
# data/onehealth.db will be deleted if you run test_import_data_none_path in test_database.py
# NOTE(review): the path below (onehealth.duckdb) differs from the default
# named above, so this notebook writes to its own file -- confirm intended.
database_file_path = Path("../data/onehealth.duckdb")
table_name = "onehealth"
database.import_data(out_data, database_file_path, table_name)

In [None]:
# peek at the first ten t2m values stored in the table
with duckdb.connect(database_file_path) as con:
    preview_query = f"""
        SELECT t2m
        FROM {table_name}
        LIMIT 10
    """
    preview_relation = con.sql(preview_query)
    # materialise as a dataframe while the connection is still open
    t2m_data = preview_relation.df()
t2m_data

In [None]:
# list every column of the table together with its DuckDB data type
with duckdb.connect(database_file_path) as con:
    schema_query = f"""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = '{table_name}'
    """
    schema_relation = con.sql(schema_query)
    # materialise as a dataframe while the connection is still open
    data_types = schema_relation.df()
data_types

In [None]:
# derive a second table that adds a point geometry column, then preview it
with duckdb.connect(database_file_path) as con:
    # the spatial extension has to be installed and loaded before ST_Point
    for setup_statement in ("INSTALL spatial;", "LOAD spatial;"):
        con.execute(setup_statement)

    # <table_name>_geom = every original column plus geom
    create_geom_table = f"""
        CREATE  OR REPLACE TABLE {table_name}_geom AS
        SELECT *,
            ST_Point(longitude, latitude) AS geom
        FROM {table_name}
    """
    con.execute(create_geom_table)

    preview_geom = f"""
        SELECT *
        FROM {table_name}_geom
        LIMIT 10
    """
    geo_data = con.sql(preview_geom).df()
geo_data

In [None]:
# use spatial functions: keep rows within a given distance of the point (0, 0)
#
# ST_Distance on plain geometries is a planar distance in the units of the
# coordinates (degrees for lon/lat points), so comparing it with 1000000 can
# never exclude a row. ST_Distance_Sphere returns a great-circle distance in
# metres, which makes the threshold meaningful.
max_distance_m = 1_000_000  # 1000 km
with duckdb.connect(database_file_path) as con:
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")
    # geom is stored as (longitude, latitude); per the DuckDB spatial docs,
    # ST_Distance_Sphere expects (latitude, longitude), hence the flip.
    spatial_data = con.sql(f"""
        SELECT *
        FROM {table_name}_geom
        WHERE ST_Distance_Sphere(
            ST_FlipCoordinates(geom), ST_Point(0, 0)
        ) < {max_distance_m}
        LIMIT 10
    """).df()
spatial_data