# Preparing data files

Preparing data files according to the [data flowchart]()

In [None]:
from onehealth_db import inout
from onehealth_db import preprocess
from pathlib import Path

## Download ERA5-Land data

To download ERA5-Land data using CDS's API:
* Select the target dataset, e.g. ERA5-Land monthly averaged data from 1950 to present
* Go to tab `Download` of the dataset and select the data variables, time range, geographical area, etc. that you want to download
* At the end of the page, click on `Show API request code` and take note of the following information:
    * `dataset`: the name of the dataset
    * `request`: a dictionary that summarizes your download request
* Replace the values of `dataset` and `request` in the cell below accordingly

In [None]:
# Replace `dataset` and `request` with the values shown by
# "Show API request code" on the dataset's download page.
dataset = "reanalysis-era5-land-monthly-means"
request = {
    "product_type": ["monthly_averaged_reanalysis"],
    "variable": ["2m_temperature", "total_precipitation"],
    # every year from 2020 through 2025, as strings
    "year": [str(year) for year in range(2020, 2026)],
    # all twelve months as zero-padded strings: "01" ... "12"
    "month": [f"{month:02d}" for month in range(1, 13)],
    "time": ["00:00"],
    "data_format": "netcdf",
    "download_format": "unarchived",
}

In [None]:
# Folder the data file is stored in (relative path); point this at your
# own data folder if needed.
data_folder = Path("..") / "data" / "in"

In [None]:
# .get() yields None when the request has no "data_format" entry
data_format = request.get("data_format")

# file name for the downloaded data, derived from the request parameters
file_name = inout.get_filename(
    ds_name=dataset,
    data_format=data_format,
    years=request["year"],
    months=request["month"],
    has_area="area" in request,  # membership test is already a bool
    base_name="era5_data",
    variable=request["variable"],
)

# full path of the data file inside the data folder
output_file = data_folder / file_name

In [None]:
# Download the data only when the target file is not already on disk
if output_file.exists():
    print(f"Data already exists at {output_file}")
else:
    print("Downloading data...")
    inout.download_data(output_file, dataset, request)

## Load settings

First, we need to load the default settings, which set up the preprocessing steps.

## Preprocess data

In [None]:
# TODO: the commented-out code below comes from inout.py and will be refactored later
#     celsius_file_name = file_name.split(".")[0] + "_celsius.nc"
#     output_celsius_file = data_folder / celsius_file_name
#     with xr.open_dataset(output_file) as ds:
#         # adjust longitude
#         print("Adjusting longitude from 0-360 to -180-180...")
#         ds = adjust_longitude_360_to_180(ds, inplace=True)

#         print("Converting temperature to Celsius...")
#         # convert temperature to Celsius
#         ds = convert_to_celsius_with_attributes(
#             ds, limited_area="area" in request, inplace=True
#         )
#         # and save to a new NetCDF file
#         encoding = {
#             var: {
#                 "zlib": True,  # Enable compression
#                 "complevel": 1,  # Compression level (1–9)
#                 "dtype": "float32",  # Use float32 to match original
#             }
#             for var in ds.data_vars
#         }
#         save_to_netcdf(ds, output_celsius_file, encoding)