Skip to content

Commit

Permalink
Merge pull request #116 from roocs/enable-kerchunk
Browse files Browse the repository at this point in the history
Enable kerchunk
  • Loading branch information
cehbrecht committed Apr 16, 2024
2 parents e4d1d8a + 424b71f commit 606a720
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 31 deletions.
61 changes: 44 additions & 17 deletions Dockerfile
Expand Up @@ -27,29 +27,39 @@ FROM ubuntu:20.04

SHELL ["/bin/bash", "-c"]

ENV BASH_ENV=~/.bashrc \
MAMBA_ROOT_PREFIX=/srv/conda \
PATH=$PATH:/srv/conda/envs/daops/bin

ENV BASH_ENV=~/.bashrc \
PATH=$PATH:/srv/conda/envs/daops/bin:/srv/conda/bin \
MINICONDA_PREFIX=/srv/conda
# MAMBA_ROOT_PREFIX=/srv/conda \

# ==== Install apt-packages and micromamba ====

RUN apt-get update && \
apt-get install -y ca-certificates ttf-dejavu file wget bash bzip2 git && \
wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba --strip-components=1 && \
./micromamba shell init -s bash -p ~/micromamba && \
apt-get clean autoremove --yes && \
cp ./micromamba /usr/bin && \
rm -fr /srv/conda/pkgs
# Build-time default for the Miniconda install prefix. An ARG name must be a
# plain identifier (no leading "$"); the ENV of the same name still takes
# precedence in later RUN instructions.
ARG MINICONDA_PREFIX=/srv/conda

RUN apt-get update && \
apt-get install -y ca-certificates ttf-dejavu file wget bash bzip2 git

#RUN curl -L https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba --strip-components=1 && \
# ./micromamba shell init -s bash -p ~/micromamba && \
# apt-get clean autoremove --yes && \
# cp ./micromamba /usr/bin && \
# rm -fr /srv/conda/pkgs

ARG mconda=Miniconda3-py311_23.10.0-1-Linux-x86_64.sh
RUN wget https://repo.anaconda.com/miniconda/$mconda && \
bash ./$mconda -b -p $MINICONDA_PREFIX && \
apt-get clean autoremove --yes && \
rm -fr $MINICONDA_PREFIX/pkgs

# wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba --strip-components=1 && \


# ==== Set up conda environment from yml file ====

ARG tmp_env=/tmp/environment.yml
ADD environment.yml $tmp_env
RUN micromamba create -f $tmp_env && \
rm -fr $tmp_env /srv/conda/pkgs

RUN conda env create -f $tmp_env && \
rm -fr $MINICONDA_PREFIX/pkgs

# ==== Clone the data repo ====

Expand All @@ -61,7 +71,6 @@ RUN git clone $data_repo_url $data_dir && \
git checkout $data_repo_branch && \
rm -fr .git


# ==== Set up the roocs.ini file with paths pointing to the data repo ====
# ==== and ensure that ROOCS_CONFIG environment variable points to it ====

Expand All @@ -79,14 +88,32 @@ ARG tmp_install_dir=/tmp/daops-install
RUN mkdir $tmp_install_dir
COPY . $tmp_install_dir
RUN cd $tmp_install_dir && \
/srv/conda/envs/daops/bin/python setup.py install && \
$MINICONDA_PREFIX/envs/daops/bin/python setup.py install && \
rm -fr $tmp_install_dir && \
echo "export USE_PYGEOS=0" >> /root/.bashrc

# ==== Activate the env and install packages with pip ====

#RUN source activate $MINICONDA_PREFIX/bin/conda && \
# conda activate daops && \
RUN $MINICONDA_PREFIX/envs/daops/bin/pip uninstall roocs_utils -y && \
$MINICONDA_PREFIX/envs/daops/bin/pip install \
roocs-utils@git+https://github.com/roocs/roocs-utils.git@master#egg=roocs_utils

# ==== Create a directory that we can bind-mount ====
RUN mkdir /outputs


# ==== Some tidying up (NB further apt-install not possible after this) ====

RUN rm -fr /var/lib/{apt,dpkg,cache,log}

# ==== Run a test script ====
#COPY ./kc-script.py /tmp/kc-script.py
#RUN $MINICONDA_PREFIX/envs/daops/bin/python /tmp/kc-script.py

# ==== Test run daops at the command-line ====
#RUN ROOCS_CONFIG=$config_file $MINICONDA_PREFIX/envs/daops/bin/daops subset --area 0,-10,120,40 \
# --time 2085-01-16/2120-12-16 --levels / --time-components year:2090,2091,2092 \
# --output-dir /tmp --file-namer simple cmip5.output1.MOHC.HadGEM2-ES.rcp85.mon.atmos.Amon.r1i1p1.latest.tas


3 changes: 2 additions & 1 deletion HISTORY.rst
Expand Up @@ -13,7 +13,8 @@ New Features
^^^^^^^^^^^^

* Add clisops.ops.average_shape to daops.ops.average

* Added support for opening `kerchunk` files.
* Updated `Dockerfile` and `app-package.cwl` files for use with ADES.

v0.10.0 (2023-11-27)
-------------------
Expand Down
39 changes: 32 additions & 7 deletions app-package.cwl
Expand Up @@ -15,9 +15,18 @@ $graph:
- class: ScatterFeatureRequirement

inputs:
area:
doc: area
type: string
time:
doc: time
type: string
time_components:
doc: time_components
type: string
levels:
doc: levels
type: string
collection:
doc: collection
type: string
Expand All @@ -38,7 +47,10 @@ $graph:
subset:
run: "#clt"
in:
area: area
time: time
time_components: time_components
levels: levels
collection: collection
file_namer: file_namer
output_dir: output_dir
Expand All @@ -53,37 +65,50 @@ $graph:
envDef:
ROOCS_CONFIG: /root/roocs.ini
# PATH: /srv/conda/envs/env_crop/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# PYTHONPATH: /home/jovyan/ogc-eo-application-package-hands-on/water-bodies/command-line-tools/crop:/home/jovyan/water-bodies/command-line-tools/crop:/workspaces/vscode-binder/command-line-tools/crop
# PROJ_LIB: /srv/conda/envs/env_crop/share/proj/
# ResourceRequirement:
# coresMax: 1
# ramMax: 500Mb
hints:
DockerRequirement:
dockerPull: alaniwi/daops:latest
dockerPull: agstephens/daops-kerchunk:v0.4
baseCommand: ["daops", "subset"]

arguments: []
inputs:
area:
type: string
inputBinding:
prefix: --area
position: 1
time:
type: string
inputBinding:
prefix: --time
position: 1
position: 2
time_components:
type: string
inputBinding:
prefix: --time-components
position: 3
levels:
type: string
inputBinding:
prefix: --levels
position: 4
file_namer:
type: string
inputBinding:
prefix: --file-namer
position: 2
position: 5
output_dir:
type: string
inputBinding:
prefix: --output-dir
position: 3
position: 6
collection:
type: string
inputBinding:
position: 4
position: 7
outputs:
results:
outputBinding:
Expand Down
13 changes: 10 additions & 3 deletions daops/utils/consolidate.py
Expand Up @@ -11,7 +11,7 @@
from roocs_utils.project_utils import get_project_base_dir
from roocs_utils.project_utils import get_project_name
from roocs_utils.utils.file_utils import FileMapper
from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset
from roocs_utils.xarray_utils.xarray_utils import is_kerchunk_file, open_xr_dataset

from daops.catalog import get_catalog
from daops.utils.core import _wrap_sequence
Expand Down Expand Up @@ -124,7 +124,7 @@ def consolidate(collection, **kwargs):

collection = _wrap_sequence(collection.value)

if not isinstance(collection[0], FileMapper):
if not isinstance(collection[0], FileMapper) and not is_kerchunk_file(collection[0]):
project = get_project_name(collection[0])
catalog = get_catalog(project)

Expand All @@ -133,7 +133,13 @@ def consolidate(collection, **kwargs):
time_param = kwargs.get("time")

for dset in collection:
if not catalog:

# If dset looks like a Kerchunk file then pass it straight through
if is_kerchunk_file(dset):
filtered_refs[dset] = dset

# If no intake catalog is being used to constrain the data access
elif not catalog:
file_paths = dset_to_filepaths(dset, force=True)

if time_param:
Expand All @@ -145,6 +151,7 @@ def consolidate(collection, **kwargs):

filtered_refs[dset] = file_paths

# If an intake catalog is being used to constrain the data access
else:
ds_id = derive_ds_id(dset)
result = catalog.search(collection=ds_id, time=time_param)
Expand Down
4 changes: 2 additions & 2 deletions daops/utils/core.py
Expand Up @@ -3,7 +3,7 @@
import xarray as xr
from elasticsearch import exceptions
from loguru import logger
from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset
from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset, is_kerchunk_file

from .base_lookup import Lookup
from daops import CONFIG
Expand Down Expand Up @@ -74,7 +74,7 @@ def open_dataset(ds_id, file_paths, apply_fixes=True):
:param apply_fixes: Boolean. If True fixes will be applied to datasets if needed. Default is True.
:return: xarray Dataset with fixes applied to the data.
"""
if apply_fixes:
if apply_fixes and not is_kerchunk_file(ds_id):
fix = fixer.Fixer(ds_id)
if fix.pre_processor:
for pre_process in fix.pre_processors:
Expand Down
3 changes: 3 additions & 0 deletions environment.yml
Expand Up @@ -18,5 +18,8 @@ dependencies:
- loguru >=0.5.3
# catalog
- intake >=0.7.0,<2.0
# to support kerchunk
- fsspec
- aiohttp
- zarr==2.13.3
- zstandard
4 changes: 3 additions & 1 deletion requirements.txt
Expand Up @@ -13,5 +13,7 @@ roocs_grids>=0.1.2
loguru>=0.5.3
# catalog
intake>=0.7.0,<2.0
fsspec
# to support kerchunk
aiohttp
fsspec
zarr==2.13.3
3 changes: 3 additions & 0 deletions tests/_common.py
Expand Up @@ -97,3 +97,6 @@ def write_roocs_cfg():
MINI_ESGF_MASTER_DIR,
"test_data/badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s2004-r3i1p1f2/Amon/pr/gn/v20200417/pr_Amon_HadGEM3-GC31-MM_dcppA-hindcast_s2004-r3i1p1f2_gn_200411-200412.nc",
).as_posix()

# HTTPS URL of a JSON file used as the Kerchunk input for the online subset
# test (presumably a Kerchunk reference/index for one CMIP6 tasmax dataset).
CMIP6_KERCHUNK_HTTPS_OPEN_JSON = (
    "https://gws-access.jasmin.ac.uk/public/cmip6_prep/eodh-eocis/kc-indexes-cmip6-http-v1/"
    "CMIP6.CMIP.MOHC.UKESM1-1-LL.1pctCO2.r1i1p1f2.Amon.tasmax.gn.v20220513.json"
)
17 changes: 17 additions & 0 deletions tests/test_operations/test_subset.py
Expand Up @@ -22,6 +22,7 @@
from tests._common import CMIP6_DAY
from tests._common import CMIP6_MONTH
from tests._common import MINI_ESGF_MASTER_DIR
from tests._common import CMIP6_KERCHUNK_HTTPS_OPEN_JSON

CMIP5_IDS = [
"cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga",
Expand Down Expand Up @@ -93,6 +94,22 @@ def test_subset_t(tmpdir, load_esgf_test_data):
assert ds.time.shape == (433,)


@pytest.mark.online
def test_subset_t_kerchunk(tmpdir):
    """Subset the Kerchunk JSON collection by time and area and check the output.

    The collection is addressed by an HTTPS URL, hence the ``online`` marker.
    """
    period = time_interval("1948-01-16", "1952-12-16")
    region = (0, -10, 120, 40)

    outcome = subset(
        CMIP6_KERCHUNK_HTTPS_OPEN_JSON,
        time=period,
        area=region,
        output_dir=tmpdir,
        file_namer="simple",
    )
    _check_output_nc(outcome)

    # Inspect the first output file written by the operation.
    first_uri = outcome.file_uris[0]
    dataset = xr.open_dataset(first_uri, use_cftime=True)

    assert dataset.time.shape == (60,)
    assert dataset.tasmax.shape == (60, 40, 64)

    peak = float(dataset.tasmax.max())
    assert np.isclose(peak, 327.24411011)


@pytest.mark.online
def test_subset_no_collection(tmpdir):
with pytest.raises(TypeError):
Expand Down

0 comments on commit 606a720

Please sign in to comment.