Merge pull request #99 from roocs/cli
Added features for using `daops` with EOEPCA:
- command-line interface and tests: `daops/cli.py` and `tests/test_cli.py`
- Docker file: `Dockerfile`
- CWL file: `app-package.cwl`
agstephens committed Apr 20, 2023
2 parents 0271f08 + a4d3585 commit 3282f63
Showing 17 changed files with 1,029 additions and 28 deletions.
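The command-line interface added in this PR is a thin wrapper around the existing `daops.ops.subset.subset` operation (see `daops/cli.py` below). As an illustration only, the sketch that follows shows the equivalent programmatic call using the keyword names that `get_params()` assembles; the dataset id, time window and output directory are the example values from the Dockerfile and are purely illustrative.

# Illustrative sketch: programmatic equivalent of
#   daops subset --time=2010-1-1/2015-1-1 --output-dir /outputs \
#       cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga
# (dataset id and paths are taken from the Dockerfile's example usage below)
from daops.ops.subset import subset

result = subset(
    collection="cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga",
    time="2010-01-01/2015-01-01",   # time window (start/end)
    output_type="netcdf",           # CLI default for --output-format
    output_dir="/outputs",          # where the subsetted files are written
    file_namer="standard",          # CLI default for --file-namer
    apply_fixes=False,              # the CLI always passes this (see get_params below)
)
for uri in result.file_uris:        # same loop as cli.main()
    print(uri)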
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
@@ -25,6 +25,8 @@ jobs:
# pip install flake8 black pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi
# pip install the package so that the command-line unit tests work
pip install --no-deps -e .
# - name: Lint with flake8
# run: flake8 daops tests
# if: matrix.python-version == 3.8
92 changes: 92 additions & 0 deletions Dockerfile
@@ -0,0 +1,92 @@
##=================================================================================
##
## EXAMPLE USAGE
##
## $ docker build -t daops .
## $ mkdir ~/container-outputs
## $ docker run -it \
## --mount type=bind,source=$HOME/container-outputs,target=/outputs \
## daops
##
## # id=cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga
## # path=/root/.mini-esgf-data/test_data/badc/cmip5/data/$(echo $id | tr / .)
## # ncdump -h $path/*.nc | grep UNLIMITED
## time = UNLIMITED ; // (1140 currently)
## # rm /outputs/*.nc
## # daops subset --output-dir /outputs --time=2010-1-1/2015-1-1 $id
## # ncdump -h /outputs/*.nc | grep UNLIMITED
## time = UNLIMITED ; // (60 currently)
## # exit
##
## $ ls ~/container-outputs/
## zostoga_mon_inmcm4_rcp45_r1i1p1_20100116-20141216.nc
##
##=================================================================================

FROM ubuntu:20.04

SHELL ["/bin/bash", "-c"]

ENV BASH_ENV=~/.bashrc \
MAMBA_ROOT_PREFIX=/srv/conda \
PATH=$PATH:/srv/conda/envs/daops/bin


# ==== Install apt-packages and micromamba ====

RUN apt-get update && \
apt-get install -y ca-certificates ttf-dejavu file wget bash bzip2 git && \
wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba --strip-components=1 && \
./micromamba shell init -s bash -p ~/micromamba && \
apt-get clean autoremove --yes && \
cp ./micromamba /usr/bin && \
rm -fr /srv/conda/pkgs


# ==== Set up conda environment from yml file ====

ARG tmp_env=/tmp/environment.yml
ADD environment.yml $tmp_env
RUN micromamba create -f $tmp_env && \
rm -fr $tmp_env /srv/conda/pkgs


# ==== Clone the data repo ====

ARG data_dir=/root/.mini-esgf-data
ARG data_repo_url=https://github.com/roocs/mini-esgf-data
ARG data_repo_branch=master
RUN git clone $data_repo_url $data_dir && \
cd $data_dir && \
git checkout $data_repo_branch && \
rm -fr .git


# ==== Set up the roocs.ini file with paths pointing to the data repo ====
# ==== and ensure that ROOCS_CONFIG environment variable points to it ====

ARG config_file=/root/roocs.ini
ARG config_tmpl=/tmp/roocs.ini.tmpl
COPY roocs.ini.tmpl $config_tmpl
RUN sed "s,DATA_DIR,$data_dir,g" $config_tmpl > $config_file && \
rm $config_tmpl && \
echo "export ROOCS_CONFIG=$config_file" >> /root/.bashrc


# ==== Install the daops app ====

ARG tmp_install_dir=/tmp/daops-install
RUN mkdir $tmp_install_dir
COPY . $tmp_install_dir
RUN cd $tmp_install_dir && \
/srv/conda/envs/daops/bin/python setup.py install && \
rm -fr $tmp_install_dir && \
echo "export USE_PYGEOS=0" >> /root/.bashrc

# ==== Create a directory that we can bind-mount ====
RUN mkdir /outputs


# ==== Some tidying up (NB further apt-install not possible after this) ====

RUN rm -fr /var/lib/{apt,dpkg,cache,log}
129 changes: 129 additions & 0 deletions app-package.cwl
@@ -0,0 +1,129 @@
$graph:

- class: Workflow
  doc: Runs daops subsetting process
  id: daops
  requirements:
  - class: ScatterFeatureRequirement
  inputs:
    area:
      doc: Area
      label: Area
      type: string[]
    time:
      doc: Time
      label: Time
      type: string[]
    time_components:
      doc: Time Components
      label: Time Components
      type: string[]
    level:
      doc: Level
      label: Level
      type: string[]
    output_format:
      doc: Output Format
      label: Output Format
      type: string[]
    file_namer:
      doc: File Namer
      label: File Namer
      type: string[]
    output_dir:
      doc: Output dir
      label: Output dir
      type: string[]
    collection:
      doc: Collection
      label: Collection
      type: string[]
  label: data-aware operations (daops)
  outputs:
  - id: wf_outputs
    outputSource:
    - step_1/results
    type:
      Directory[]

  steps:
    step_1:
      in:
        area: area
        time: time
        time_components: time_components
        level: level
        output_format: output_format
        file_namer: file_namer
        output_dir: output_dir
        collection: collection
      out:
      - results
      run: '#clt'
      scatter: [area, time, time_components, level, output_format, file_namer, output_dir, collection]
      scatterMethod: flat_crossproduct

- baseCommand: daops
  class: CommandLineTool

  id: clt

  arguments:
  - --area
  - valueFrom: $( inputs.area )
  - --time
  - valueFrom: $( inputs.time )
  - --time-components
  - valueFrom: $( inputs.time_components )
  - --levels
  - valueFrom: $( inputs.level )
  - --output-format
  - valueFrom: $( inputs.output_format )
  - --file-namer
  - valueFrom: $( inputs.file_namer )
  - --output-dir
  - valueFrom: $( inputs.output_dir )
  - --collection
  - valueFrom: $( inputs.collection )

  inputs:
    area:
      type: string
    time:
      type: string
    time_components:
      type: string
    level:
      type: string
    output_format:
      type: string
    file_namer:
      type: string
    output_dir:
      type: string
    collection:
      type: string

  outputs:
    results:
      outputBinding:
        glob: .
      type: Directory
  requirements:
    EnvVarRequirement:
      envDef:
        PATH: /bin:/srv/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
    ResourceRequirement: {}
    InlineJavascriptRequirement: {}
    DockerRequirement:
      dockerPull: iwi/daops:0.0.1
  #stderr: std.err
  #stdout: std.out

cwlVersion: v1.0

$namespaces:
  s: https://schema.org/
s:softwareVersion: 0.3.0
schemas:
- http://schema.org/version/9.0/schemaorg-current-http.rdf
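All eight workflow inputs are string arrays and are scattered with scatterMethod: flat_crossproduct, so the daops command-line tool is invoked once per combination of the scattered values. Purely as an illustration of that scatter semantics (the input values below are made up), a Python sketch:

# Illustrative only: flat_crossproduct runs the tool once per combination
# of the scattered input lists (a flattened cross product).
from itertools import product

times = ["2010-01-01/2015-01-01", "2015-01-01/2020-01-01"]  # made-up values
areas = ["0,-90,360,90"]
collections = ["cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga"]

for t, a, c in product(times, areas, collections):
    print("daops subset", "--time", t, "--area", a, c)
# -> 2 x 1 x 1 = 2 invocations of the command-line tool in this sketch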
88 changes: 88 additions & 0 deletions daops/cli.py
@@ -0,0 +1,88 @@
"""Console script for daops."""

__author__ = """Alan Iwi"""
__contact__ = 'alan.iwi@stfc.ac.uk'
__copyright__ = "Copyright 2023 United Kingdom Research and Innovation"
__license__ = "BSD - see LICENSE file in top-level package directory"

import os
import sys
import argparse
import dateutil.parser
import configparser

from daops.ops.subset import subset
from roocs_utils.utils.file_utils import FileMapper

def parse_args():

    parser = argparse.ArgumentParser()
    sub_parsers = parser.add_subparsers()
    sub_parsers.required = True

    parser_subset = sub_parsers.add_parser('subset', help='subset data')
    parser_subset.add_argument('--area', '-a', type=str,
                               help=('area in format w,s,e,n. Hint: if w is negative, include an "=" sign '
                                     'e.g. --area=-10,...'))
    parser_subset.add_argument('--time', '-t', type=str, metavar='time_window',
                               help='time window e.g. 1999-01-01T00:00:00/2100-12-30T00:00:00')
    parser_subset.add_argument('--time-components', '-c', type=str,
                               help="time components e.g. month:dec,jan,feb or 'year:1970,1980|month:01,02,03'")
    parser_subset.add_argument('--levels', '-l', type=str,
                               help=('comma-separated list of levels (e.g. 500,1000,2000) '
                                     'or slash-separated range (e.g. 50/2000 for 50 to 2000)'))
    parser_subset.add_argument('--output-format', '-f', type=str, metavar='format',
                               choices=('netcdf', 'nc', 'zarr'), default='netcdf')
    parser_subset.add_argument('--file-namer', '-F', type=str,
                               choices=('simple', 'standard'), default='standard')
    parser_subset.add_argument('--output-dir', '-d', type=str, metavar='output_directory', required=True)
    parser_subset.add_argument('collection', type=str, nargs='+', default=list)

    return parser.parse_args()


def get_params(args):

    collection = args.collection if len(args.collection) == 1 else FileMapper(args.collection)

    return {'collection': collection,
            'time': args.time,
            'time_components': args.time_components,
            'area': args.area,
            'level': args.levels,
            'output_type': args.output_format,
            'output_dir': args.output_dir,
            'file_namer': args.file_namer,
            'apply_fixes': False
            }


def check_env():
    """
    Check that ROOCS_CONFIG points to a valid config file
    (although for certain types of invalid file, in fact main is never called,
    so exit might not always be graceful in these cases).
    Call this after get_params() so that 'help' still works even if this is not set.
    """
    config_env_var = 'ROOCS_CONFIG'
    c = configparser.ConfigParser()
    try:
        ret = c.read(os.environ[config_env_var])
    except (KeyError, configparser.Error):
        ret = None
    if not ret:
        print(f'Environment variable {config_env_var} must contain the path name of a config file in ini format')
        sys.exit(1)


def main():
    args = parse_args()
    params = get_params(args)
    check_env()
    ret = subset(**params)
    for uri in ret.file_uris:
        print(uri)


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
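The PR also adds tests/test_cli.py, which is not expanded in this view; the snippet below is only a hypothetical sketch of how the parser and get_params() mapping above could be exercised with pytest (the dataset id is illustrative and is not taken from the real test file).

# Hypothetical sketch only -- the actual tests/test_cli.py added by this PR is not shown here.
import sys

from daops import cli

def test_subset_parser(monkeypatch, tmp_path):
    monkeypatch.setattr(sys, "argv", [
        "daops", "subset",
        "--time", "2010-01-01/2015-01-01",
        "--output-dir", str(tmp_path),
        "cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga",
    ])
    params = cli.get_params(cli.parse_args())
    assert params["time"] == "2010-01-01/2015-01-01"
    assert params["output_dir"] == str(tmp_path)
    assert params["apply_fixes"] is False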
1 change: 0 additions & 1 deletion daops/fix_utils/decadal_utils.py
@@ -35,7 +35,6 @@ def get_time_calendar(ds_id, ds):


def get_lead_times(ds_id, ds):

start_date = datetime.fromisoformat(get_start_date(ds_id, ds))

cal = get_time_calendar(ds_id, ds)
3 changes: 0 additions & 3 deletions daops/utils/consolidate.py
@@ -86,7 +86,6 @@ def get_files_matching_time_range(time_param, file_paths):

# Handle times differently depending on the type of time parameter
if time_param.type == "interval":

tp_start, tp_end = time_param.get_bounds()
req_start_year = get_year(tp_start, default=-99999999)
req_end_year = get_year(tp_end, default=999999999)
@@ -98,7 +97,6 @@
files_in_time_range.append(fpath)

elif time_param.type == "series":

# Get requested years and match to files whose years intersect
req_years = {to_year(tm) for tm in time_param.asdict().get("time_values", [])}

@@ -135,7 +133,6 @@ def consolidate(collection, **kwargs):
time_param = kwargs.get("time")

for dset in collection:

if not catalog:
file_paths = dset_to_filepaths(dset, force=True)

1 change: 0 additions & 1 deletion daops/utils/fixer.py
@@ -36,7 +36,6 @@ def _gather_fixes(self, content):
"""Gathers pre and post processing fixes together"""
if content["_source"]["fixes"]:
for fix in content["_source"]["fixes"]:

ref_implementation = fix["reference_implementation"]
func = locate(ref_implementation)

1 change: 0 additions & 1 deletion daops/utils/normalise.py
@@ -19,7 +19,6 @@ def normalise(collection, apply_fixes=True):
norm_collection = collections.OrderedDict()

for dset, file_paths in collection.items():

ds = open_dataset(dset, file_paths, apply_fixes)
norm_collection[dset] = ds

2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ cftime
netcdf4
elasticsearch>=8.0.1
clisops>=0.9.1
# clisops @ git+https://github.com/roocs/clisops.git@master#egg=clisops
## clisops @ git+https://github.com/roocs/clisops.git@master#egg=clisops
roocs-utils>=0.6.2
# logging
loguru>=0.5.3
17 changes: 17 additions & 0 deletions roocs.ini.tmpl
@@ -0,0 +1,17 @@
[project:cmip5]
base_dir = DATA_DIR/test_data/badc/cmip5/data/cmip5

[project:cmip6]
base_dir = DATA_DIR/test_data/badc/cmip6/data/CMIP6

[project:cordex]
base_dir = DATA_DIR/test_data/badc/cordex/data/cordex

[project:c3s-cmip5]
base_dir = DATA_DIR/test_data/gws/nopw/j04/cp4cds1_vol1/data/c3s-cmip5

[project:c3s-cmip6]
base_dir = DATA_DIR/test_data/badc/cmip6/data/CMIP6

[project:c3s-cordex]
base_dir = DATA_DIR/test_data/gws/nopw/j04/cp4cds1_vol1/data/c3s-cordex
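In the Dockerfile above, sed replaces every DATA_DIR in this template with the cloned mini-esgf-data directory, and ROOCS_CONFIG is exported to point at the generated file, which check_env() in daops/cli.py then reads. A minimal Python sketch of the same wiring (paths mirror the Dockerfile's default build args):

# Sketch of the Dockerfile's `sed "s,DATA_DIR,$data_dir,g"` templating step, done in Python.
import configparser
import os

data_dir = "/root/.mini-esgf-data"      # Dockerfile default for the data_dir build arg
config_file = "/root/roocs.ini"         # Dockerfile default for the config_file build arg

with open("roocs.ini.tmpl") as fh:
    rendered = fh.read().replace("DATA_DIR", data_dir)
with open(config_file, "w") as fh:
    fh.write(rendered)

# daops expects ROOCS_CONFIG to point at the generated file (see check_env() above)
os.environ["ROOCS_CONFIG"] = config_file
cfg = configparser.ConfigParser()
cfg.read(os.environ["ROOCS_CONFIG"])
print(cfg["project:cmip5"]["base_dir"])
# -> /root/.mini-esgf-data/test_data/badc/cmip5/data/cmip5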
