Merge pull request #99 from roocs/cli
Added features for using `daops` with EOEPCA:
- command-line interface and tests: `daops/cli.py` and `tests/test_cli.py`
- Docker file: `Dockerfile`
- CWL file: `app-package.cwl`
agstephens committed Apr 20, 2023
2 parents 0271f08 + a4d3585 commit 3282f63
Showing 17 changed files with 1,029 additions and 28 deletions.
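The command-line interface added in this PR is a thin wrapper around the existing `daops.ops.subset.subset` operation (see `daops/cli.py` below). As an illustration only, the sketch that follows shows the equivalent programmatic call using the keyword names that `get_params()` assembles; the dataset id, time window and output directory are the example values from the Dockerfile and are purely illustrative.

# Illustrative sketch: programmatic equivalent of
#   daops subset --time=2010-1-1/2015-1-1 --output-dir /outputs \
#       cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga
# (dataset id and paths are taken from the Dockerfile's example usage below)
from daops.ops.subset import subset

result = subset(
    collection="cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga",
    time="2010-01-01/2015-01-01",   # time window (start/end)
    output_type="netcdf",           # CLI default for --output-format
    output_dir="/outputs",          # where the subsetted files are written
    file_namer="standard",          # CLI default for --file-namer
    apply_fixes=False,              # the CLI always passes this (see get_params below)
)
for uri in result.file_uris:        # same loop as cli.main()
    print(uri)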
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
@@ -25,6 +25,8 @@ jobs:
# pip install flake8 black pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi
# pip install the package so that the command-line unit tests work
pip install --no-deps -e .
# - name: Lint with flake8
# run: flake8 daops tests
# if: matrix.python-version == 3.8
92 changes: 92 additions & 0 deletions Dockerfile
@@ -0,0 +1,92 @@
##=================================================================================
##
## EXAMPLE USAGE
##
## $ docker build -t daops .
## $ mkdir ~/container-outputs
## $ docker run -it \
## --mount type=bind,source=$HOME/container-outputs,target=/outputs \
## daops
##
## # id=cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga
## # path=/root/.mini-esgf-data/test_data/badc/cmip5/data/$(echo $id | tr / .)
## # ncdump -h $path/*.nc | grep UNLIMITED
## time = UNLIMITED ; // (1140 currently)
## # rm /outputs/*.nc
## # daops subset --output-dir /outputs --time=2010-1-1/2015-1-1 $id
## # ncdump -h /outputs/*.nc | grep UNLIMITED
## time = UNLIMITED ; // (60 currently)
## # exit
##
## $ ls ~/container-outputs/
## zostoga_mon_inmcm4_rcp45_r1i1p1_20100116-20141216.nc
##
##=================================================================================

FROM ubuntu:20.04

SHELL ["/bin/bash", "-c"]

ENV BASH_ENV=~/.bashrc \
MAMBA_ROOT_PREFIX=/srv/conda \
PATH=$PATH:/srv/conda/envs/daops/bin


# ==== Install apt-packages and micromamba ====

RUN apt-get update && \
apt-get install -y ca-certificates ttf-dejavu file wget bash bzip2 git && \
wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba --strip-components=1 && \
./micromamba shell init -s bash -p ~/micromamba && \
apt-get clean autoremove --yes && \
cp ./micromamba /usr/bin && \
rm -fr /srv/conda/pkgs


# ==== Set up conda environment from yml file ====

ARG tmp_env=/tmp/environment.yml
ADD environment.yml $tmp_env
RUN micromamba create -f $tmp_env && \
rm -fr $tmp_env /srv/conda/pkgs


# ==== Clone the data repo ====

ARG data_dir=/root/.mini-esgf-data
ARG data_repo_url=https://github.com/roocs/mini-esgf-data
ARG data_repo_branch=master
RUN git clone $data_repo_url $data_dir && \
cd $data_dir && \
git checkout $data_repo_branch && \
rm -fr .git


# ==== Set up the roocs.ini file with paths pointing to the data repo ====
# ==== and ensure that ROOCS_CONFIG environment variable points to it ====

ARG config_file=/root/roocs.ini
ARG config_tmpl=/tmp/roocs.ini.tmpl
COPY roocs.ini.tmpl $config_tmpl
RUN sed "s,DATA_DIR,$data_dir,g" $config_tmpl > $config_file && \
rm $config_tmpl && \
echo "export ROOCS_CONFIG=$config_file" >> /root/.bashrc


# ==== Install the daops app ====

ARG tmp_install_dir=/tmp/daops-install
RUN mkdir $tmp_install_dir
COPY . $tmp_install_dir
RUN cd $tmp_install_dir && \
/srv/conda/envs/daops/bin/python setup.py install && \
rm -fr $tmp_install_dir && \
echo "export USE_PYGEOS=0" >> /root/.bashrc

# ==== Create a directory that we can bind-mount ====
RUN mkdir /outputs


# ==== Some tidying up (NB further apt-install not possible after this) ====

RUN rm -fr /var/lib/{apt,dpkg,cache,log}
129 changes: 129 additions & 0 deletions app-package.cwl
@@ -0,0 +1,129 @@
$graph:

- class: Workflow
  doc: Runs daops subsetting process
  id: daops
  requirements:
  - class: ScatterFeatureRequirement
  inputs:
    area:
      doc: Area
      label: Area
      type: string[]
    time:
      doc: Time
      label: Time
      type: string[]
    time_components:
      doc: Time Components
      label: Time Components
      type: string[]
    level:
      doc: Level
      label: Level
      type: string[]
    output_format:
      doc: Output Format
      label: Output Format
      type: string[]
    file_namer:
      doc: File Namer
      label: File Namer
      type: string[]
    output_dir:
      doc: Output dir
      label: Output dir
      type: string[]
    collection:
      doc: Collection
      label: Collection
      type: string[]
  label: data-aware operations (daops)
  outputs:
  - id: wf_outputs
    outputSource:
    - step_1/results
    type:
      Directory[]

  steps:
    step_1:
      in:
        area: area
        time: time
        time_components: time_components
        level: level
        output_format: output_format
        file_namer: file_namer
        output_dir: output_dir
        collection: collection
      out:
      - results
      run: '#clt'
      scatter: [area, time, time_components, level, output_format, file_namer, output_dir, collection]
      scatterMethod: flat_crossproduct

- baseCommand: daops
  class: CommandLineTool

  id: clt

  arguments:
  - --area
  - valueFrom: $( inputs.area )
  - --time
  - valueFrom: $( inputs.time )
  - --time-components
  - valueFrom: $( inputs.time_components )
  - --levels
  - valueFrom: $( inputs.level )
  - --output-format
  - valueFrom: $( inputs.output_format )
  - --file-namer
  - valueFrom: $( inputs.file_namer )
  - --output-dir
  - valueFrom: $( inputs.output_dir )
  - --collection
  - valueFrom: $( inputs.collection )

  inputs:
    area:
      type: string
    time:
      type: string
    time_components:
      type: string
    level:
      type: string
    output_format:
      type: string
    file_namer:
      type: string
    output_dir:
      type: string
    collection:
      type: string

  outputs:
    results:
      outputBinding:
        glob: .
      type: Directory
  requirements:
    EnvVarRequirement:
      envDef:
        PATH: /bin:/srv/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
    ResourceRequirement: {}
    InlineJavascriptRequirement: {}
    DockerRequirement:
      dockerPull: iwi/daops:0.0.1
  #stderr: std.err
  #stdout: std.out

cwlVersion: v1.0

$namespaces:
  s: https://schema.org/
s:softwareVersion: 0.3.0
schemas:
- http://schema.org/version/9.0/schemaorg-current-http.rdf
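All eight workflow inputs are string arrays and are scattered with scatterMethod: flat_crossproduct, so the daops command-line tool is invoked once per combination of the scattered values. Purely as an illustration of that scatter semantics (the input values below are made up), a Python sketch:

# Illustrative only: flat_crossproduct runs the tool once per combination
# of the scattered input lists (a flattened cross product).
from itertools import product

times = ["2010-01-01/2015-01-01", "2015-01-01/2020-01-01"]  # made-up values
areas = ["0,-90,360,90"]
collections = ["cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga"]

for t, a, c in product(times, areas, collections):
    print("daops subset", "--time", t, "--area", a, c)
# -> 2 x 1 x 1 = 2 invocations of the command-line tool in this sketch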
88 changes: 88 additions & 0 deletions daops/cli.py
@@ -0,0 +1,88 @@
"""Console script for daops."""

__author__ = """Alan Iwi"""
__contact__ = 'alan.iwi@stfc.ac.uk'
__copyright__ = "Copyright 2023 United Kingdom Research and Innovation"
__license__ = "BSD - see LICENSE file in top-level package directory"

import os
import sys
import argparse
import dateutil.parser
import configparser

from daops.ops.subset import subset
from roocs_utils.utils.file_utils import FileMapper

def parse_args():

    parser = argparse.ArgumentParser()
    sub_parsers = parser.add_subparsers()
    sub_parsers.required = True

    parser_subset = sub_parsers.add_parser('subset', help='subset data')
    parser_subset.add_argument('--area', '-a', type=str,
                               help=('area in format w,s,e,n. Hint: if w is negative, include an "=" sign '
                                     'e.g. --area=-10,...'))
    parser_subset.add_argument('--time', '-t', type=str, metavar='time_window',
                               help='time window e.g. 1999-01-01T00:00:00/2100-12-30T00:00:00')
    parser_subset.add_argument('--time-components', '-c', type=str,
                               help="time components e.g. month:dec,jan,feb or 'year:1970,1980|month:01,02,03'")
    parser_subset.add_argument('--levels', '-l', type=str,
                               help=('comma-separated list of levels (e.g. 500,1000,2000) '
                                     'or slash-separated range (e.g. 50/2000 for 50 to 2000)'))
    parser_subset.add_argument('--output-format', '-f', type=str, metavar='format',
                               choices=('netcdf', 'nc', 'zarr'), default='netcdf')
    parser_subset.add_argument('--file-namer', '-F', type=str,
                               choices=('simple', 'standard'), default='standard')
    parser_subset.add_argument('--output-dir', '-d', type=str, metavar='output_directory', required=True)
    parser_subset.add_argument('collection', type=str, nargs='+', default=list)

    return parser.parse_args()


def get_params(args):

    collection = args.collection if len(args.collection) == 1 else FileMapper(args.collection)

    return {'collection': collection,
            'time': args.time,
            'time_components': args.time_components,
            'area': args.area,
            'level': args.levels,
            'output_type': args.output_format,
            'output_dir': args.output_dir,
            'file_namer': args.file_namer,
            'apply_fixes': False
            }


def check_env():
    """
    Check that ROOCS_CONFIG points to a valid config file
    (although for certain types of invalid file, in fact main is never called,
    so exit might not always be graceful in these cases).
    Call this after get_params() so that 'help' still works even if this is not set.
    """
    config_env_var = 'ROOCS_CONFIG'
    c = configparser.ConfigParser()
    try:
        ret = c.read(os.environ[config_env_var])
    except (KeyError, configparser.Error):
        ret = None
    if not ret:
        print(f'Environment variable {config_env_var} must contain the path name of a config file in ini format')
        sys.exit(1)


def main():
    args = parse_args()
    params = get_params(args)
    check_env()
    ret = subset(**params)
    for uri in ret.file_uris:
        print(uri)


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
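The PR also adds tests/test_cli.py, which is not expanded in this view; the snippet below is only a hypothetical sketch of how the parser and get_params() mapping above could be exercised with pytest (the dataset id is illustrative and is not taken from the real test file).

# Hypothetical sketch only -- the actual tests/test_cli.py added by this PR is not shown here.
import sys

from daops import cli

def test_subset_parser(monkeypatch, tmp_path):
    monkeypatch.setattr(sys, "argv", [
        "daops", "subset",
        "--time", "2010-01-01/2015-01-01",
        "--output-dir", str(tmp_path),
        "cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga",
    ])
    params = cli.get_params(cli.parse_args())
    assert params["time"] == "2010-01-01/2015-01-01"
    assert params["output_dir"] == str(tmp_path)
    assert params["apply_fixes"] is False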
1 change: 0 additions & 1 deletion daops/fix_utils/decadal_utils.py
@@ -35,7 +35,6 @@ def get_time_calendar(ds_id, ds):


def get_lead_times(ds_id, ds):

start_date = datetime.fromisoformat(get_start_date(ds_id, ds))

cal = get_time_calendar(ds_id, ds)
3 changes: 0 additions & 3 deletions daops/utils/consolidate.py
@@ -86,7 +86,6 @@ def get_files_matching_time_range(time_param, file_paths):

# Handle times differently depending on the type of time parameter
if time_param.type == "interval":

tp_start, tp_end = time_param.get_bounds()
req_start_year = get_year(tp_start, default=-99999999)
req_end_year = get_year(tp_end, default=999999999)
@@ -98,7 +97,6 @@
files_in_time_range.append(fpath)

elif time_param.type == "series":

# Get requested years and match to files whose years intersect
req_years = {to_year(tm) for tm in time_param.asdict().get("time_values", [])}

@@ -135,7 +133,6 @@ def consolidate(collection, **kwargs):
time_param = kwargs.get("time")

for dset in collection:

if not catalog:
file_paths = dset_to_filepaths(dset, force=True)

1 change: 0 additions & 1 deletion daops/utils/fixer.py
@@ -36,7 +36,6 @@ def _gather_fixes(self, content):
"""Gathers pre and post processing fixes together"""
if content["_source"]["fixes"]:
for fix in content["_source"]["fixes"]:

ref_implementation = fix["reference_implementation"]
func = locate(ref_implementation)

1 change: 0 additions & 1 deletion daops/utils/normalise.py
@@ -19,7 +19,6 @@ def normalise(collection, apply_fixes=True):
norm_collection = collections.OrderedDict()

for dset, file_paths in collection.items():

ds = open_dataset(dset, file_paths, apply_fixes)
norm_collection[dset] = ds

2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ cftime
netcdf4
elasticsearch>=8.0.1
clisops>=0.9.1
# clisops @ git+https://github.com/roocs/clisops.git@master#egg=clisops
## clisops @ git+https://github.com/roocs/clisops.git@master#egg=clisops
roocs-utils>=0.6.2
# logging
loguru>=0.5.3
17 changes: 17 additions & 0 deletions roocs.ini.tmpl
@@ -0,0 +1,17 @@
[project:cmip5]
base_dir = DATA_DIR/test_data/badc/cmip5/data/cmip5

[project:cmip6]
base_dir = DATA_DIR/test_data/badc/cmip6/data/CMIP6

[project:cordex]
base_dir = DATA_DIR/test_data/badc/cordex/data/cordex

[project:c3s-cmip5]
base_dir = DATA_DIR/test_data/gws/nopw/j04/cp4cds1_vol1/data/c3s-cmip5

[project:c3s-cmip6]
base_dir = DATA_DIR/test_data/badc/cmip6/data/CMIP6

[project:c3s-cordex]
base_dir = DATA_DIR/test_data/gws/nopw/j04/cp4cds1_vol1/data/c3s-cordex
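In the Dockerfile above, sed replaces every DATA_DIR in this template with the cloned mini-esgf-data directory, and ROOCS_CONFIG is exported to point at the generated file, which check_env() in daops/cli.py then reads. A minimal Python sketch of the same wiring (paths mirror the Dockerfile's default build args):

# Sketch of the Dockerfile's `sed "s,DATA_DIR,$data_dir,g"` templating step, done in Python.
import configparser
import os

data_dir = "/root/.mini-esgf-data"      # Dockerfile default for the data_dir build arg
config_file = "/root/roocs.ini"         # Dockerfile default for the config_file build arg

with open("roocs.ini.tmpl") as fh:
    rendered = fh.read().replace("DATA_DIR", data_dir)
with open(config_file, "w") as fh:
    fh.write(rendered)

# daops expects ROOCS_CONFIG to point at the generated file (see check_env() above)
os.environ["ROOCS_CONFIG"] = config_file
cfg = configparser.ConfigParser()
cfg.read(os.environ["ROOCS_CONFIG"])
print(cfg["project:cmip5"]["base_dir"])
# -> /root/.mini-esgf-data/test_data/badc/cmip5/data/cmip5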
