diff --git a/CHANGELOG.md b/CHANGELOG.md
index 615498ce..47c1c8ce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]
 ### Added
+- `history_json` attribute is populated on the merged output
 ### Changed
 ### Deprecated
 ### Removed
diff --git a/podaac/merger/preprocess_worker.py b/podaac/merger/preprocess_worker.py
index 49d19201..25429245 100644
--- a/podaac/merger/preprocess_worker.py
+++ b/podaac/merger/preprocess_worker.py
@@ -1,8 +1,13 @@
 """Preprocessing methods and the utilities to automagically run them in single-thread/multiprocess modes"""
+import json
+import os
+import queue
 from copy import deepcopy
+from datetime import datetime, timezone
 from multiprocessing import Manager, Process
-import queue
+
+import importlib_metadata
 
 import netCDF4 as nc
 import numpy as np
 
@@ -71,6 +76,54 @@ def merge_metadata(merged_metadata, subset_metadata):
             merged_attrs[attr_name] = False  # mark as inconsistent
 
 
+def construct_history(input_files):
+    """
+    Construct history JSON entry for this concatenation operation
+    https://wiki.earthdata.nasa.gov/display/TRT/In-File+Provenance+Metadata+-+TRT-42
+
+    Parameters
+    ----------
+    input_files : list
+        List of input files
+
+    Returns
+    -------
+    dict
+        History JSON constructed for this concat operation
+    """
+    base_names = list(map(os.path.basename, input_files))
+    history_json = {
+        "date_time": datetime.now(tz=timezone.utc).isoformat(),
+        "derived_from": base_names,
+        "program": 'concise',
+        "version": importlib_metadata.distribution('podaac-concise').version,
+        "parameters": f'input_files={input_files}',
+        "program_ref": "https://cmr.earthdata.nasa.gov:443/search/concepts/S2153799015-POCLOUD",
+        "$schema": "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json"
+    }
+    return history_json
+
+
+def retrieve_history(dataset):
+    """
+    Retrieve history_json field from NetCDF dataset, if it exists
+
+    Parameters
+    ----------
+    dataset : netCDF4.Dataset
+        NetCDF Dataset representing a single granule
+
+    Returns
+    -------
+    list
+        Parsed history_json entries, or an empty list if the attribute is absent
+    """
+    if 'history_json' not in dataset.ncattrs():
+        return []
+    history_json = dataset.getncattr('history_json')
+    return json.loads(history_json)
+
+
 def _run_single_core(file_list):
     """
     Run the granule preprocessing in the current thread/single-core mode
@@ -90,14 +143,22 @@
     max_dims = {}
     var_metadata = {}
     group_metadata = {}
+    history_json = []
 
     for file in file_list:
         with nc.Dataset(file, 'r') as dataset:
             dataset.set_auto_maskandscale(False)
             process_groups(dataset, group_list, max_dims, group_metadata, var_metadata, var_info)
+            history_json.extend(retrieve_history(dataset))
 
     group_list.sort()  # Ensure insertion order doesn't matter between granules
 
+    history_json.append(construct_history(file_list))
+    group_metadata[group_list[0]]['history_json'] = json.dumps(
+        history_json,
+        default=str
+    )
+
     return {
         'group_list': group_list,
         'max_dims': max_dims,
@@ -156,6 +217,7 @@
     max_dims = {}
     var_metadata = {}
     group_metadata = {}
+    history_json = []
 
     for result in results:
         # The following data should be consistent between granules and
@@ -176,6 +238,15 @@
         merge_metadata(var_metadata, result['var_metadata'])
         merge_metadata(group_metadata, result['group_metadata'])
 
+        # Merge history_json entries from input files
+        history_json.extend(result['history_json'])
+
+    history_json.append(construct_history(file_list))
+    group_metadata[group_list[0]]['history_json'] = json.dumps(
+        history_json,
+        default=str
+    )
+
     return {
         'group_list': group_list,
         'max_dims': max_dims,
@@ -207,6 +278,7 @@
     var_info = {}
     var_metadata = {}
     group_metadata = {}
+    history_json = []
 
     while not in_queue.empty():
         try:
@@ -218,6 +290,7 @@
         with nc.Dataset(file, 'r') as dataset:
             dataset.set_auto_maskandscale(False)
             process_groups(dataset, group_list, max_dims, group_metadata, var_metadata, var_info)
+            history_json.extend(retrieve_history(dataset))
 
     group_list.sort()  # Ensure insertion order doesn't matter between granules
 
@@ -227,7 +300,8 @@
         'max_dims': max_dims,
         'var_info': var_info,
         'var_metadata': var_metadata,
-        'group_metadata': group_metadata
+        'group_metadata': group_metadata,
+        'history_json': history_json
     })
diff --git a/poetry.lock b/poetry.lock
index 210a956c..61e11480 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -8,7 +8,7 @@ python-versions = "*"
 
 [[package]]
 name = "astroid"
-version = "2.8.5"
+version = "2.8.6"
 description = "An abstract syntax tree for Python with inference support."
 category = "dev"
 optional = false
@@ -54,14 +54,14 @@ pytz = ">=2015.7"
 
 [[package]]
 name = "boto3"
-version = "1.20.8"
+version = "1.20.10"
 description = "The AWS SDK for Python"
 category = "main"
 optional = false
 python-versions = ">= 3.6"
 
 [package.dependencies]
-botocore = ">=1.23.8,<1.24.0"
+botocore = ">=1.23.10,<1.24.0"
 jmespath = ">=0.7.1,<1.0.0"
 s3transfer = ">=0.5.0,<0.6.0"
 
 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
 
@@ -70,7 +70,7 @@
 [[package]]
 name = "botocore"
-version = "1.23.8"
+version = "1.23.10"
 description = "Low-level, data-driven core of boto 3."
category = "main" optional = false @@ -215,7 +215,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" name = "importlib-metadata" version = "4.8.2" description = "Read metadata from Python packages" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -317,14 +317,14 @@ python-versions = ">=3.7" [[package]] name = "packaging" -version = "21.2" +version = "21.3" description = "Core utilities for Python packages" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -pyparsing = ">=2.0.2,<3" +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "platformdirs" @@ -425,11 +425,14 @@ tests = ["pytest (>=3.2.1,!=3.3.0)", "hypothesis (>=3.27.0)"] [[package]] name = "pyparsing" -version = "2.4.7" +version = "3.0.6" description = "Python parsing module" category = "main" optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +python-versions = ">=3.6" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pystac" @@ -715,7 +718,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" name = "zipp" version = "3.6.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -726,7 +729,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "e070d79ffce2e490b7e3d70113099bd57a1be7a1145933b1d2e9e9f52b894078" +content-hash = "7e3d0c7caf34d7bb910b293bb02fd61c8fb8945ea963db81f91e4b172e7ce559" [metadata.files] alabaster = [ @@ -734,8 +737,8 @@ alabaster = [ {file = "alabaster-0.7.12.tar.gz", hash = "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"}, ] astroid = [ - {file = "astroid-2.8.5-py3-none-any.whl", hash = "sha256:abc423a1e85bc1553954a14f2053473d2b7f8baf32eae62a328be24f436b5107"}, - {file = "astroid-2.8.5.tar.gz", hash = "sha256:11f7356737b624c42e21e71fe85eea6875cb94c03c82ac76bd535a0ff10b0f25"}, + {file = "astroid-2.8.6-py3-none-any.whl", hash = "sha256:cd8326b424c971e7d87678609cf6275d22028afd37d6ac59c16d47f1245882f6"}, + {file = "astroid-2.8.6.tar.gz", hash = "sha256:5f6f75e45f15290e73b56f9dfde95b4bf96382284cde406ef4203e928335a495"}, ] atomicwrites = [ {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, @@ -750,12 +753,12 @@ babel = [ {file = "Babel-2.9.1.tar.gz", hash = "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"}, ] boto3 = [ - {file = "boto3-1.20.8-py3-none-any.whl", hash = "sha256:c0ac23cc36dc484edd1edd28903b5712cb07507af1ae19b2e8d6db176416d9e2"}, - {file = "boto3-1.20.8.tar.gz", hash = "sha256:81ebdcabc534a52e2b7a2bfcbe1a1d7f1e34f028f7fe1cb16ccd80e34cea867a"}, + {file = "boto3-1.20.10-py3-none-any.whl", hash = "sha256:e2b5ce2679424a6c2bfc2ee4bb42d9100c8c08b21eff8d74cff85a7243a76d7b"}, + {file = "boto3-1.20.10.tar.gz", hash = "sha256:20a5109a37414a52c55d2048388f02cb7cf46fc0ca7be08b3bf81f4c5c053feb"}, ] botocore = [ - {file = "botocore-1.23.8-py3-none-any.whl", hash = "sha256:a0c7cfea155a0202ab197a016736dd4e6a26f9e416bdd9cdd2c9a3fb88ffa5a8"}, - {file = "botocore-1.23.8.tar.gz", hash = "sha256:ae4ed9666199020a9e53c3d3efc0a7d417315cd2313b70cb013282afe70ac358"}, + {file = "botocore-1.23.10-py3-none-any.whl", hash = "sha256:11670d3ac14eed1122e0154a7e1563c2c270beef43996466f8d11fbf5cf31611"}, + {file = "botocore-1.23.10.tar.gz", 
hash = "sha256:0adda9a4a95221027312eaaee0ec9fe2239fb2f285fced3ddca54b1310b864ee"}, ] certifi = [ {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, @@ -1085,8 +1088,8 @@ numpy = [ {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, ] packaging = [ - {file = "packaging-21.2-py3-none-any.whl", hash = "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"}, - {file = "packaging-21.2.tar.gz", hash = "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966"}, + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] platformdirs = [ {file = "platformdirs-2.4.0-py3-none-any.whl", hash = "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"}, @@ -1141,8 +1144,8 @@ pynacl = [ {file = "PyNaCl-1.4.0.tar.gz", hash = "sha256:54e9a2c849c742006516ad56a88f5c74bf2ce92c9f67435187c3c5953b346505"}, ] pyparsing = [ - {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, - {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, + {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, + {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"}, ] pystac = [ {file = "pystac-0.5.6-py3-none-any.whl", hash = "sha256:3dfb9068169570714276e54a80832e555a136595a4d5316e6998bcc55cc2d639"}, diff --git a/pyproject.toml b/pyproject.toml index 9ef52ae5..2613507d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ python = "^3.9" netCDF4 = "^1.5.6" numpy = "^1.20.3" harmony-service-lib = "^1.0.9" +importlib-metadata = "^4.8.1" [tool.poetry.dev-dependencies] pytest = "^6.1" diff --git a/tests/data/l2ss_py_output/ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2_subsetted.nc b/tests/data/l2ss_py_output/ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2_subsetted.nc new file mode 100644 index 00000000..460b1fb7 Binary files /dev/null and b/tests/data/l2ss_py_output/ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2_subsetted.nc differ diff --git a/tests/data/l2ss_py_output/ascat_20150702_102400_metopa_45146_eps_o_250_2300_ovw.l2_subsetted.nc b/tests/data/l2ss_py_output/ascat_20150702_102400_metopa_45146_eps_o_250_2300_ovw.l2_subsetted.nc new file mode 100644 index 00000000..ccc709f3 Binary files /dev/null and b/tests/data/l2ss_py_output/ascat_20150702_102400_metopa_45146_eps_o_250_2300_ovw.l2_subsetted.nc differ diff --git a/tests/test_merge.py b/tests/test_merge.py index b7905996..9505a754 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -6,6 +6,9 @@ import netCDF4 as nc import numpy as np import pytest +import json +import os +import importlib_metadata from podaac.merger import merge @@ -169,3 +172,61 @@ def test_compare_java_single(self): def test_compare_java_multi(self): self.run_java_verification('python_merge_multi.nc') + + def test_history(self): + data_dir = 'no_groups' + output_name_single = 'test_history_single.nc' + output_name_multi = 'test_history_multi.nc' + data_path = self.__test_data_path.joinpath(data_dir) + input_files = 
+        input_files = list(data_path.iterdir())
+
+        def assert_valid_history(merged_dataset, input_files):
+            input_files = [os.path.basename(file_name) for file_name in input_files]
+            history_json = json.loads(merged_dataset.getncattr('history_json'))[-1]
+            assert 'date_time' in history_json
+            assert history_json.get('program') == 'concise'
+            assert history_json.get('derived_from') == input_files
+            assert history_json.get('version') == importlib_metadata.distribution('podaac-concise').version
+            assert 'input_files=' in history_json.get('parameters')
+            assert history_json.get('program_ref') == 'https://cmr.earthdata.nasa.gov:443/search/concepts/S2153799015-POCLOUD'
+            assert history_json.get('$schema') == 'https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json'
+
+        # Single core mode
+        merge.merge_netcdf_files(
+            input_files=input_files,
+            output_file=self.__output_path.joinpath(output_name_single),
+            process_count=1
+        )
+        merged_dataset = nc.Dataset(self.__output_path.joinpath(output_name_single))
+        assert_valid_history(merged_dataset, input_files)
+
+        merged_dataset.close()
+
+        # Multi core mode
+        merge.merge_netcdf_files(
+            input_files=input_files,
+            output_file=self.__output_path.joinpath(output_name_multi),
+            process_count=2
+        )
+        merged_dataset = nc.Dataset(self.__output_path.joinpath(output_name_multi))
+        assert_valid_history(merged_dataset, input_files)
+
+        merged_dataset.close()
+
+        # Run again, but use l2ss-py output which contains existing
+        # history_json. Concise history should contain new entry plus
+        # all entries from input files
+        data_path = self.__test_data_path.joinpath('l2ss_py_output')
+        input_files = list(data_path.iterdir())
+        merge.merge_netcdf_files(
+            input_files=input_files,
+            output_file=self.__output_path.joinpath(output_name_single),
+            process_count=1
+        )
+        merged_dataset = nc.Dataset(self.__output_path.joinpath(output_name_single))
+        assert_valid_history(merged_dataset, input_files)
+        history_json = json.loads(merged_dataset.getncattr('history_json'))
+        assert len(history_json) == 3
+        assert history_json[0]['program'] == 'l2ss-py'
+        assert history_json[1]['program'] == 'l2ss-py'
+        assert history_json[2]['program'] == 'concise'
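
Reviewer note (not part of the patch): a minimal sketch of how the provenance this change records can be read back from a merged output file. The path 'merged.nc' is hypothetical; the attribute name and entry keys follow construct_history() above.

# Sketch: inspect the history_json provenance written by this patch.
# Assumes a merged output file exists at the hypothetical path 'merged.nc'.
import json

import netCDF4 as nc

with nc.Dataset('merged.nc', 'r') as dataset:
    # history_json is stored as a JSON-encoded string attribute; parsing it
    # yields a list with one entry per processing step (e.g. l2ss-py subset
    # operations followed by a concise merge, as exercised in test_history).
    history = json.loads(dataset.getncattr('history_json'))

for entry in history:
    # Keys written by construct_history(): date_time, derived_from, program,
    # version, parameters, program_ref, $schema
    print(f"{entry['date_time']}  {entry['program']}  <- {entry['derived_from']}")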