Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add group_files and from_files utility functions for creating Scenes from multiple files #576

Merged
merged 9 commits into from
Jan 29, 2019
3 changes: 2 additions & 1 deletion doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import os
import sys
from datetime import datetime

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
Expand Down Expand Up @@ -75,7 +76,7 @@ def __getattr__(cls, name):

# General information about the project.
# General information about the project.
project = u'SatPy'
# Compute the end of the copyright year range at build time so the docs
# never go stale (replaces the previously hard-coded "2009-2016").
# NOTE(review): utcnow() returns a naive UTC datetime; fine for a year
# string, but datetime.now(timezone.utc) is the modern spelling.
copyright = u'2009-{}, The PyTroll Team'.format(datetime.utcnow().strftime("%Y"))

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
Expand Down
25 changes: 21 additions & 4 deletions doc/source/multiscene.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,27 @@ The MultiScene can take "frames" of data and join them together in a single
animation movie file. Saving animations required the `imageio` python library
and for most available formats the ``ffmpeg`` command line tool suite should
also be installed. The below example saves a series of GOES-EAST ABI channel
1 and channel 2 frames to MP4 movie files. Note that currently there is no
easy way to map files from multiple time steps/orbits in to individual Scene
objects. The `glob` function and for loops are used to group files into Scene
objects that, if used individually, could load the data we want.
1 and channel 2 frames to MP4 movie files. We can use the
:meth:`MultiScene.from_files <satpy.multiscene.MultiScene.from_files>` class
method to create a `MultiScene` from a series of files. This uses the
:func:`~satpy.readers.group_files` utility function to group files by start
time.

>>> from satpy import Scene, MultiScene
>>> from glob import glob
>>> mscn = MultiScene.from_files(glob('/data/abi/day_1/*C0[12]*.nc'), reader='abi_l1b')
>>> mscn.load(['C01', 'C02'])
>>> mscn.save_animation('{name}_{start_time:%Y%m%d_%H%M%S}.mp4', fps=2)

.. versionadded:: 0.12

The ``from_files`` and ``group_files`` functions were added in SatPy 0.12.
See below for an alternative solution.

For older versions of SatPy we can manually create the `Scene` objects used.
The :func:`~glob.glob` function and for loops are used to group files into
Scene objects that, if used individually, could load the data we want. The
code below is equivalent to the ``from_files`` code above:

>>> from satpy import Scene, MultiScene
>>> from glob import glob
Expand Down
16 changes: 16 additions & 0 deletions satpy/multiscene.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,22 @@ def __init__(self, scenes=None):
"""
self._scenes = scenes or []

@classmethod
def from_files(cls, files_to_sort, reader=None, **kwargs):
    """Create multiple Scene objects from multiple files.

    Files are grouped with :func:`satpy.readers.group_files`; see that
    function for the accepted keyword arguments. One `Scene` is built
    per file group and the resulting Scenes are wrapped in a new
    `MultiScene`.

    .. versionadded:: 0.12

    """
    from satpy.readers import group_files
    groups = group_files(files_to_sort, reader=reader, **kwargs)
    return cls([Scene(filenames=group) for group in groups])

def __iter__(self):
    """Iterate over the provided Scenes once."""
    # NOTE(review): returns ``self.scenes`` directly, so this presumably
    # yields a generator/iterator from that property ("once" in the
    # docstring suggests so) — confirm, since returning a plain list from
    # ``__iter__`` would not satisfy the iterator protocol.
    return self.scenes
Expand Down
98 changes: 92 additions & 6 deletions satpy/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import numbers
import os
import warnings
from datetime import datetime, timedelta

import six
import yaml
Expand Down Expand Up @@ -407,6 +408,95 @@ def __delitem__(self, key):
return super(DatasetDict, self).__delitem__(key)


def group_files(files_to_sort, reader=None, time_threshold=10,
                group_keys=('start_time',), ppp_config_dir=None, reader_kwargs=None):
    """Group series of files by file pattern information.

    By default this will group files by their filename ``start_time``
    assuming it exists in the pattern. By passing the individual
    dictionaries returned by this function to the Scene classes'
    ``filenames``, a series of `Scene` objects can be easily created.

    .. versionadded:: 0.12

    Args:
        files_to_sort (iterable): File paths to sort in to groups.
        reader (str): Reader whose file patterns should be used to sort
            files. Required.
        time_threshold (int): Number of seconds used to consider time elements
            in a group as being equal. For example, if the 'start_time' item
            is used to group files then any time within `time_threshold`
            seconds of the first file's 'start_time' will be seen as occurring
            at the same time.
        group_keys (list or tuple): File pattern information to use to group
            files. Keys are sorted in order and only the first key is used when
            comparing datetime elements with `time_threshold` (see above). This
            means it is recommended that datetime values should only come from
            the first key in ``group_keys``. Otherwise, there is a good chance
            that files will not be grouped properly (datetimes being barely
            unequal). Defaults to ``('start_time',)``.
        ppp_config_dir (str): Root user configuration directory for SatPy.
            This will be deprecated in the future, but is here for consistency
            with other SatPy features.
        reader_kwargs (dict): Additional keyword arguments to pass to reader
            creation.

    Returns:
        List of dictionaries mapping 'reader' to a list of filenames.
        Each of these dictionaries can be passed as ``filenames`` to
        a `Scene` object.

    Raises:
        ValueError: If ``reader`` is not provided.

    """
    # FUTURE: Find the best reader for each filename using `find_files_and_readers`
    if reader is None:
        raise ValueError("'reader' keyword argument is required.")
    elif not isinstance(reader, (list, tuple)):
        reader = [reader]

    # FUTURE: Handle multiple readers
    reader = reader[0]
    reader_configs = list(configs_for_reader(reader, ppp_config_dir))[0]
    reader_kwargs = reader_kwargs or {}
    try:
        reader_instance = load_reader(reader_configs, **reader_kwargs)
    except (KeyError, IOError, yaml.YAMLError) as err:
        LOG.info('Cannot use %s', str(reader_configs))
        LOG.debug(str(err))
        raise

    # extract a grouping-key tuple for every file that matches one of the
    # reader's filename patterns; files that match no pattern are dropped
    file_keys = []
    for filetype, filetype_info in reader_instance.sorted_filetype_items():
        for f, file_info in reader_instance.filename_items_for_filetype(files_to_sort, filetype_info):
            group_key = tuple(file_info.get(k) for k in group_keys)
            file_keys.append((group_key, f))

    prev_key = None
    threshold = timedelta(seconds=time_threshold)
    file_groups = {}
    for gk, f in sorted(file_keys):
        # use first element of key as time identifier (if datetime type)
        if prev_key is None:
            is_new_group = True
        elif isinstance(gk[0], datetime):
            # datetimes within threshold difference are "the same time";
            # compared against the group's *first* file, not the previous one
            is_new_group = (gk[0] - prev_key[0]) > threshold
        else:
            is_new_group = gk[0] != prev_key[0]

        # if this is a new group based on the first element
        if is_new_group or gk[1:] != prev_key[1:]:
            file_groups[gk] = [f]
            prev_key = gk
        else:
            file_groups[prev_key].append(f)
    sorted_group_keys = sorted(file_groups)
    # passable to Scene as 'filenames'
    return [{reader: file_groups[group_key]} for group_key in sorted_group_keys]


def read_reader_config(config_files, loader=yaml.Loader):
"""Read the reader `config_files` and return the info extracted."""

Expand All @@ -427,13 +517,9 @@ def read_reader_config(config_files, loader=yaml.Loader):


def load_reader(reader_configs, **reader_kwargs):
    """Import and set up the reader described by *reader_configs*."""
    reader_info = read_reader_config(reader_configs)
    reader_cls = reader_info['reader']
    return reader_cls(config_files=reader_configs, **reader_kwargs)


Expand Down
17 changes: 17 additions & 0 deletions satpy/tests/test_multiscene.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,23 @@ def test_properties(self):
self.assertSetEqual(mscn.shared_dataset_ids, {ds1_id, ds2_id})
self.assertFalse(mscn.all_same_area)

def test_from_files(self):
    """Test creating a multiscene from multiple files."""
    from satpy import MultiScene
    input_files = [
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171502203_e20171171504576_c20171171505018.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171507203_e20171171509576_c20171171510018.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171512203_e20171171514576_c20171171515017.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171517203_e20171171519577_c20171171520019.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171522203_e20171171524576_c20171171525020.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171527203_e20171171529576_c20171171530017.nc",
    ]
    with mock.patch('satpy.multiscene.Scene') as scn_mock:
        mscn = MultiScene.from_files(input_files, reader='abi_l1b')
        calls = [mock.call(filenames={'abi_l1b': [in_file]}) for in_file in input_files]
        scn_mock.assert_has_calls(calls)
        # was assertTrue(len(...), 6): the 6 was silently treated as the
        # failure *message*, so the count was never actually checked
        self.assertEqual(len(mscn.scenes), 6)


class TestMultiSceneSave(unittest.TestCase):
"""Test saving a MultiScene to various formats."""
Expand Down
90 changes: 86 additions & 4 deletions satpy/tests/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,8 +527,7 @@ def test_reader_load_failed(self):
# touch the file so it exists on disk
with mock.patch('yaml.load') as load:
load.side_effect = yaml.YAMLError("Import problems")
self.assertRaises(yaml.YAMLError, find_files_and_readers,
reader='viirs_sdr')
self.assertRaises(yaml.YAMLError, find_files_and_readers, reader='viirs_sdr')

def test_old_reader_name_mapping(self):
"""Test that requesting old reader names raises a warning."""
Expand Down Expand Up @@ -582,15 +581,98 @@ def test_available_readers(self):
self.assertIn('name', reader_info)


class TestGroupFiles(unittest.TestCase):
    """Test the 'group_files' utility function.

    All ``assertTrue(x, n)`` calls were replaced by ``assertEqual(x, n)``:
    with two positional arguments ``assertTrue`` treats the second as the
    failure *message* and never compares, so the group counts were never
    actually checked. The expected counts below were recomputed from the
    grouping logic accordingly.
    """

    def setUp(self):
        """Set up test filenames to use."""
        # Six CONUS time steps (5 minutes apart) for two ABI channels
        input_files = [
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171502203_e20171171504576_c20171171505018.nc",
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171507203_e20171171509576_c20171171510018.nc",
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171512203_e20171171514576_c20171171515017.nc",
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171517203_e20171171519577_c20171171520019.nc",
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171522203_e20171171524576_c20171171525020.nc",
            "OR_ABI-L1b-RadC-M3C01_G16_s20171171527203_e20171171529576_c20171171530017.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171502203_e20171171504576_c20171171505008.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171507203_e20171171509576_c20171171510012.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171512203_e20171171514576_c20171171515007.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171517203_e20171171519576_c20171171520010.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171522203_e20171171524576_c20171171525008.nc",
            "OR_ABI-L1b-RadC-M3C02_G16_s20171171527203_e20171171529576_c20171171530008.nc",
        ]
        self.g16_files = input_files
        self.g17_files = [x.replace('G16', 'G17') for x in input_files]

    def test_no_reader(self):
        """Test that reader must be provided."""
        from satpy.readers import group_files
        self.assertRaises(ValueError, group_files, [])

    def test_bad_reader(self):
        """Test that a reader that fails to load causes an error."""
        from satpy.readers import group_files
        import yaml
        # make the reader YAML fail to parse so load_reader raises
        with mock.patch('yaml.load') as load:
            load.side_effect = yaml.YAMLError("Import problems")
            self.assertRaises(yaml.YAMLError, group_files, [], reader='abi_l1b')

    def test_default_behavior(self):
        """Test the default behavior with the 'abi_l1b' reader."""
        from satpy.readers import group_files
        groups = group_files(self.g16_files, reader='abi_l1b')
        # 6 start times -> 6 groups, each with one C01 and one C02 file
        self.assertEqual(len(groups), 6)
        self.assertEqual(len(groups[0]['abi_l1b']), 2)

    def test_non_datetime_group_key(self):
        """Test what happens when the start_time isn't used for grouping."""
        from satpy.readers import group_files
        groups = group_files(self.g16_files, reader='abi_l1b', group_keys=('platform_shortname',))
        # all files share the platform -> one group holding every file
        self.assertEqual(len(groups), 1)
        self.assertEqual(len(groups[0]['abi_l1b']), 12)

    def test_large_time_threshold(self):
        """Test what happens when the time threshold holds multiple files."""
        from satpy.readers import group_files
        groups = group_files(self.g16_files, reader='abi_l1b', time_threshold=60*8)
        # 8 minute threshold merges pairs of 5-minute time steps:
        # {1502, 1507}, {1512, 1517}, {1522, 1527} -> 3 groups of 4 files
        self.assertEqual(len(groups), 3)
        self.assertEqual(len(groups[0]['abi_l1b']), 4)

    def test_two_instruments_files(self):
        """Test the default behavior when two instruments files are provided.

        This is undesired from a user point of view since we don't want G16
        and G17 files in the same Scene.

        """
        from satpy.readers import group_files
        groups = group_files(self.g16_files + self.g17_files, reader='abi_l1b')
        # grouped by start_time only -> both platforms end up together
        self.assertEqual(len(groups), 6)
        self.assertEqual(len(groups[0]['abi_l1b']), 4)

    def test_two_instruments_files_split(self):
        """Test the default behavior when two instruments files are provided and split.

        Tell the sorting to include the platform identifier as another field
        to use for grouping.

        """
        from satpy.readers import group_files
        groups = group_files(self.g16_files + self.g17_files, reader='abi_l1b',
                             group_keys=('start_time', 'platform_shortname'))
        # 6 start times x 2 platforms -> 12 groups of 2 channel files each
        self.assertEqual(len(groups), 12)
        self.assertEqual(len(groups[0]['abi_l1b']), 2)


def suite():
    """Return the test suite for test_readers."""
    loader = unittest.TestLoader()
    mysuite = unittest.TestSuite()
    for case in (TestDatasetDict, TestReaderLoader, TestFindFilesAndReaders,
                 TestYAMLFiles, TestGroupFiles):
        mysuite.addTest(loader.loadTestsFromTestCase(case))
    return mysuite

Expand Down