Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add group_files and from_files utility functions for creating Scenes from multiple files #576

Merged
merged 9 commits into from
Jan 29, 2019
3 changes: 2 additions & 1 deletion doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import os
import sys
from datetime import datetime

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
Expand Down Expand Up @@ -75,7 +76,7 @@ def __getattr__(cls, name):

# General information about the project.
project = u'SatPy'
copyright = u'2009-2016, The PyTroll Team'
copyright = u'2009-{}, The PyTroll Team'.format(datetime.utcnow().strftime("%Y"))

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
Expand Down
25 changes: 21 additions & 4 deletions doc/source/multiscene.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,27 @@ The MultiScene can take "frames" of data and join them together in a single
animation movie file. Saving animations required the `imageio` python library
and for most available formats the ``ffmpeg`` command line tool suite should
also be installed. The below example saves a series of GOES-EAST ABI channel
1 and channel 2 frames to MP4 movie files. Note that currently there is no
easy way to map files from multiple time steps/orbits in to individual Scene
objects. The `glob` function and for loops are used to group files into Scene
objects that, if used individually, could load the data we want.
1 and channel 2 frames to MP4 movie files. We can use the
:meth:`MultiScene.from_files <satpy.multiscene.MultiScene.from_files>` class
method to create a `MultiScene` from a series of files. This uses the
:func:`~satpy.readers.group_files` utility function to group files by start
time.

>>> from satpy import Scene, MultiScene
>>> from glob import glob
>>> mscn = MultiScene.from_files(glob('/data/abi/day_1/*C0[12]*.nc'), reader='abi_l1b')
>>> mscn.load(['C01', 'C02'])
>>> mscn.save_animation('{name}_{start_time:%Y%m%d_%H%M%S}.mp4', fps=2)

.. versionadded:: 0.12

The ``from_files`` and ``group_files`` functions were added in SatPy 0.12.
See below for an alternative solution.

For older versions of SatPy we can manually create the `Scene` objects used.
The :func:`~glob.glob` function and for loops are used to group files into
Scene objects that, if used individually, could load the data we want. The
code below is equivalent to the ``from_files`` code above:

>>> from satpy import Scene, MultiScene
>>> from glob import glob
Expand Down
2 changes: 2 additions & 0 deletions satpy/etc/readers/abi_l1b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ reader:
sensors: [abi]
default_channels:
reader: !!python/name:satpy.readers.yaml_reader.FileYAMLReader
# file pattern keys to sort files by with 'satpy.readers.group_files'
group_keys: ['start_time', 'platform_shortname', 'scene_abbr']

file_types:
# NOTE: observation_type == product acronym in PUG document
Expand Down
3 changes: 2 additions & 1 deletion satpy/etc/readers/ahi_hrit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ reader:
description: JMA HRIT Reader
name: ahi_hrit
sensors: [ahi]
default_channels: []
reader: !!python/name:satpy.readers.yaml_reader.FileYAMLReader
# file pattern keys to sort files by with 'satpy.readers.group_files'
group_keys: ['start_time', 'area']

file_types:
hrit_b01:
Expand Down
3 changes: 2 additions & 1 deletion satpy/etc/readers/ahi_hsd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ reader:
name: ahi_hsd
reader: !!python/name:satpy.readers.yaml_reader.FileYAMLReader ''
sensors: [ahi]
default_datasets:
# file pattern keys to sort files by with 'satpy.readers.group_files'
group_keys: ['start_time', 'platform_shortname', 'area']

datasets:
B01:
Expand Down
103 changes: 79 additions & 24 deletions satpy/multiscene.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,33 @@ def __init__(self, scene_gen):
self._scene_cache = []
self._dataset_idx = {}
# this class itself is not an iterator, make one
self._self_iter = iter(self)
self._self_iter = self._create_cached_iter()

def __iter__(self):
@property
def first(self):
    """Return the first scene produced by this generator."""
    scene_iter = iter(self)
    return next(scene_iter)

def _create_cached_iter(self):
    """Yield each scene from the wrapped generator, appending it to the cache as it is produced."""
    for scene in self._scene_gen:
        self._scene_cache.append(scene)
        yield scene

def __iter__(self):
    """Iterate over the provided scenes, caching them for later.

    Serves already-cached scenes first; once past the cache, pulls new
    scenes from the shared caching iterator so that multiple concurrent
    iterators see the same sequence without re-running the generator.
    """
    pos = 0
    while True:
        if pos < len(self._scene_cache):
            scn = self._scene_cache[pos]
        else:
            try:
                scn = next(self._self_iter)
            except StopIteration:
                return
        yield scn
        pos += 1

def __getitem__(self, ds_id):
"""Get a specific dataset from the scenes."""
if ds_id in self._dataset_idx:
Expand Down Expand Up @@ -155,10 +174,40 @@ def __init__(self, scenes=None):

"""
self._scenes = scenes or []
scenes = iter(self._scenes)
self._scene_gen = _SceneGenerator(iter(scenes))
# if we were originally given a generator-like object then we want to
# coordinate the loading between _SceneGenerator and _scenes
# otherwise it doesn't really matter and other operations may prefer
# a list
if not isinstance(scenes, (list, tuple)):
self._scenes = iter(self._scene_gen)

@property
def first_scene(self):
    """First Scene of this MultiScene object."""
    # Delegate to the caching scene generator so the scene is not consumed.
    return self._scene_gen.first

@classmethod
def from_files(cls, files_to_sort, reader=None, **kwargs):
    """Create multiple Scene objects from multiple files.

    This uses the :func:`satpy.readers.group_files` function to group
    files. See this function for more details on possible keyword
    arguments.

    .. versionadded:: 0.12

    """
    from satpy.readers import group_files
    groups = group_files(files_to_sort, reader=reader, **kwargs)
    # Lazily build one Scene per file group.
    return cls(Scene(filenames=group) for group in groups)

def __iter__(self):
    """Iterate over the provided Scenes once.

    Note: the stray ``return self.scenes`` that preceded the loop made
    this generator stop immediately (a bare return in a generator ends
    iteration), so the loop below was unreachable; it has been removed.
    """
    for scn in self._scenes:
        yield scn

@property
def scenes(self):
Expand Down Expand Up @@ -210,27 +259,35 @@ def _all_same_area(self, dataset_ids):
def all_same_area(self):
return self._all_same_area(self.loaded_dataset_ids)

def _gen_load(self, gen, *args, **kwargs):
"""Perform a load in a generator so it is delayed."""
@staticmethod
def _call_scene_func(gen, func_name, create_new_scene, *args, **kwargs):
    """Run the Scene method named ``func_name`` on every scene in ``gen``.

    Yields the method's result when ``create_new_scene`` is true (e.g.
    resample-style operations), otherwise yields the original scene
    (e.g. in-place operations like load).
    """
    for scene in gen:
        result = getattr(scene, func_name)(*args, **kwargs)
        yield result if create_new_scene else scene

def _generate_scene_func(self, gen, func_name, create_new_scene, *args, **kwargs):
    """Run a Scene method on each contained Scene.

    When ``create_new_scene`` is true, returns a new MultiScene wrapping
    the results; otherwise rebinds this MultiScene's scene generator to
    the processed scenes and returns ``None``.
    """
    result_gen = self._call_scene_func(gen, func_name, create_new_scene, *args, **kwargs)
    if not self.is_generator:
        # materialize now when the original input was a concrete sequence
        result_gen = list(result_gen)
    if create_new_scene:
        return self.__class__(result_gen)
    self._scene_gen = _SceneGenerator(result_gen)
    self._scenes = iter(self._scene_gen)

def load(self, *args, **kwargs):
    """Load the required datasets from the multiple scenes.

    Delegates to ``_generate_scene_func`` which applies ``Scene.load``
    to each scene in place (no new MultiScene is created). The previous
    body shown here called ``self._gen_load``, a helper that no longer
    exists (replaced by ``_call_scene_func``), and included the removed
    ``_gen_resample`` body — both were stale diff residue.
    """
    self._generate_scene_func(self._scenes, 'load', False, *args, **kwargs)

def resample(self, destination=None, **kwargs):
    """Resample the multiscene.

    Returns a new MultiScene of resampled scenes. The duplicate body
    calling the removed ``_gen_resample`` helper (stale diff residue,
    which also returned before the real implementation could run) has
    been dropped.
    """
    return self._generate_scene_func(self._scenes, 'resample', True, destination=destination, **kwargs)

def blend(self, blend_function=stack):
"""Blend the datasets into one scene.
Expand Down Expand Up @@ -320,21 +377,19 @@ def save_animation(self, filename, datasets=None, fps=10, fill_value=None,
if imageio is None:
raise ImportError("Missing required 'imageio' library")

scenes = iter(self._scenes)
first_scene = next(scenes)
scene_gen = self._scene_gen
first_scene = self.first_scene
scenes = iter(self._scene_gen)
info_scenes = [first_scene]
if 'end_time' in filename:
# if we need the last scene to generate the filename
# then compute all the scenes so we can figure it out
log.debug("Generating scenes to compute end_time for filename")
scenes = list(scenes)
info_scenes.append(scenes[-1])
scene_gen = _SceneGenerator(chain([first_scene], scenes))

if not self.is_generator:
available_ds = self.loaded_dataset_ids
else:
available_ds = list(first_scene.keys())
available_ds = [first_scene.datasets.get(ds) for ds in first_scene.wishlist]
available_ds = [ds for ds in available_ds if ds is not None]
dataset_ids = datasets or available_ds

writers = []
Expand Down
107 changes: 101 additions & 6 deletions satpy/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import numbers
import os
import warnings
from datetime import datetime, timedelta

import six
import yaml
Expand Down Expand Up @@ -407,6 +408,104 @@ def __delitem__(self, key):
return super(DatasetDict, self).__delitem__(key)


def group_files(files_to_sort, reader=None, time_threshold=10,
                group_keys=None, ppp_config_dir=None, reader_kwargs=None):
    """Group series of files by file pattern information.

    By default this will group files by their filename ``start_time``
    assuming it exists in the pattern. By passing the individual
    dictionaries returned by this function to the Scene classes'
    ``filenames``, a series of `Scene` objects can be easily created.

    .. versionadded:: 0.12

    Args:
        files_to_sort (iterable): File paths to sort in to groups.
        reader (str): Reader whose file patterns should be used to sort
            files. Currently this keyword argument is required; a future
            version may discover the best reader for each file
            automatically.
        time_threshold (int): Number of seconds used to consider time elements
            in a group as being equal. For example, if the 'start_time' item
            is used to group files then any time within `time_threshold`
            seconds of the first file's 'start_time' will be seen as occurring
            at the same time.
        group_keys (list or tuple): File pattern information to use to group
            files. Keys are sorted in order and only the first key is used when
            comparing datetime elements with `time_threshold` (see above). This
            means it is recommended that datetime values should only come from
            the first key in ``group_keys``. Otherwise, there is a good chance
            that files will not be grouped properly (datetimes being barely
            unequal). Defaults to a reader's ``group_keys`` configuration (set
            in YAML), otherwise ``('start_time',)``.
        ppp_config_dir (str): Root user configuration directory for SatPy.
            This will be deprecated in the future, but is here for consistency
            with other SatPy features.
        reader_kwargs (dict): Additional keyword arguments to pass to reader
            creation.

    Returns:
        List of dictionaries mapping 'reader' to a list of filenames.
        Each of these dictionaries can be passed as ``filenames`` to
        a `Scene` object.

    """
    # FUTURE: Find the best reader for each filename using `find_files_and_readers`
    if reader is None:
        raise ValueError("'reader' keyword argument is required.")
    elif not isinstance(reader, (list, tuple)):
        reader = [reader]

    # FUTURE: Handle multiple readers
    reader = reader[0]
    reader_configs = list(configs_for_reader(reader, ppp_config_dir))[0]
    reader_kwargs = reader_kwargs or {}
    try:
        reader_instance = load_reader(reader_configs, **reader_kwargs)
    except (KeyError, IOError, yaml.YAMLError) as err:
        # log context for the failure, then let the caller see the error
        LOG.info('Cannot use %s', str(reader_configs))
        LOG.debug(str(err))
        raise

    if group_keys is None:
        group_keys = reader_instance.info.get('group_keys', ('start_time',))
    # Collect (group_key, filename) pairs for every file that matches one
    # of the reader's file patterns; files matching no pattern are dropped.
    file_keys = []
    for _, filetype_info in reader_instance.sorted_filetype_items():
        for f, file_info in reader_instance.filename_items_for_filetype(files_to_sort, filetype_info):
            group_key = tuple(file_info.get(k) for k in group_keys)
            file_keys.append((group_key, f))

    prev_key = None
    threshold = timedelta(seconds=time_threshold)
    file_groups = {}
    for gk, f in sorted(file_keys):
        # use first element of key as time identifier (if datetime type)
        if prev_key is None:
            is_new_group = True
            prev_key = gk
        elif isinstance(gk[0], datetime):
            # datetimes within threshold difference are "the same time"
            is_new_group = (gk[0] - prev_key[0]) > threshold
        else:
            is_new_group = gk[0] != prev_key[0]

        # compare the remaining key elements; values missing from either
        # key (None) never split a group. This generator is not evaluated
        # until the `any()` below, when `prev_key` is known to be set.
        vals_not_equal = (this_val != prev_val
                          for this_val, prev_val in zip(gk[1:], prev_key[1:])
                          if this_val is not None and prev_val is not None)
        if is_new_group or any(vals_not_equal):
            file_groups[gk] = [f]
            prev_key = gk
        else:
            file_groups[prev_key].append(f)
    # each dict in the result is passable to Scene as 'filenames'
    return [{reader: file_groups[group_key]} for group_key in sorted(file_groups)]


def read_reader_config(config_files, loader=yaml.Loader):
"""Read the reader `config_files` and return the info extracted."""

Expand All @@ -427,13 +526,9 @@ def read_reader_config(config_files, loader=yaml.Loader):


def load_reader(reader_configs, **reader_kwargs):
    """Import and set up the reader from *reader_configs*.

    The rendered diff duplicated both the docstring and the reader
    construction (which would have instantiated the reader twice and run
    its side effects twice); the duplicates have been collapsed to the
    single final form.
    """
    reader_info = read_reader_config(reader_configs)
    reader_instance = reader_info['reader'](config_files=reader_configs, **reader_kwargs)
    return reader_instance


Expand Down
17 changes: 17 additions & 0 deletions satpy/tests/test_multiscene.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,23 @@ def test_properties(self):
self.assertSetEqual(mscn.shared_dataset_ids, {ds1_id, ds2_id})
self.assertFalse(mscn.all_same_area)

def test_from_files(self):
    """Test creating a multiscene from multiple files."""
    from satpy import MultiScene
    input_files = [
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171502203_e20171171504576_c20171171505018.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171507203_e20171171509576_c20171171510018.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171512203_e20171171514576_c20171171515017.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171517203_e20171171519577_c20171171520019.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171522203_e20171171524576_c20171171525020.nc",
        "OR_ABI-L1b-RadC-M3C01_G16_s20171171527203_e20171171529576_c20171171530017.nc",
    ]
    with mock.patch('satpy.multiscene.Scene') as scn_mock:
        mscn = MultiScene.from_files(input_files, reader='abi_l1b')
        # assertTrue(x, msg) treats the second argument as a failure
        # message and passes for any truthy x; assertEqual actually
        # verifies that one Scene was created per input file.
        self.assertEqual(len(mscn.scenes), 6)
        calls = [mock.call(filenames={'abi_l1b': [in_file]}) for in_file in input_files]
        scn_mock.assert_has_calls(calls)


class TestMultiSceneSave(unittest.TestCase):
"""Test saving a MultiScene to various formats."""
Expand Down
Loading