Refactor available datasets logic to be more flexible #739

Merged 13 commits on May 6, 2019. The diff below shows changes from 1 commit.
satpy/readers/clavrx.py (31 additions & 2 deletions)

@@ -102,19 +102,48 @@ def start_time(self):
     def end_time(self):
         return self.filename_info.get('end_time', self.start_time)
 
-    def available_datasets(self):
+    def available_datasets(self, configured_datasets=None):
         """Automatically determine datasets provided by this file"""
         sensor = self.get_sensor(self['/attr/sensor'])
         nadir_resolution = self.get_nadir_resolution(sensor)
         coordinates = ('longitude', 'latitude')
+        handled_variables = set()
+
+        # update previously configured datasets
+        for is_avail, ds_info in (configured_datasets or []):
+            this_res = ds_info.get('resolution')
+            this_coords = ds_info.get('coordinates')
+            # some other file handler knows how to load this
+            if is_avail is not None:
+                yield is_avail, ds_info
+
+            var_name = ds_info.get('file_key', ds_info['name'])
+            matches = self.file_type_matches(ds_info['file_type'])
+            # we can confidently say that we can provide this dataset and can
+            # provide more info
+            if matches and var_name in self and this_res != nadir_resolution:
+                handled_variables.add(var_name)
+                new_info = ds_info.copy()  # don't mess up the above yielded
+                new_info['resolution'] = nadir_resolution
+                if self._is_polar() and this_coords is None:
+                    new_info['coordinates'] = coordinates
+                yield True, new_info
+            elif is_avail is None:
+                # if we didn't know how to handle this dataset and no one else did
+                # then we should keep it going down the chain
+                yield is_avail, ds_info

         # add new datasets
         for var_name, val in self.file_content.items():
+            if var_name in handled_variables:
+                continue
             if isinstance(val, SDS):
                 ds_info = {
                     'file_type': self.filetype_info['file_type'],
                     'resolution': nadir_resolution,
                     'name': var_name,
                 }
                 if self._is_polar():
                     ds_info['coordinates'] = ['longitude', 'latitude']
-                yield DatasetID(name=var_name, resolution=nadir_resolution), ds_info
+                yield True, ds_info

     def get_shape(self, dataset_id, ds_info):
         var_name = ds_info.get('file_key', dataset_id.name)
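
Note: the update-then-add generator pattern above (mirrored in geocat.py below) is easier to follow outside the diff. A minimal, self-contained sketch of the same algorithm, using hypothetical names rather than real satpy code:

def available_datasets(configured_datasets=None, file_type='clavrx',
                       file_vars=('cloud_height',), nadir_resolution=2000):
    """Toy version: update configured datasets, then add new ones."""
    handled = set()
    for is_avail, ds_info in (configured_datasets or []):
        if is_avail is not None:
            # another handler already claimed it; pass it through unchanged
            yield is_avail, ds_info
        elif ds_info['file_type'] == file_type and ds_info['name'] in file_vars:
            # this handler can load it: claim it and fill in the details
            handled.add(ds_info['name'])
            yield True, dict(ds_info, resolution=nadir_resolution)
        else:
            # unknown to us and to everyone before us: pass it down the chain
            yield None, ds_info
    for name in file_vars:
        if name not in handled:
            # a brand new dataset discovered in the file itself
            yield True, {'name': name, 'file_type': file_type,
                         'resolution': nadir_resolution}

configured = [(None, {'name': 'cloud_height', 'file_type': 'clavrx'})]
print(list(available_datasets(configured)))
# -> [(True, {'name': 'cloud_height', 'file_type': 'clavrx', 'resolution': 2000})]
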
satpy/readers/file_handlers.py (62 additions & 8 deletions)

@@ -122,15 +122,69 @@ def sensor_names(self):
"""List of sensors represented in this file."""
raise NotImplementedError

-    def available_datasets(self):
-        """Get information of available datasets in file.
+    def file_type_matches(self, ds_ftype):
+        """Check if this file handler's type can handle this dataset's file type.
 
-        This is used for dynamically specifying what datasets are available
-        from a file instead of those listed in a YAML configuration file.
+        Args:
+            ds_ftype (str or list): File type or list of file types that a
+                dataset is configured to be loaded from.
 
-        Returns: Iterator of (DatasetID, dict) pairs where dict is the
-            dataset's metadata, similar to that specified in the YAML
-            configuration files.
+        Returns: ``True`` if this file handler object's type matches the
+            dataset's file type(s), ``None`` otherwise.
 
         """
-        raise NotImplementedError
+        if isinstance(ds_ftype, str) and ds_ftype == self.filetype_info['file_type']:
+            return True
+        elif self.filetype_info['file_type'] in ds_ftype:
+            return True
+        return None

+    def available_datasets(self, configured_datasets=None):
+        """Get information of available datasets in this file.
+
+        This is used for dynamically specifying what datasets are available
+        from a file in addition to what's configured in a YAML configuration
+        file. Note that this method will only be called once for each
+        "file type": on the first file handler of that type.
+
+        This method should **not** update values of the dataset information
+        dictionary **unless** this file handler has a matching file type
+        (the data could be loaded from this object in the future) and at least
+        **one** :class:`satpy.dataset.DatasetID` key is also modified.
+        Otherwise, this file type may override the information provided by
+        a more preferred file type (as specified in the YAML file).
+        It is recommended that any non-ID metadata be updated during the
+        :meth:`BaseFileHandler.get_dataset` part of loading.
+        This method is not guaranteed to be called before any other file
+        type's handler.
+        The availability "boolean" not being ``None`` does not mean that a
+        file handler called later can't provide an additional dataset, but
+        it must provide more identifying (DatasetID) information to do so
+        and should yield its new dataset in addition to the previous one.
+
+        Args:
+            configured_datasets (list): Series of (bool or None, dict) in the
+                same way as is returned by this method (see below). The bool
+                is whether or not the dataset is available from at least one
+                of the current file handlers. It can also be ``None`` if
+                no file handler before us knows how to handle it.
+                The dictionary is existing dataset metadata. The dictionaries
+                are typically provided from a YAML configuration file and may
+                be modified, updated, or used as a "template" for additional
+                available datasets. This argument could be the result of a
+                previous file handler's implementation of this method.
+
+        Returns: Iterator of (bool or None, dict) pairs where dict is the
+            dataset's metadata. If the dataset is available in the current
+            file type then the boolean value should be ``True``, ``False``
+            if we **know** about the dataset but it is unavailable, or
+            ``None`` if this file object is not responsible for it.
+
+        """
+        for is_avail, ds_info in (configured_datasets or []):
+            if is_avail is not None:
+                # some other file handler said it has this dataset
+                # we don't know any more information than the previous
+                # file handler so let's yield early
+                yield is_avail, ds_info
+                continue
+            yield self.file_type_matches(ds_info['file_type']), ds_info
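
Note: a minimal sketch of how a handler can follow this protocol, re-implementing the two methods above on a standalone class (names like MyHandler and my_discovered_var are hypothetical):

class MyHandler:
    """Hypothetical handler following the protocol documented above."""

    def __init__(self, filetype_info):
        self.filetype_info = filetype_info

    def file_type_matches(self, ds_ftype):
        # same logic as BaseFileHandler.file_type_matches above
        if isinstance(ds_ftype, str) and ds_ftype == self.filetype_info['file_type']:
            return True
        elif self.filetype_info['file_type'] in ds_ftype:
            return True
        return None

    def available_datasets(self, configured_datasets=None):
        for is_avail, ds_info in (configured_datasets or []):
            if is_avail is not None:
                # someone earlier in the chain claimed it; pass it through
                yield is_avail, ds_info
                continue
            yield self.file_type_matches(ds_info['file_type']), ds_info
        # then advertise a dataset discovered by inspecting the file itself
        yield True, {'name': 'my_discovered_var',
                     'file_type': self.filetype_info['file_type']}

fh = MyHandler({'file_type': 'my_ftype'})
assert fh.file_type_matches('my_ftype') is True          # exact string match
assert fh.file_type_matches(['my_ftype', 'x']) is True   # list membership
assert fh.file_type_matches('other') is None             # falsy: no match
configured = [(None, {'name': 'ch01', 'file_type': 'my_ftype'})]
print(list(fh.available_datasets(configured)))
# -> [(True, {'name': 'ch01', 'file_type': 'my_ftype'}),
#     (True, {'name': 'my_discovered_var', 'file_type': 'my_ftype'})]
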
satpy/readers/geocat.py (45 additions & 4 deletions)

@@ -129,19 +129,60 @@ def _calc_area_resolution(self, ds_res):
         return self.resolutions.get(sensor, {}).get(int(elem_res),
                                                     elem_res * 1000.)

-    def available_datasets(self):
-        """Automatically determine datasets provided by this file"""
+    def available_datasets(self, configured_datasets=None):
+        """Update information for or add datasets provided by this file.
+
+        If this file handler can load a dataset then it will supplement the
+        dataset info with the resolution and possibly coordinate datasets
+        needed to load it. Otherwise it will continue passing the dataset
+        information down the chain.
+
+        See
+        :meth:`satpy.readers.file_handlers.BaseFileHandler.available_datasets`
+        for details.
+
+        """
         res = self.resolution
-        coordinates = ['pixel_longitude', 'pixel_latitude']
+        coordinates = ('pixel_longitude', 'pixel_latitude')
+        handled_variables = set()

+        # update previously configured datasets
+        for is_avail, ds_info in (configured_datasets or []):
+            this_res = ds_info.get('resolution')
+            this_coords = ds_info.get('coordinates')
+            # some other file handler knows how to load this
+            if is_avail is not None:
+                yield is_avail, ds_info
+
+            var_name = ds_info.get('file_key', ds_info['name'])
+            matches = self.file_type_matches(ds_info['file_type'])
+            # we can confidently say that we can provide this dataset and can
+            # provide more info
+            if matches and var_name in self and this_res != res:
+                handled_variables.add(var_name)
+                new_info = ds_info.copy()  # don't mess up the above yielded
+                new_info['resolution'] = res
+                if not self.is_geo and this_coords is None:
+                    new_info['coordinates'] = coordinates
+                yield True, new_info
+            elif is_avail is None:
+                # if we didn't know how to handle this dataset and no one else did
+                # then we should keep it going down the chain
+                yield is_avail, ds_info

         # Provide new datasets
         for var_name, val in self.file_content.items():
+            if var_name in handled_variables:
+                continue
             if isinstance(val, netCDF4.Variable):
                 ds_info = {
                     'file_type': self.filetype_info['file_type'],
                     'resolution': res,
                     'name': var_name,
                 }
                 if not self.is_geo:
                     ds_info['coordinates'] = coordinates
-                yield DatasetID(name=var_name, resolution=res), ds_info
+                yield True, ds_info

     def get_shape(self, dataset_id, ds_info):
         var_name = ds_info.get('file_key', dataset_id.name)
satpy/readers/grib.py (8 additions & 2 deletions)

@@ -123,9 +123,15 @@ def end_time(self):
"""
return self._end_time

-    def available_datasets(self):
+    def available_datasets(self, configured_datasets=None):
         """Automatically determine datasets provided by this file"""
-        return self._msg_datasets.items()
+        # previously configured or provided datasets
+        # we can't provide any additional information
+        for is_avail, ds_info in (configured_datasets or []):
+            yield is_avail, ds_info
+        # new datasets
+        for ds_info in self._msg_datasets.values():
+            yield True, ds_info

     def _get_message(self, ds_info):
         with pygrib.open(self.filename) as grib_file:
satpy/readers/yaml_reader.py (71 additions & 24 deletions)

@@ -262,6 +262,7 @@ def __init__(self,
         super(FileYAMLReader, self).__init__(config_files)
 
         self.file_handlers = {}
+        self.available_ids = {}
         self.filter_filenames = self.info.get('filter_filenames', filter_filenames)
         self.filter_parameters = filter_parameters or {}
         if kwargs:
@@ -287,12 +288,7 @@ def sensor_names(self):

     @property
     def available_dataset_ids(self):
-        for ds_id in self.all_dataset_ids:
-            fts = self.ids[ds_id]["file_type"]
-            if isinstance(fts, str) and fts in self.file_handlers:
-                yield ds_id
-            elif any(ft in self.file_handlers for ft in fts):
-                yield ds_id
+        return self.available_ids.keys()

     @property
     def start_time(self):

@@ -545,30 +541,66 @@ def update_ds_ids_from_file_handlers(self):
                     continue
                 if ds_id.resolution is not None:
                     continue
+                # TODO: Update the file handlers using this functionality to use available_datasets
                 ds_info['resolution'] = res
                 new_id = DatasetID.from_dict(ds_info)
                 self.ids[new_id] = ds_info
                 del self.ids[ds_id]
+    def _file_handlers_available_datasets(self):
+        """Generate a series of available dataset information.
+
+        This is done by chaining the file handlers'
+        :meth:`satpy.readers.file_handlers.BaseFileHandler.available_datasets`
+        methods together. See that method's documentation for more
+        information.
+
+        Returns:
+            Generator of (bool, dict) where the boolean tells whether the
+            current dataset is available from any of the file handlers.
+            The boolean can also be ``None`` in the case where no loaded
+            file handler is configured to load the dataset. The dictionary
+            is the metadata provided either by the YAML configuration files
+            or by the file handler itself if it is a new dataset. The file
+            handler may have also supplemented or modified the information.
+
+        """
+        first_fhs = (fhs[0] for fhs in self.file_handlers.values())
+        configured_datasets = ((None, ds_info) for ds_info in self.ids.values())
+        for fh in first_fhs:
+            # chain the 'available_datasets' methods together by calling the
+            # current file handler's method with the previous one's result
+            configured_datasets = fh.available_datasets(configured_datasets=configured_datasets)
+        return configured_datasets
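
Note: the chaining above amounts to repeatedly wrapping one lazy generator in another. A self-contained sketch with two hypothetical handlers:

def handler_a(configured_datasets=None):
    # claims 'a_var', passes everything else through untouched
    for is_avail, ds_info in (configured_datasets or []):
        if is_avail is None and ds_info['name'] == 'a_var':
            yield True, ds_info
        else:
            yield is_avail, ds_info

def handler_b(configured_datasets=None):
    # knows nothing new; a pure pass-through
    for is_avail, ds_info in (configured_datasets or []):
        yield is_avail, ds_info

configured = ((None, {'name': 'a_var'}), (None, {'name': 'other'}))
for handler in (handler_a, handler_b):
    configured = handler(configured_datasets=configured)
print(list(configured))
# -> [(True, {'name': 'a_var'}), (None, {'name': 'other'})]
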

     def add_ds_ids_from_files(self):
-        """Check files for more dynamically discovered datasets."""
-        for file_handlers in self.file_handlers.values():
-            try:
-                fh = file_handlers[0]
-                avail_ids = fh.available_datasets()
-            except NotImplementedError:
-                continue
-
-            # dynamically discover other available datasets
-            for ds_id, ds_info in avail_ids:
-                # don't overwrite an existing dataset
-                # especially from the yaml config
-                coordinates = ds_info.get('coordinates')
-                if isinstance(coordinates, list):
-                    # xarray doesn't like concatenating attributes that are
-                    # lists: https://github.com/pydata/xarray/issues/2060
-                    ds_info['coordinates'] = tuple(ds_info['coordinates'])
-                self.ids.setdefault(ds_id, ds_info)
+        """Add or modify available dataset information.
+
+        Each file handler is consulted on whether or not it can load the
+        dataset with the provided information dictionary.
+        See
+        :meth:`satpy.readers.file_handlers.BaseFileHandler.available_datasets`
+        for more information.
+
+        """
+        avail_datasets = self._file_handlers_available_datasets()
+        for is_avail, ds_info in avail_datasets:
+            coordinates = ds_info.get('coordinates')
+            if isinstance(coordinates, list):
+                # xarray doesn't like concatenating attributes that are
+                # lists: https://github.com/pydata/xarray/issues/2060
+                ds_info['coordinates'] = tuple(ds_info['coordinates'])
+
+            ds_info.setdefault('modifiers', tuple())  # default to no mods
+            ds_id = DatasetID.from_dict(ds_info)
+            # all datasets
+            self.ids[ds_id] = ds_info
+            # available datasets
+            # False == we have the file type but it doesn't have this dataset
+            # None == we don't have the file type object to ask
+            if is_avail:
+                self.available_ids[ds_id] = ds_info
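
Note: a toy illustration of how the availability flag splits datasets between ids and available_ids (plain strings stand in for DatasetID keys):

results = [
    (True,  {'name': 'chan_1'}),  # loadable from the current files
    (False, {'name': 'chan_2'}),  # file type present, dataset missing
    (None,  {'name': 'chan_3'}),  # no handler for its file type was loaded
]
ids, available_ids = {}, {}
for is_avail, ds_info in results:
    ids[ds_info['name']] = ds_info
    if is_avail:  # only True lands in available_ids
        available_ids[ds_info['name']] = ds_info
print(sorted(ids), sorted(available_ids))
# -> ['chan_1', 'chan_2', 'chan_3'] ['chan_1']
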

     @staticmethod
     def _load_dataset(dsid, ds_info, file_handlers, dim='y'):

@@ -767,10 +799,25 @@ def _load_ancillary_variables(self, datasets):
                 new_vars.append(av_id)
             dataset.attrs['ancillary_variables'] = new_vars

+    def get_dataset_key(self, key, prefer_available=False, **kwargs):
+        """Get the fully qualified `DatasetID` matching `key`.
+
+        See `satpy.readers.get_key` for more information about kwargs.
+
+        """
+        if prefer_available:
+            try:
+                return get_key(key, self.available_ids.keys(), **kwargs)
+            except KeyError:
+                return get_key(key, self.ids.keys(), **kwargs)
+        # FIXME: Only do the try/except above
+        return get_key(key, self.ids.keys(), **kwargs)
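
Note: the prefer-available fallback in miniature, with a simplified stand-in for satpy.readers.get_key (the real one does fuzzy DatasetID matching):

def get_key(key, id_set):
    # stand-in: raise KeyError when no match, like the real get_key
    if key not in id_set:
        raise KeyError(key)
    return key

available_ids = {'ch02'}
all_ids = {'ch01', 'ch02'}

def get_dataset_key(key, prefer_available=False):
    if prefer_available:
        try:
            return get_key(key, available_ids)
        except KeyError:
            return get_key(key, all_ids)
    return get_key(key, all_ids)

print(get_dataset_key('ch01', prefer_available=True))  # 'ch01' via the fallback
print(get_dataset_key('ch02', prefer_available=True))  # 'ch02' found directly
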

     def load(self, dataset_keys, previous_datasets=None):
         """Load `dataset_keys`.
 
-        If `previous_datasets` is provided, do not reload those."""
+        If `previous_datasets` is provided, do not reload those.
+        """
         all_datasets = previous_datasets or DatasetDict()
         datasets = DatasetDict()
satpy/tests/test_yaml_reader.py (5 additions & 2 deletions)

@@ -196,6 +196,7 @@ def setUp(self, _, rec_up):  # pylint: disable=arguments-differ
         res_dict = {'reader': {'name': 'fake',
                                'sensors': ['canon']},
                     'file_types': {'ftype1': {'name': 'ft1',
+                                              'file_reader': BaseFileHandler,
                                               'file_patterns': patterns}},
                     'datasets': {'ch1': {'name': 'ch01',
                                          'wavelength': [0.5, 0.6, 0.7],

@@ -255,7 +256,8 @@ def test_all_dataset_names(self):

     def test_available_dataset_ids(self):
         """Get ids of the available datasets."""
-        self.reader.file_handlers = ['ftype1']
+        loadables = self.reader.select_files_from_pathnames(['a001.bla'])
+        self.reader.create_filehandlers(loadables)
         self.assertSetEqual(set(self.reader.available_dataset_ids),
                             {DatasetID(name='ch02',
                                        wavelength=(0.7, 0.75, 0.8),

@@ -272,7 +274,8 @@ def test_available_dataset_names(self):

     def test_available_dataset_names(self):
         """Get names of the available datasets."""
-        self.reader.file_handlers = ['ftype1']
+        loadables = self.reader.select_files_from_pathnames(['a001.bla'])
+        self.reader.create_filehandlers(loadables)
         self.assertSetEqual(set(self.reader.available_dataset_names),
                             set(["ch01", "ch02"]))