Skip to content

Commit

Permalink
Merge pull request #1178 from djhoese/optimize-reader-globify
Browse files Browse the repository at this point in the history
Optimize readers searching for matching filenames
  • Loading branch information
mraspaud committed May 7, 2020
2 parents 2b7fdd4 + 11e3b4c commit 5343a83
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 29 deletions.
2 changes: 2 additions & 0 deletions satpy/readers/__init__.py
Expand Up @@ -443,6 +443,8 @@ def group_files(files_to_sort, reader=None, time_threshold=10,
if group_keys is None:
group_keys = reader_instance.info.get('group_keys', ('start_time',))
file_keys = []
# make a copy because filename_items_for_filetype will modify inplace
files_to_sort = set(files_to_sort)
for _, filetype_info in reader_instance.sorted_filetype_items():
for f, file_info in reader_instance.filename_items_for_filetype(files_to_sort, filetype_info):
group_key = tuple(file_info.get(k) for k in group_keys)
Expand Down
62 changes: 36 additions & 26 deletions satpy/readers/yaml_reader.py
Expand Up @@ -64,7 +64,7 @@ def listify_string(something):
return list()


def get_filebase(path, pattern):
def _get_filebase(path, pattern):
"""Get the end of *path* of same length as *pattern*."""
# convert any `/` on Windows to `\\`
path = os.path.normpath(path)
Expand All @@ -73,13 +73,13 @@ def get_filebase(path, pattern):
return os.path.join(*str(path).split(os.path.sep)[-tail_len:])


def match_filenames(filenames, pattern):
def _match_filenames(filenames, pattern):
"""Get the filenames matching *pattern*."""
matching = []

matching = set()
glob_pat = globify(pattern)
for filename in filenames:
if fnmatch(get_filebase(filename, pattern), globify(pattern)):
matching.append(filename)
if fnmatch(_get_filebase(filename, pattern), glob_pat):
matching.add(filename)

return matching

Expand Down Expand Up @@ -184,20 +184,25 @@ def select_files_from_directory(self, directory=None):
If directory is None or '', look in the current directory.
"""
filenames = []
filenames = set()
if directory is None:
directory = ''
for pattern in self.file_patterns:
matching = glob.iglob(os.path.join(directory, globify(pattern)))
filenames.extend(matching)
# all the glob patterns that we are going to look at
all_globs = {os.path.join(directory, globify(pattern))
for pattern in self.file_patterns}
# get all files matching these patterns
for glob_pat in all_globs:
filenames.update(glob.iglob(glob_pat))
return filenames

def select_files_from_pathnames(self, filenames):
"""Select the files from *filenames* this reader can handle."""
selected_filenames = []
filenames = set(filenames) # make a copy of the inputs

for pattern in self.file_patterns:
matching = match_filenames(filenames, pattern)
matching = _match_filenames(filenames, pattern)
filenames -= matching
for fname in matching:
if fname not in selected_filenames:
selected_filenames.append(fname)
Expand Down Expand Up @@ -394,21 +399,24 @@ def sorted_filetype_items(self):
@staticmethod
def filename_items_for_filetype(filenames, filetype_info):
"""Iterate over the filenames matching *filetype_info*."""
matched_files = []
if not isinstance(filenames, set):
# we perform set operations later on to improve performance
filenames = set(filenames)
for pattern in filetype_info['file_patterns']:
for filename in match_filenames(filenames, pattern):
if filename in matched_files:
continue
matched_files = set()
matches = _match_filenames(filenames, pattern)
for filename in matches:
try:
filename_info = parse(
pattern, get_filebase(filename, pattern))
pattern, _get_filebase(filename, pattern))
except ValueError:
logger.debug("Can't parse %s with %s.", filename, pattern)
continue
matched_files.append(filename)
matched_files.add(filename)
yield filename, filename_info
filenames -= matched_files

def new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=None):
def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=None):
"""Generate new filehandler instances."""
requirements = filetype_info.get('requires')
filetype_cls = filetype_info['file_reader']
Expand Down Expand Up @@ -496,6 +504,9 @@ def filter_fh_by_metadata(self, filehandlers):

def filter_selected_filenames(self, filenames):
"""Filter provided files based on metadata in the filename."""
if not isinstance(filenames, set):
# we perform set operations later on to improve performance
filenames = set(filenames)
for _, filetype_info in self.sorted_filetype_items():
filename_iter = self.filename_items_for_filetype(filenames,
filetype_info)
Expand All @@ -505,17 +516,17 @@ def filter_selected_filenames(self, filenames):
for fn, _ in filename_iter:
yield fn

def new_filehandlers_for_filetype(self, filetype_info, filenames, fh_kwargs=None):
def _new_filehandlers_for_filetype(self, filetype_info, filenames, fh_kwargs=None):
"""Create filehandlers for a given filetype."""
filename_iter = self.filename_items_for_filetype(filenames,
filetype_info)
if self.filter_filenames:
# preliminary filter of filenames based on start/end time
# to reduce the number of files to open
filename_iter = self.filter_filenames_by_info(filename_iter)
filehandler_iter = self.new_filehandler_instances(filetype_info,
filename_iter,
fh_kwargs=fh_kwargs)
filehandler_iter = self._new_filehandler_instances(filetype_info,
filename_iter,
fh_kwargs=fh_kwargs)
filtered_iter = self.filter_fh_by_metadata(filehandler_iter)
return list(filtered_iter)

Expand All @@ -529,11 +540,10 @@ def create_filehandlers(self, filenames, fh_kwargs=None):
created_fhs = {}
# load files that we know about by creating the file handlers
for filetype, filetype_info in self.sorted_filetype_items():
filehandlers = self.new_filehandlers_for_filetype(filetype_info,
filename_set,
fh_kwargs=fh_kwargs)
filehandlers = self._new_filehandlers_for_filetype(filetype_info,
filename_set,
fh_kwargs=fh_kwargs)

filename_set -= set([fhd.filename for fhd in filehandlers])
if filehandlers:
created_fhs[filetype] = filehandlers
self.file_handlers[filetype] = sorted(
Expand Down
11 changes: 11 additions & 0 deletions satpy/tests/test_readers.py
Expand Up @@ -657,6 +657,17 @@ def test_default_behavior(self):
self.assertEqual(6, len(groups))
self.assertEqual(2, len(groups[0]['abi_l1b']))

def test_default_behavior_set(self):
"""Test the default behavior with the 'abi_l1b' reader."""
from satpy.readers import group_files
files = set(self.g16_files)
num_files = len(files)
groups = group_files(files, reader='abi_l1b')
# we didn't modify it
self.assertEqual(len(files), num_files)
self.assertEqual(6, len(groups))
self.assertEqual(2, len(groups[0]['abi_l1b']))

def test_non_datetime_group_key(self):
"""Test what happens when the start_time isn't used for grouping."""
from satpy.readers import group_files
Expand Down
6 changes: 3 additions & 3 deletions satpy/tests/test_yaml_reader.py
Expand Up @@ -72,7 +72,7 @@ def test_get_filebase(self):
pattern = os.path.join(*pattern.split('/'))
filename = os.path.join(base_dir, 'Oa05_radiance.nc')
expected = os.path.join(base_data, 'Oa05_radiance.nc')
self.assertEqual(yr.get_filebase(filename, pattern), expected)
self.assertEqual(yr._get_filebase(filename, pattern), expected)

def test_match_filenames(self):
"""Check that matching filenames works."""
Expand All @@ -91,7 +91,7 @@ def test_match_filenames(self):
filenames = [os.path.join(base_dir, 'Oa05_radiance.nc'),
os.path.join(base_dir, 'geo_coordinates.nc')]
expected = os.path.join(base_dir, 'geo_coordinates.nc')
self.assertEqual(yr.match_filenames(filenames, pattern), [expected])
self.assertEqual(yr._match_filenames(filenames, pattern), {expected})

def test_match_filenames_windows_forward_slash(self):
"""Check that matching filenames works on Windows with forward slashes.
Expand All @@ -114,7 +114,7 @@ def test_match_filenames_windows_forward_slash(self):
filenames = [os.path.join(base_dir, 'Oa05_radiance.nc').replace(os.sep, '/'),
os.path.join(base_dir, 'geo_coordinates.nc').replace(os.sep, '/')]
expected = os.path.join(base_dir, 'geo_coordinates.nc').replace(os.sep, '/')
self.assertEqual(yr.match_filenames(filenames, pattern), [expected])
self.assertEqual(yr._match_filenames(filenames, pattern), {expected})

def test_listify_string(self):
"""Check listify_string."""
Expand Down

0 comments on commit 5343a83

Please sign in to comment.