Skip to content

Commit

Permalink
Merge 78b387c into dba39cf
Browse files Browse the repository at this point in the history
  • Loading branch information
djhoese committed Sep 27, 2018
2 parents dba39cf + 78b387c commit 79a0053
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 48 deletions.
104 changes: 71 additions & 33 deletions trollsift/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,26 @@ def convert_field(self, value, conversion):
# the 'g' type reuses the regular expression registered for 'f'
spec_regexes['g'] = spec_regexes['f']
# the 'X' type reuses the regular expression registered for 'x'
spec_regexes['X'] = spec_regexes['x']
# format types whose per-character regex may be repeated (for example it is
# suffixed with `*` when the format spec gives no width)
allow_multiple = ['c', 'd', 'o', 's', 'x', 'X']
# format_spec ::= [[fill]align][sign][#][0][width][,][.precision][type]
# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
# NOTE: the precision separator must be an escaped literal dot; an unescaped
# `.` would accept any character followed by digits as a precision.
fmt_spec_regex = re.compile(
    r'(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?'
    r'(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%])')


class RegexFormatter(string.Formatter):
"""String formatter that converts a format string to a regular expression.
>>> regex_formatter = RegexFormatter()
>>> regex_str = regex_formatter.format('{field_one:5d}_{field_two}')
Can also be used to extract values from a string given the format spec
for that string:
>>> regex_formatter.extract_values('{field_one:5d}_{field_two}', '12345_sometext')
{'field_one': '12345', 'field_two': 'sometext'}
"""

# special string to mark a parameter not being specified
UNPROVIDED_VALUE = '<trollsift unprovided value>'
Expand Down Expand Up @@ -200,36 +217,61 @@ def _regex_datetime(self, format_spec):
replace_str = replace_str.replace(fmt_key, regex)
return replace_str

def regex_field(self, value, format_spec):
@staticmethod
def format_spec_to_regex(field_name, format_spec):
    """Make an attempt at converting a format spec to a regular expression.

    Args:
        field_name: Name used for the regex group wrapping the result.
        format_spec: Format specification of a replacement field, e.g.
            ``'5d'`` or ``'_>5d'``.

    Returns:
        A named-group regular expression string of the form
        ``(?P<field_name>...)``.

    Raises:
        ValueError: If ``format_spec`` is not matched by ``fmt_spec_regex``
            or if it combines ``=`` alignment with the ``s`` type.
    """
    # NOTE: remove escaped backslashes so regex matches
    regex_match = fmt_spec_regex.match(format_spec.replace('\\', ''))
    if regex_match is None:
        raise ValueError("Invalid format specification: '{}'".format(format_spec))
    regex_dict = regex_match.groupdict()
    fill = regex_dict['fill']
    ftype = regex_dict['type']
    width = regex_dict['width']
    align = regex_dict['align']
    # NOTE: does not properly handle `=` alignment
    if fill is None:
        if width is not None and width[0] == '0':
            fill = '0'
        elif ftype in ['s', 'd']:
            # these types get an implicit space fill when none is given
            fill = ' '

    char_type = spec_regexes[ftype]
    if ftype == 's' and align and align.endswith('='):
        raise ValueError("Invalid format specification: '{}'".format(format_spec))

    has_width = bool(width) and width != '0'
    if has_width and fill:
        # we don't know how many fill characters we have compared to
        # field characters so just match all characters and sort it out
        # later during type conversion.
        final_regex = r'.{{{}}}'.format(int(width))
    elif has_width:
        # we know we have exactly this many characters
        final_regex = char_type + r'{{{}}}'.format(int(width))
    elif ftype in allow_multiple:
        # no usable width: the count of characters is unknown
        final_regex = char_type + r'*'
    else:
        final_regex = char_type

    return r'(?P<{}>{})'.format(field_name, final_regex)

def regex_field(self, field_name, value, format_spec):
# Build the regular expression text for a single replacement field.
# NOTE(review): this span looks like a rendered diff in which pre-commit and
# post-commit lines are interleaved (note the consecutive `return` statements
# below) -- confirm which lines belong to the final version before reuse.
# A value that was actually provided is formatted by the parent formatter.
if value != self.UNPROVIDED_VALUE:
return super(RegexFormatter, self).format_field(value, format_spec)

# Replace format spec with glob patterns (*, ?, etc)
if not format_spec:
return r'.*'
# no format spec: match anything, captured in a group named after the field
return r'(?P<{}>.*)'.format(field_name)
# a `%` in the format spec is delegated to _regex_datetime
if '%' in format_spec:
return self._regex_datetime(format_spec)
char_type = spec_regexes[format_spec[-1]]
num_match = re.search('[0-9]+', format_spec)
num = 0 if num_match is None else int(num_match.group(0))
has_multiple = format_spec[-1] in allow_multiple
if num == 0 and has_multiple:
# don't know the count
return r'{}*'.format(char_type)
elif num == 0:
# floats and other types can't have multiple
return char_type
elif format_spec[-1] in allow_multiple:
return r'{}{{{:d}}}'.format(char_type, num)
else:
return r'{}'.format(char_type)
# same delegation, wrapped in a group named after the field
return r'(?P<{}>{})'.format(field_name, self._regex_datetime(format_spec))
# every other format spec is converted by format_spec_to_regex
return self.format_spec_to_regex(field_name, format_spec)

def format_field(self, value, format_spec):
# Anything that is not a (field_name, UNPROVIDED_VALUE) tuple is formatted
# by the parent formatter unchanged.
if not isinstance(value, tuple) or value[1] != self.UNPROVIDED_VALUE:
return super(RegexFormatter, self).format_field(value, format_spec)
field_name, value = value
# NOTE(review): the two statements below call regex_field without the field
# name and wrap the result themselves, while the final `return` passes the
# field name through -- these look like the old and new variants of the same
# step shown together by the diff; confirm before reuse.
new_value = self.regex_field(value, format_spec)
return '(?P<{}>{})'.format(field_name, new_value)
return self.regex_field(field_name, value, format_spec)

def extract_values(self, fmt, stri):
regex = self.format(fmt)
Expand Down Expand Up @@ -261,20 +303,16 @@ def _convert(convdef, stri):
if '%' in convdef:
result = dt.datetime.strptime(stri, convdef)
elif 'd' in convdef or 's' in convdef:
try:
align = convdef[0]
if align in [">", "<", "^"]:
pad = " "
else:
align = convdef[1]
if align in [">", "<", "^"]:
pad = convdef[0]
else:
align = None
pad = None
except IndexError:
align = None
pad = None
regex_match = fmt_spec_regex.match(convdef)
match_dict = regex_match.groupdict() if regex_match else {}
align = match_dict.get('align')
pad = match_dict.get('fill')
if align:
# align character is the last one
align = align[-1]
if align and align in '<>^' and not pad:
pad = ' '

if align == '>':
stri = stri.lstrip(pad)
elif align == '<':
Expand Down
45 changes: 30 additions & 15 deletions trollsift/tests/unittests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,36 @@ def test_parse_align(self):
'segment': '000007',
'start_time': dt.datetime(2015, 6, 5, 17, 0)})

def test_parse_digits(self):
    """Check digit fields that are padded out to the width of the format spec."""
    # keys every expected result has in common; only 'orbit' varies per case
    shared = {'platform': 'noaa', 'platnum': '19',
              'time': dt.datetime(2014, 2, 12, 14, 12),
              'ext': '.l1b'}
    # (format string, string to parse, expected orbit number)
    cases = (
        ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
         "hrpt_noaa19_20140212_1412_02345.l1b",
         2345),
        ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:5d}{ext}",
         "hrpt_noaa19_20140212_1412_ 2345.l1b",
         2345),
        ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:_>5d}{ext}",
         "hrpt_noaa19_20140212_1412___345.l1b",
         345),
    )
    for pattern, filename, orbit in cases:
        parsed = parse(pattern, filename)
        self.assertDictEqual(parsed, dict(shared, orbit=orbit))

def test_parse_bad_pattern(self):
"""Test that an invalid format spec (`=` alignment on a string field) raises ValueError."""
self.assertRaises(ValueError, parse,
"hrpt_{platform}{platnum:-=2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
"hrpt_noaa19_20140212_1412_02345.l1b")

def test_globify_simple(self):
# Run
result = globify('{a}_{b}.end', {'a': 'a', 'b': 'b'})
Expand Down Expand Up @@ -261,21 +291,6 @@ def test_compose(self):
self.assertRaises(ValueError, compose, "{a!X}", key_vals)
self.assertEqual(new_str, 'this Is A-Test b_test c test')

def assertDictEqual(self, a, b):
    """Assert that each key of *a* is in *b* with an equal value and that both have the same length."""
    for name, left_value in a.items():
        self.assertTrue(name in b)
        self.assertEqual(left_value, b[name])

    # equal sizes rule out extra keys that exist only in b
    self.assertEqual(len(a), len(b))

def assertItemsEqual(self, a, b):
    """Assert that sequences *a* and *b* hold equal items in the same order.

    Items of *a* that are dicts are compared with ``assertDictEqual``; all
    other items are compared with ``assertEqual``.
    """
    # compare lengths first so a shorter *b* is reported as an assertion
    # failure rather than surfacing as an IndexError while indexing
    self.assertEqual(len(a), len(b))
    for left, right in zip(a, b):
        if isinstance(left, dict):
            self.assertDictEqual(left, right)
        else:
            self.assertEqual(left, right)


def suite():
"""The suite for test_parser
Expand Down

0 comments on commit 79a0053

Please sign in to comment.