diff --git a/trollsift/parser.py b/trollsift/parser.py
index 050e676..7981c9c 100644
--- a/trollsift/parser.py
+++ b/trollsift/parser.py
@@ -153,9 +153,26 @@ def convert_field(self, value, conversion):
 spec_regexes['g'] = spec_regexes['f']
 spec_regexes['X'] = spec_regexes['x']
 allow_multiple = ['c', 'd', 'o', 's', 'x', 'X']
+# format_spec ::= [[fill]align][sign][#][0][width][,][.precision][type]
+# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
+fmt_spec_regex = re.compile(
+    r'(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?'
+    r'(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%])')
 
 
 class RegexFormatter(string.Formatter):
+    """String formatter that converts a format string to a regular expression.
+
+    >>> regex_formatter = RegexFormatter()
+    >>> regex_str = regex_formatter.format('{field_one:5d}_{field_two}')
+
+    Can also be used to extract values from a string given the format spec
+    for that string:
+
+    >>> regex_formatter.extract_values('{field_one:5d}_{field_two}', '12345_sometext')
+    {'field_one': '12345', 'field_two': 'sometext'}
+
+    """
 
     # special string to mark a parameter not being specified
     UNPROVIDED_VALUE = ''
@@ -200,36 +217,61 @@ def _regex_datetime(self, format_spec):
             replace_str = replace_str.replace(fmt_key, regex)
         return replace_str
 
-    def regex_field(self, value, format_spec):
+    @staticmethod
+    def format_spec_to_regex(field_name, format_spec):
+        """Make an attempt at converting a format spec to a regular expression."""
+        # NOTE: remove escaped backslashes so regex matches
+        regex_match = fmt_spec_regex.match(format_spec.replace('\\', ''))
+        if regex_match is None:
+            raise ValueError("Invalid format specification: '{}'".format(format_spec))
+        regex_dict = regex_match.groupdict()
+        fill = regex_dict['fill']
+        ftype = regex_dict['type']
+        width = regex_dict['width']
+        align = regex_dict['align']
+        # NOTE: does not properly handle `=` alignment
+        if fill is None:
+            if width is not None and width[0] == '0':
+                fill = '0'
+            elif ftype in ['s', 'd']:
+                fill = ' '
+
+        char_type = spec_regexes[ftype]
+        if ftype == 's' and align and align.endswith('='):
+            raise ValueError("Invalid format specification: '{}'".format(format_spec))
+        final_regex = char_type
+        if ftype in allow_multiple and (not width or width == '0'):
+            final_regex += r'*'
+        elif width and width != '0':
+            if not fill:
+                # we know we have exactly this many characters
+                final_regex += r'{{{}}}'.format(int(width))
+            elif fill:
+                # we don't know how many fill characters we have compared to
+                # field characters so just match all characters and sort it out
+                # later during type conversion.
+                final_regex = r'.{{{}}}'.format(int(width))
+            elif ftype in allow_multiple:
+                final_regex += r'*'
+
+        return r'(?P<{}>{})'.format(field_name, final_regex)
+
+    def regex_field(self, field_name, value, format_spec):
         if value != self.UNPROVIDED_VALUE:
             return super(RegexFormatter, self).format_field(value, format_spec)
 
         # Replace format spec with glob patterns (*, ?, etc)
         if not format_spec:
-            return r'.*'
+            return r'(?P<{}>.*)'.format(field_name)
         if '%' in format_spec:
-            return self._regex_datetime(format_spec)
-        char_type = spec_regexes[format_spec[-1]]
-        num_match = re.search('[0-9]+', format_spec)
-        num = 0 if num_match is None else int(num_match.group(0))
-        has_multiple = format_spec[-1] in allow_multiple
-        if num == 0 and has_multiple:
-            # don't know the count
-            return r'{}*'.format(char_type)
-        elif num == 0:
-            # floats and other types can't have multiple
-            return char_type
-        elif format_spec[-1] in allow_multiple:
-            return r'{}{{{:d}}}'.format(char_type, num)
-        else:
-            return r'{}'.format(char_type)
+            return r'(?P<{}>{})'.format(field_name, self._regex_datetime(format_spec))
+        return self.format_spec_to_regex(field_name, format_spec)
 
     def format_field(self, value, format_spec):
         if not isinstance(value, tuple) or value[1] != self.UNPROVIDED_VALUE:
             return super(RegexFormatter, self).format_field(value, format_spec)
         field_name, value = value
-        new_value = self.regex_field(value, format_spec)
-        return '(?P<{}>{})'.format(field_name, new_value)
+        return self.regex_field(field_name, value, format_spec)
 
     def extract_values(self, fmt, stri):
         regex = self.format(fmt)
@@ -261,20 +303,16 @@ def _convert(convdef, stri):
     if '%' in convdef:
         result = dt.datetime.strptime(stri, convdef)
     elif 'd' in convdef or 's' in convdef:
-        try:
-            align = convdef[0]
-            if align in [">", "<", "^"]:
-                pad = " "
-            else:
-                align = convdef[1]
-                if align in [">", "<", "^"]:
-                    pad = convdef[0]
-                else:
-                    align = None
-                    pad = None
-        except IndexError:
-            align = None
-            pad = None
+        regex_match = fmt_spec_regex.match(convdef)
+        match_dict = regex_match.groupdict() if regex_match else {}
+        align = match_dict.get('align')
+        pad = match_dict.get('fill')
+        if align:
+            # align character is the last one
+            align = align[-1]
+        if align and align in '<>^' and not pad:
+            pad = ' '
+
         if align == '>':
             stri = stri.lstrip(pad)
         elif align == '<':
diff --git a/trollsift/tests/unittests/test_parser.py b/trollsift/tests/unittests/test_parser.py
index 82d50fd..47c1c25 100644
--- a/trollsift/tests/unittests/test_parser.py
+++ b/trollsift/tests/unittests/test_parser.py
@@ -136,6 +136,36 @@ def test_parse_align(self):
                                       'segment': '000007',
                                       'start_time': dt.datetime(2015, 6, 5, 17, 0)})
 
+    def test_parse_digits(self):
+        """Test when a digit field is shorter than the format spec."""
+        result = parse(
+            "hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
+            "hrpt_noaa19_20140212_1412_02345.l1b")
+        self.assertDictEqual(result, {'platform': 'noaa', 'platnum': '19',
+                                      'time': dt.datetime(2014, 2, 12, 14, 12),
+                                      'orbit': 2345,
+                                      'ext': '.l1b'})
+        result = parse(
+            "hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:5d}{ext}",
+            "hrpt_noaa19_20140212_1412_ 2345.l1b")
+        self.assertDictEqual(result, {'platform': 'noaa', 'platnum': '19',
+                                      'time': dt.datetime(2014, 2, 12, 14, 12),
+                                      'orbit': 2345,
+                                      'ext': '.l1b'})
+        result = parse(
+            "hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:_>5d}{ext}",
+            "hrpt_noaa19_20140212_1412___345.l1b")
+        self.assertDictEqual(result, {'platform': 'noaa', 'platnum': '19',
+                                      'time': dt.datetime(2014, 2, 12, 14, 12),
+                                      'orbit': 345,
+                                      'ext': '.l1b'})
+
+    def test_parse_bad_pattern(self):
+        """Test that `=` alignment on a string field raises ValueError."""
+        self.assertRaises(ValueError, parse,
+                          "hrpt_{platform}{platnum:-=2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
+                          "hrpt_noaa19_20140212_1412_02345.l1b")
+
     def test_globify_simple(self):
         # Run
         result = globify('{a}_{b}.end', {'a': 'a', 'b': 'b'})
@@ -261,21 +291,6 @@ def test_compose(self):
         self.assertRaises(ValueError, compose, "{a!X}", key_vals)
         self.assertEqual(new_str, 'this Is A-Test b_test c test')
 
-    def assertDictEqual(self, a, b):
-        for key in a:
-            self.assertTrue(key in b)
-            self.assertEqual(a[key], b[key])
-
-        self.assertEqual(len(a), len(b))
-
-    def assertItemsEqual(self, a, b):
-        for i in range(len(a)):
-            if isinstance(a[i], dict):
-                self.assertDictEqual(a[i], b[i])
-            else:
-                self.assertEqual(a[i], b[i])
-        self.assertEqual(len(a), len(b))
-
 
 def suite():
     """The suite for test_parser