Merge 78b387c into dba39cf

pytroll · Sep 27, 2018 · 79a0053 · 79a0053
2 parents dba39cf + 78b387c
commit 79a0053
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 48 deletions.
diff --git a/trollsift/parser.py b/trollsift/parser.py
@@ -153,9 +153,26 @@ def convert_field(self, value, conversion):
 spec_regexes['g'] = spec_regexes['f']
 spec_regexes['X'] = spec_regexes['x']
 allow_multiple = ['c', 'd', 'o', 's', 'x', 'X']
+# Regular expression for the standard format specification mini-language:
+#     format_spec ::=  [[fill]align][sign][#][0][width][,][.precision][type]
+# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
+# NOTE: the named "align" group spans the optional fill character plus the
+# alignment character, so the alignment character is always its last character.
+fmt_spec_regex = re.compile(
+    r'(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?'
+    # precision must start with a literal dot; an unescaped '.' would accept
+    # any character there (e.g. '<a5d' would parse with precision 'a5')
+    r'(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%])')
 
 
 class RegexFormatter(string.Formatter):
+    """String formatter that converts a format string to a regular expression.
+    
+    >>> regex_formatter = RegexFormatter()
+    >>> regex_str = regex_formatter.format('{field_one:5d}_{field_two}')
+
+    Can also be used to extract values from a string given the format spec
+    for that string:
+
+    >>> regex_formatter.extract_values('{field_one:5d}_{field_two}', '12345_sometext')
+    {'field_one': '12345', 'field_two': 'sometext'}
+
+    """
 
     # special string to mark a parameter not being specified
     UNPROVIDED_VALUE = '<trollsift unprovided value>'
@@ -200,36 +217,61 @@ def _regex_datetime(self, format_spec):
             replace_str = replace_str.replace(fmt_key, regex)
         return replace_str
 
-    def regex_field(self, value, format_spec):
+    @staticmethod
+    def format_spec_to_regex(field_name, format_spec):
+        """Make an attempt at converting a format spec to a regular expression.
+
+        Args:
+            field_name (str): Name given to the named group in the result.
+            format_spec (str): Format specification of a replacement field
+                (the part after the ``:``), for example ``'5d'`` or ``'_>5d'``.
+
+        Returns:
+            str: A regular expression of the form ``(?P<field_name>...)``.
+
+        Raises:
+            ValueError: If ``format_spec`` is not matched by ``fmt_spec_regex``
+                or if it combines the ``s`` type with ``=`` alignment.
+
+        """
+        # NOTE: remove escaped backslashes so regex matches
+        regex_match = fmt_spec_regex.match(format_spec.replace('\\', ''))
+        if regex_match is None:
+            raise ValueError("Invalid format specification: '{}'".format(format_spec))
+        regex_dict = regex_match.groupdict()
+        fill = regex_dict['fill']
+        ftype = regex_dict['type']
+        width = regex_dict['width']
+        align = regex_dict['align']
+        # No explicit fill character: fall back to an implicit one.
+        # NOTE: does not properly handle `=` alignment
+        # NOTE(review): fmt_spec_regex captures a leading '0' in its separate
+        # 'zero' group, so `width` will rarely start with '0' here -- confirm
+        # whether the 'zero' group should be consulted instead.
+        if fill is None:
+            if width is not None and width[0] == '0':
+                fill = '0'
+            elif ftype in ['s', 'd']:
+                fill = ' '
+
+        char_type = spec_regexes[ftype]
+        if ftype == 's' and align and align.endswith('='):
+            raise ValueError("Invalid format specification: '{}'".format(format_spec))
+
+        has_width = bool(width) and width != '0'
+        final_regex = char_type
+        if has_width:
+            if fill:
+                # we don't know how many fill characters we have compared to
+                # field characters so just match all characters and sort it out
+                # later during type conversion.
+                final_regex = r'.{{{}}}'.format(int(width))
+            else:
+                # we know we have exactly this many characters
+                final_regex += r'{{{}}}'.format(int(width))
+        elif ftype in allow_multiple:
+            # no usable width: the field may span any number of characters
+            final_regex += r'*'
+
+        return r'(?P<{}>{})'.format(field_name, final_regex)
+
+    def regex_field(self, field_name, value, format_spec):
+        """Return the text substituted for a single replacement field.
+
+        A provided ``value`` is formatted by the parent ``format_field``.
+        For an unprovided value a named group ``(?P<field_name>...)`` is
+        returned: ``.*`` for an empty ``format_spec``, the result of
+        ``_regex_datetime`` when the spec contains ``%``, and the result of
+        ``format_spec_to_regex`` otherwise.
+
+        """
         if value != self.UNPROVIDED_VALUE:
             return super(RegexFormatter, self).format_field(value, format_spec)
 
         # Replace format spec with glob patterns (*, ?, etc)
         if not format_spec:
-            return r'.*'
+            return r'(?P<{}>.*)'.format(field_name)
         if '%' in format_spec:
-            return self._regex_datetime(format_spec)
-        char_type = spec_regexes[format_spec[-1]]
-        num_match = re.search('[0-9]+', format_spec)
-        num = 0 if num_match is None else int(num_match.group(0))
-        has_multiple = format_spec[-1] in allow_multiple
-        if num == 0 and has_multiple:
-            # don't know the count
-            return r'{}*'.format(char_type)
-        elif num == 0:
-            # floats and other types can't have multiple
-            return char_type
-        elif format_spec[-1] in allow_multiple:
-            return r'{}{{{:d}}}'.format(char_type, num)
-        else:
-            return r'{}'.format(char_type)
+            return r'(?P<{}>{})'.format(field_name, self._regex_datetime(format_spec))
+        return self.format_spec_to_regex(field_name, format_spec)
 
     def format_field(self, value, format_spec):
+        """Format ``value``, turning unprovided fields into regular expressions.
+
+        Anything other than a ``(field_name, UNPROVIDED_VALUE)`` tuple is
+        formatted by the parent class; such a tuple is unpacked and handed to
+        ``regex_field``.
+
+        """
         if not isinstance(value, tuple) or value[1] != self.UNPROVIDED_VALUE:
             return super(RegexFormatter, self).format_field(value, format_spec)
         field_name, value = value
-        new_value = self.regex_field(value, format_spec)
-        return '(?P<{}>{})'.format(field_name, new_value)
+        return self.regex_field(field_name, value, format_spec)
 
     def extract_values(self, fmt, stri):
         regex = self.format(fmt)
@@ -261,20 +303,16 @@ def _convert(convdef, stri):
     if '%' in convdef:
         result = dt.datetime.strptime(stri, convdef)
     elif 'd' in convdef or 's' in convdef:
-        try:
-            align = convdef[0]
-            if align in [">", "<", "^"]:
-                pad = " "
-            else:
-                align = convdef[1]
-                if align in [">", "<", "^"]:
-                    pad = convdef[0]
-                else:
-                    align = None
-                    pad = None
-        except IndexError:
-            align = None
-            pad = None
+        regex_match = fmt_spec_regex.match(convdef)
+        match_dict = regex_match.groupdict() if regex_match else {}
+        align = match_dict.get('align')
+        pad = match_dict.get('fill')
+        if align:
+            # align character is the last one
+            align = align[-1]
+        if align and align in '<>^' and not pad:
+            pad = ' '
+
         if align == '>':
             stri = stri.lstrip(pad)
         elif align == '<':

diff --git a/trollsift/tests/unittests/test_parser.py b/trollsift/tests/unittests/test_parser.py
@@ -136,6 +136,36 @@ def test_parse_align(self):
                                       'segment': '000007',
                                       'start_time': dt.datetime(2015, 6, 5, 17, 0)})
 
+    def test_parse_digits(self):
+        """Check parsing of digit fields that are shorter than their width."""
+        # (format string, file name, expected value of the 'orbit' field)
+        cases = [
+            ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
+             "hrpt_noaa19_20140212_1412_02345.l1b",
+             2345),
+            ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:5d}{ext}",
+             "hrpt_noaa19_20140212_1412_ 2345.l1b",
+             2345),
+            ("hrpt_{platform}{platnum:2s}_{time:%Y%m%d_%H%M}_{orbit:_>5d}{ext}",
+             "hrpt_noaa19_20140212_1412___345.l1b",
+             345),
+        ]
+        for fmt, fname, orbit in cases:
+            expected = {'platform': 'noaa', 'platnum': '19',
+                        'time': dt.datetime(2014, 2, 12, 14, 12),
+                        'orbit': orbit,
+                        'ext': '.l1b'}
+            self.assertDictEqual(parse(fmt, fname), expected)
+
+    def test_parse_bad_pattern(self):
+        """Test that '=' alignment on a string field raises a ValueError."""
+        self.assertRaises(ValueError, parse,
+                          "hrpt_{platform}{platnum:-=2s}_{time:%Y%m%d_%H%M}_{orbit:05d}{ext}",
+                          "hrpt_noaa19_20140212_1412_02345.l1b")
+
     def test_globify_simple(self):
         # Run
         result = globify('{a}_{b}.end', {'a': 'a', 'b': 'b'})
@@ -261,21 +291,6 @@ def test_compose(self):
         self.assertRaises(ValueError, compose, "{a!X}", key_vals)
         self.assertEqual(new_str, 'this Is A-Test b_test c test')
 
-    def assertDictEqual(self, a, b):
-        for key in a:
-            self.assertTrue(key in b)
-            self.assertEqual(a[key], b[key])
-
-        self.assertEqual(len(a), len(b))
-
-    def assertItemsEqual(self, a, b):
-        for i in range(len(a)):
-            if isinstance(a[i], dict):
-                self.assertDictEqual(a[i], b[i])
-            else:
-                self.assertEqual(a[i], b[i])
-        self.assertEqual(len(a), len(b))
-
 
 def suite():
     """The suite for test_parser