Merge pull request sciunto-org#64 from omangin/pyparsing

New parser based on pyparsing.
sciunto · Jan 9, 2016 · 798400c · 798400c
2 parents 6660bf9 + 7b5626d
commit 798400c
Show file tree

Hide file tree

Showing 27 changed files with 759 additions and 326 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -15,7 +15,7 @@ install:
   - if [[ $TEST_SUITE == suite_3_5 ]]; then
         pip install sphinx;
     fi;
-  - pip install coverage
+  - pip install coverage pyparsing
   - python setup.py install
 script:
   - nosetests --with-coverage  --cover-erase --cover-package=bibtexparser

diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,7 @@
 v0.XXX
 ======
 
+* ENH: we use pyparsing (#64) by Olivier Mangin.
 * DOC: fix typos in tutorial
 * DOC: include docs/ in manifest
 

diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
@@ -30,3 +30,6 @@
 
 - Cschaffner
   New features in bwriter
+
+- Olivier Mangin
+  Pyparsing implementation of the parser.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,3 +1,4 @@
+include requirements.txt
 include *.md
 include docs/Makefile
 include docs/source/*
diff --git a/TODO b/TODO
diff --git a/bibtexparser/__init__.py b/bibtexparser/__init__.py
@@ -20,11 +20,11 @@
 """
 __all__ = [
     'loads', 'load', 'dumps', 'dump', 'bibdatabase',
-    'bparser', 'bwriter', 'latexenc', 'customization',
+    'bparser', 'bwriter', 'bibtexexpression', 'latexenc', 'customization',
 ]
 __version__ = '0.6.2'
 
-from . import bibdatabase, bparser, bwriter, latexenc, customization
+from . import bibdatabase, bibtexexpression, bparser, bwriter, latexenc, customization
 
 
 def loads(bibtex_str, parser=None):

diff --git a/bibtexparser/bibdatabase.py b/bibtexparser/bibdatabase.py
@@ -8,6 +8,37 @@
     TEXT_TYPE = str
 
 
+STANDARD_TYPES = set([
+    'article',
+    'book',
+    'booklet',
+    'conference',
+    'inbook',
+    'incollection',
+    'inproceedings',
+    'manual',
+    'mastersthesis',
+    'misc',
+    'phdthesis',
+    'proceedings',
+    'techreport',
+    'unpublished'])
+COMMON_STRINGS = {
+    'jan': 'January',
+    'feb': 'February',
+    'mar': 'March',
+    'apr': 'April',
+    'may': 'May',
+    'jun': 'June',
+    'jul': 'July',
+    'aug': 'August',
+    'sep': 'September',
+    'oct': 'October',
+    'nov': 'November',
+    'dec': 'December',
+    }
+
+
 class BibDatabase(object):
     """
     A bibliographic database object following the data structure of a BibTeX file.
@@ -27,6 +58,9 @@ def __init__(self):
         #: List of BibTeX preamble (`@preamble{...}`) blocks.
         self.preambles = []
 
+    def load_common_strings(self):
+        self.strings.update(COMMON_STRINGS)
+
     def get_entry_list(self):
         """Get a list of bibtex entries.
 
@@ -55,3 +89,33 @@ def get_entry_dict(self):
         return self._entries_dict
 
     entries_dict = property(get_entry_dict)
+
+    def expand_string(self, name):
+        try:
+            return self.strings[name]
+        except KeyError:
+            raise(KeyError("Unknown string: {}.".format(name)))
+
+
+class BibDataString(object):
+    """
+    Represents a bibtex string.
+
+    This object enables maintaining string expressions as lists of strings
+    and BibDataString objects. Can be interpolated from BibDatabase.
+    """
+
+    def __init__(self, bibdatabase, name):
+        self._bibdatabase = bibdatabase
+        self.name = name.lower()
+
+    def __repr__(self):
+        return "BibDataString({})".format(self.name.__repr__())
+
+    def get_value(self):
+        """
+        Query value from string name.
+
+        :returns: string
+        """
+        return self._bibdatabase.expand_string(self.name)
diff --git a/bibtexparser/bibtexexpression.py b/bibtexparser/bibtexexpression.py
@@ -0,0 +1,246 @@
+import pyparsing as pp
+
+
+# General helpers
+
+def strip_after_new_lines(s):
+    """Removes leading whitespace in all but the first line."""
+    lines = s.splitlines()
+    if len(lines) > 1:
+        lines = [lines[0]] + [l.lstrip() for l in lines[1:]]
+    return '\n'.join(lines)
+
+
+def add_logger_parse_action(expr, log_func):
+    """Register a callback on expression parsing with the adequate message."""
+    def action(s, l, t):
+        log_func("Found {}: {}".format(expr.resultsName, t))
+    expr.addParseAction(action)
+
+
+# Parse action helpers
+# Helpers for returning values from the parsed tokens. Shaped as pyparsing's
+# parse actions. In pyparsing wording:
+# s, l, t, stand for string, location, token
+
+def first_token(s, l, t):
+    # TODO Handle this case correctly!
+    assert(len(t) == 1)
+    return t[0]
+
+
+def remove_trailing_newlines(s, l, t):
+    if t[0]:
+        return t[0].rstrip('\n')
+
+
+def remove_braces(s, l, t):
+    if len(t[0]) < 1:
+        return ''
+    else:
+        start = 1 if t[0][0] == '{' else 0
+        end = -1 if t[0][-1] == '}' else None
+        return t[0][start:end]
+
+
+def field_to_pair(s, l, t):
+    """
+    Looks for parsed element named 'Field'.
+    :returns: (name, value).
+    """
+    f = t.get('Field')
+    # Not sure it is desirable here to strip but it is for conformance
+    # to previous implementation
+    return (f.get('FieldName'),
+            strip_after_new_lines(f.get('Value')))
+
+
+# Expressions helpers
+
+def in_braces_or_pars(exp):
+    """
+    exp -> (exp)|{exp}
+    """
+    return ((pp.Suppress('{') + exp + pp.Suppress('}')) |
+            (pp.Suppress('(') + exp + pp.Suppress(')')))
+
+
+class BibtexExpression(object):
+    """Gives access to pyparsing expressions.
+
+    Attributes are pyparsing expressions for the following elements:
+        main_expression: the bibtex file
+        string_def: a string definition
+        preamble_decl: a preamble declaration
+        explicit_comment: an explicit comment
+        entry: an entry definition
+        implicit_comment: an implicit comment
+    """
+
+    ParseException = pp.ParseException
+
+    def __init__(self):
+
+        # Bibtex keywords
+
+        string_def_start = pp.CaselessKeyword("@string")
+        preamble_start = pp.CaselessKeyword("@preamble")
+        comment_line_start = pp.CaselessKeyword('@comment')
+
+        # String names
+        string_name = pp.Word(pp.alphanums + '_')('StringName')
+        self.set_string_name_parse_action(lambda s, l, t: None)
+        string_name.addParseAction(self._string_name_parse_action)
+
+        # Values inside bibtex fields
+        # Values can be integer or string expressions. The latter may use
+        # quoted or braced values.
+
+        # Integer values
+        integer = pp.Word(pp.nums)('Integer')
+
+        # Braced values: braced values can contain nested (but balanced) braces
+        braced_value_content = pp.CharsNotIn('{}')
+        braced_value = pp.Forward()  # Recursive definition for nested braces
+        braced_value <<= pp.originalTextFor(
+            '{' + pp.ZeroOrMore(braced_value | braced_value_content) + '}'
+            )('BracedValue')
+        braced_value.setParseAction(remove_braces)
+        # TODO add ignore for "\}" and "\{" ?
+        # TODO @ are not parsed by bibtex in braces
+
+        # Quoted values: may contain braced content with balanced braces
+        brace_in_quoted = pp.nestedExpr('{', '}')
+        text_in_quoted = pp.CharsNotIn('"{}')
+        # (quotes should be escaped in quoted value)
+        quoted_value = pp.originalTextFor(
+            '"' +
+            pp.ZeroOrMore(text_in_quoted | brace_in_quoted) +
+            '"')('QuotedValue')
+        quoted_value.addParseAction(pp.removeQuotes)
+
+        # String expressions
+        string_expr = pp.delimitedList(
+            (quoted_value | braced_value | string_name), delim='#'
+            )('StringExpression')
+        self.set_string_expression_parse_action(lambda s, l, t: None)
+        string_expr.addParseAction(self._string_expr_parse_action)
+
+        value = (integer | string_expr)('Value')
+
+        # Entries
+
+        # @EntryType { ...
+        entry_type = (pp.Suppress('@') + pp.Word(pp.alphas))('EntryType')
+        entry_type.setParseAction(first_token)
+
+        # Entry key: any character up to a ',' without leading and trailing
+        # spaces.
+        key = pp.SkipTo(',')('Key')  # Exclude @',\#}{~%
+        key.setParseAction(lambda s, l, t: first_token(s, l, t).strip())
+
+        # Field name: word of letters and underscores
+        field_name = pp.Word(pp.alphas + '_')('FieldName')
+        field_name.setParseAction(first_token)
+
+        # Field: field_name = value
+        field = pp.Group(field_name + pp.Suppress('=') + value)('Field')
+        field.setParseAction(field_to_pair)
+
+        # List of fields: comma-separated fields
+        field_list = (pp.delimitedList(field) + pp.Suppress(pp.Optional(','))
+                      )('Fields')
+        field_list.setParseAction(
+            lambda s, l, t: {k: v for (k, v) in reversed(t.get('Fields'))})
+
+        # Entry: type, key, and fields
+        self.entry = (entry_type +
+                      in_braces_or_pars(key + pp.Suppress(',') + field_list)
+                      )('Entry')
+
+        # Other stuff: comments, string definitions, and preamble declarations
+
+        # Explicit comments: @comment + everything up to next valid declaration
+        # starting on new line.
+        not_an_implicit_comment = (pp.LineStart() + pp.Literal('@')
+                                   ) | pp.stringEnd()
+        self.explicit_comment = (
+            pp.Suppress(comment_line_start) +
+            pp.originalTextFor(pp.SkipTo(not_an_implicit_comment),
+                               asString=True))('ExplicitComment')
+        self.explicit_comment.addParseAction(remove_trailing_newlines)
+        self.explicit_comment.addParseAction(remove_braces)
+        # Previous implementation included comment until next '}'.
+        # This is however not in line with bibtex behavior that is to only
+        # ignore until EOL. Brace stripping is arbitrary here but avoids
+        # duplication on bibtex write.
+
+        # Empty implicit_comments lead to infinite loop of zeroOrMore
+        def mustNotBeEmpty(t):
+            if not t[0]:
+                raise pp.ParseException("Match must not be empty.")
+
+        # Implicit comments: not anything else
+        self.implicit_comment = pp.originalTextFor(
+            pp.SkipTo(not_an_implicit_comment).setParseAction(mustNotBeEmpty),
+            asString=True)('ImplicitComment')
+        self.implicit_comment.addParseAction(remove_trailing_newlines)
+
+        # String definition
+        self.string_def = (pp.Suppress(string_def_start) + in_braces_or_pars(
+            string_name +
+            pp.Suppress('=') +
+            string_expr('StringValue')
+            ))('StringDefinition')
+
+        # Preamble declaration
+        self.preamble_decl = (pp.Suppress(preamble_start) +
+                              in_braces_or_pars(value))('PreambleDeclaration')
+
+        # Main bibtex expression
+
+        self.main_expression = pp.ZeroOrMore(
+                self.string_def |
+                self.preamble_decl |
+                self.explicit_comment |
+                self.entry |
+                self.implicit_comment)
+
+    def add_log_function(self, log_fun):
+        """Add notice to logger on entry, comment, preamble, string definitions.
+
+        :param log_fun: logger function
+        """
+        for e in [self.entry,
+                  self.implicit_comment,
+                  self.explicit_comment,
+                  self.preamble_decl,
+                  self.string_def]:
+            add_logger_parse_action(e, log_fun)
+
+    def set_string_name_parse_action(self, fun):
+        """Set the parseAction for string name expression.
+
+        Note:
+            For some reason pyparsing duplicates the string_name
+            expression so setting its parseAction a posteriori has no effect
+            in the context of a string expression. This is why this function
+            should be used instead.
+        """
+        self._string_name_parse_action_fun = fun
+
+    def _string_name_parse_action(self, s, l, t):
+        return self._string_name_parse_action_fun(s, l, t)
+
+    def set_string_expression_parse_action(self, fun):
+        """Set the parseAction for string_expression expression.
+
+        Note: see set_string_name_parse_action.
+        """
+        self._string_expr_parse_action_fun = fun
+
+    def _string_expr_parse_action(self, s, l, t):
+        return self._string_expr_parse_action_fun(s, l, t)
+
+    def parseFile(self, file_obj):
+        return self.main_expression.parseFile(file_obj, parseAll=True)