Skip to content

Commit

Permalink
Merge pull request sciunto-org#64 from omangin/pyparsing
Browse files Browse the repository at this point in the history
New parser based on pyparsing.
  • Loading branch information
sciunto committed Jan 9, 2016
2 parents 6660bf9 + 7b5626d commit 798400c
Show file tree
Hide file tree
Showing 27 changed files with 759 additions and 326 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ install:
- if [[ $TEST_SUITE == suite_3_5 ]]; then
pip install sphinx;
fi;
- pip install coverage
- pip install coverage pyparsing
- python setup.py install
script:
- nosetests --with-coverage --cover-erase --cover-package=bibtexparser
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
v0.XXX
======

* ENH: we use pyparsing (#64) by Olivier Magin.
* DOC: fix typos in tutorial
* DOC: include docs/ in manifest

Expand Down
3 changes: 3 additions & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,6 @@

- Cschaffner
New features in bwriter

- Olivier Mangin
Pyparsing implementation of the parser.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
include requirements.txt
include *.md
include docs/Makefile
include docs/source/*
24 changes: 0 additions & 24 deletions TODO

This file was deleted.

4 changes: 2 additions & 2 deletions bibtexparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
"""
__all__ = [
'loads', 'load', 'dumps', 'dump', 'bibdatabase',
'bparser', 'bwriter', 'latexenc', 'customization',
'bparser', 'bwriter', 'bibtexexpression', 'latexenc', 'customization',
]
__version__ = '0.6.2'

from . import bibdatabase, bparser, bwriter, latexenc, customization
from . import bibdatabase, bibtexexpression, bparser, bwriter, latexenc, customization


def loads(bibtex_str, parser=None):
Expand Down
64 changes: 64 additions & 0 deletions bibtexparser/bibdatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,37 @@
TEXT_TYPE = str


STANDARD_TYPES = set([
'article',
'book',
'booklet',
'conference',
'inbook',
'incollection',
'inproceedings',
'manual',
'mastersthesis',
'misc',
'phdthesis',
'proceedings',
'techreport',
'unpublished'])
COMMON_STRINGS = {
'jan': 'January',
'feb': 'February',
'mar': 'March',
'apr': 'April',
'may': 'May',
'jun': 'June',
'jul': 'July',
'aug': 'August',
'sep': 'September',
'oct': 'October',
'nov': 'November',
'dec': 'December',
}


class BibDatabase(object):
"""
A bibliographic database object following the data structure of a BibTeX file.
Expand All @@ -27,6 +58,9 @@ def __init__(self):
#: List of BibTeX preamble (`@preamble{...}`) blocks.
self.preambles = []

def load_common_strings(self):
self.strings.update(COMMON_STRINGS)

def get_entry_list(self):
"""Get a list of bibtex entries.
Expand Down Expand Up @@ -55,3 +89,33 @@ def get_entry_dict(self):
return self._entries_dict

entries_dict = property(get_entry_dict)

def expand_string(self, name):
try:
return self.strings[name]
except KeyError:
raise(KeyError("Unknown string: {}.".format(name)))


class BibDataString(object):
"""
Represents a bibtex string.
This object enables mainting string expressions as list of strings
and BibDataString. Can be interpolated from Bibdatabase.
"""

def __init__(self, bibdatabase, name):
self._bibdatabase = bibdatabase
self.name = name.lower()

def __repr__(self):
return "BibDataString({})".format(self.name.__repr__())

def get_value(self):
"""
Query value from string name.
:returns: string
"""
return self._bibdatabase.expand_string(self.name)
246 changes: 246 additions & 0 deletions bibtexparser/bibtexexpression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
import pyparsing as pp


# General helpers

def strip_after_new_lines(s):
"""Removes leading and trailing whitespaces in all but first line."""
lines = s.splitlines()
if len(lines) > 1:
lines = [lines[0]] + [l.lstrip() for l in lines[1:]]
return '\n'.join(lines)


def add_logger_parse_action(expr, log_func):
"""Register a callback on expression parsing with the adequate message."""
def action(s, l, t):
log_func("Found {}: {}".format(expr.resultsName, t))
expr.addParseAction(action)


# Parse action helpers
# Helpers for returning values from the parsed tokens. Shaped as pyparsing's
# parse actions. In pyparsing wording:
# s, l, t, stand for string, location, token

def first_token(s, l, t):
# TODO Handle this case correctly!
assert(len(t) == 1)
return t[0]


def remove_trailing_newlines(s, l, t):
if t[0]:
return t[0].rstrip('\n')


def remove_braces(s, l, t):
if len(t[0]) < 1:
return ''
else:
start = 1 if t[0][0] == '{' else 0
end = -1 if t[0][-1] == '}' else None
return t[0][start:end]


def field_to_pair(s, l, t):
"""
Looks for parsed element named 'Field'.
:returns: (name, value).
"""
f = t.get('Field')
# Not sure it is desirable here to strip but it is for conformance
# to previous implementation
return (f.get('FieldName'),
strip_after_new_lines(f.get('Value')))


# Expressions helpers

def in_braces_or_pars(exp):
"""
exp -> (exp)|{exp}
"""
return ((pp.Suppress('{') + exp + pp.Suppress('}')) |
(pp.Suppress('(') + exp + pp.Suppress(')')))


class BibtexExpression(object):
"""Gives access to pyparsing expressions.
Attributes are pyparsing expressions for the following elements:
main_expression: the bibtex file
string_def: a string definition
preamble_decl: a preamble declaration
explicit_comment: an explicit comment
entry: an entry definition
implicit_comment: an implicit comment
"""

ParseException = pp.ParseException

def __init__(self):

# Bibtex keywords

string_def_start = pp.CaselessKeyword("@string")
preamble_start = pp.CaselessKeyword("@preamble")
comment_line_start = pp.CaselessKeyword('@comment')

# String names
string_name = pp.Word(pp.alphanums + '_')('StringName')
self.set_string_name_parse_action(lambda s, l, t: None)
string_name.addParseAction(self._string_name_parse_action)

# Values inside bibtex fields
# Values can be integer or string expressions. The latter may use
# quoted or braced values.

# Integer values
integer = pp.Word(pp.nums)('Integer')

# Braced values: braced values can contain nested (but balanced) braces
braced_value_content = pp.CharsNotIn('{}')
braced_value = pp.Forward() # Recursive definition for nested braces
braced_value <<= pp.originalTextFor(
'{' + pp.ZeroOrMore(braced_value | braced_value_content) + '}'
)('BracedValue')
braced_value.setParseAction(remove_braces)
# TODO add ignore for "\}" and "\{" ?
# TODO @ are not parsed by bibtex in braces

# Quoted values: may contain braced content with balanced braces
brace_in_quoted = pp.nestedExpr('{', '}')
text_in_quoted = pp.CharsNotIn('"{}')
# (quotes should be escaped in quoted value)
quoted_value = pp.originalTextFor(
'"' +
pp.ZeroOrMore(text_in_quoted | brace_in_quoted) +
'"')('QuotedValue')
quoted_value.addParseAction(pp.removeQuotes)

# String expressions
string_expr = pp.delimitedList(
(quoted_value | braced_value | string_name), delim='#'
)('StringExpression')
self.set_string_expression_parse_action(lambda s, l, t: None)
string_expr.addParseAction(self._string_expr_parse_action)

value = (integer | string_expr)('Value')

# Entries

# @EntryType { ...
entry_type = (pp.Suppress('@') + pp.Word(pp.alphas))('EntryType')
entry_type.setParseAction(first_token)

# Entry key: any character up to a ',' without leading and trailing
# spaces.
key = pp.SkipTo(',')('Key') # Exclude @',\#}{~%
key.setParseAction(lambda s, l, t: first_token(s, l, t).strip())

# Field name: word of letters and underscores
field_name = pp.Word(pp.alphas + '_')('FieldName')
field_name.setParseAction(first_token)

# Field: field_name = value
field = pp.Group(field_name + pp.Suppress('=') + value)('Field')
field.setParseAction(field_to_pair)

# List of fields: comma separeted fields
field_list = (pp.delimitedList(field) + pp.Suppress(pp.Optional(','))
)('Fields')
field_list.setParseAction(
lambda s, l, t: {k: v for (k, v) in reversed(t.get('Fields'))})

# Entry: type, key, and fields
self.entry = (entry_type +
in_braces_or_pars(key + pp.Suppress(',') + field_list)
)('Entry')

# Other stuff: comments, string definitions, and preamble declarations

# Explicit comments: @comment + everything up to next valid declaration
# starting on new line.
not_an_implicit_comment = (pp.LineStart() + pp.Literal('@')
) | pp.stringEnd()
self.explicit_comment = (
pp.Suppress(comment_line_start) +
pp.originalTextFor(pp.SkipTo(not_an_implicit_comment),
asString=True))('ExplicitComment')
self.explicit_comment.addParseAction(remove_trailing_newlines)
self.explicit_comment.addParseAction(remove_braces)
# Previous implementation included comment until next '}'.
# This is however not inline with bibtex behavior that is to only
# ignore until EOL. Brace stipping is arbitrary here but avoids
# duplication on bibtex write.

# Empty implicit_comments lead to infinite loop of zeroOrMore
def mustNotBeEmpty(t):
if not t[0]:
raise pp.ParseException("Match must not be empty.")

# Implicit comments: not anything else
self.implicit_comment = pp.originalTextFor(
pp.SkipTo(not_an_implicit_comment).setParseAction(mustNotBeEmpty),
asString=True)('ImplicitComment')
self.implicit_comment.addParseAction(remove_trailing_newlines)

# String definition
self.string_def = (pp.Suppress(string_def_start) + in_braces_or_pars(
string_name +
pp.Suppress('=') +
string_expr('StringValue')
))('StringDefinition')

# Preamble declaration
self.preamble_decl = (pp.Suppress(preamble_start) +
in_braces_or_pars(value))('PreambleDeclaration')

# Main bibtex expression

self.main_expression = pp.ZeroOrMore(
self.string_def |
self.preamble_decl |
self.explicit_comment |
self.entry |
self.implicit_comment)

def add_log_function(self, log_fun):
"""Add notice to logger on entry, comment, preamble, string definitions.
:param log_fun: logger function
"""
for e in [self.entry,
self.implicit_comment,
self.explicit_comment,
self.preamble_decl,
self.string_def]:
add_logger_parse_action(e, log_fun)

def set_string_name_parse_action(self, fun):
"""Set the parseAction for string name expression.
Note:
For some reason pyparsing duplicates the string_name
expression so setting its parseAction a posteriori has no effect
in the context of a string expression. This is why this function
should be used instead.
"""
self._string_name_parse_action_fun = fun

def _string_name_parse_action(self, s, l, t):
return self._string_name_parse_action_fun(s, l, t)

def set_string_expression_parse_action(self, fun):
"""Set the paseAction for string_expression expression.
Note: see set_string_name_parse_action.
"""
self._string_expr_parse_action_fun = fun

def _string_expr_parse_action(self, s, l, t):
return self._string_expr_parse_action_fun(s, l, t)

def parseFile(self, file_obj):
return self.main_expression.parseFile(file_obj, parseAll=True)
Loading

0 comments on commit 798400c

Please sign in to comment.