Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 259 lines (223 sloc) 8.355 kB
#!/usr/bin/env python
#
# Natural Language Toolkit: tree->image rendering script
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Render treebank-like tree strings, and write the output to an image
file. Rendering is performed by L{tree_to_image()}. Usage:
- Simple trees use parentheses to define constituents. The first
element in each parenthesized substring is the node, and the
remaining elements are subtrees or leaves::
(S (NP John) (VP (V saw) (NP Mary)))
- Constituents that should be drawn with a 'roof' (i.e., a triangle
between the node & its children, rather than individual lines)
are marked using angle brackets::
(S (NP John) <VP saw Mary>)
- Subscripting is done using underscore (similar to LaTeX). If
the subscripted string is more than one character long, it should
be enclosed in curly braces (e.g. 'NP_{subj}')::
(S (NP Mary_i) (VP was (VP seen t_i)))
- Substrings can be italicized by using '*...*' (like rst)::
(S (NP *Mary_i*) (VP was (VP seen *t_i*)))
- Backslash can be used to escape characters that would otherwise
be treated as markup (i.e., any of C{'<>()_* '}). Note that this
list includes space::
(S (NP Mary) (VP went (PP to (NP New\ York))))
Requires Imagemagick C{convert}.
"""
import re, sys, os
import tkFont
import tempfile
import pickle
from nltk.draw.util import SequenceWidget, TextWidget, SpaceWidget, CanvasFrame
from nltk.draw.tree import TreeSegmentWidget
# Name of the external program used to convert postscript to other
# image formats (see tree_to_image).
CONVERT = 'convert'

# hackery..
# sys.path.append('/Users/edloper/nltk/')
# Make sure '/sw/bin' is on the executable search path (presumably so
# that the CONVERT binary can be found there).  PATH may be missing from
# the environment altogether, so it is read defensively rather than
# indexed directly.
if '/sw/bin' not in os.environ.get('PATH', ''):
    if os.environ.get('PATH'):
        os.environ['PATH'] += ':/sw/bin'
    else:
        os.environ['PATH'] = '/sw/bin'
def tokenize(s, regexp):
    """
    Split C{s} into consecutive tokens, each of which matches C{regexp}.

    @param s: The string to tokenize.
    @param regexp: A regular expression (pattern string or compiled
        pattern) that matches a single token.
    @return: A generator over the matched substrings, in order.
    @raise ValueError: If the matches do not cover C{s} completely,
        i.e. if there is unmatched text between two tokens or after
        the last token.  The error is raised lazily, when the
        generator reaches the offending position.
    """
    pos = 0
    for m in re.finditer(regexp, s):
        # finditer silently skips over text it cannot match, so a gap
        # before this match means part of the input is not a token.
        if m.start() != pos:
            raise ValueError('tokenization error')
        pos = m.end()
        yield m.group()
    # Unmatched text after the final token is just as much an error as
    # a gap in the middle; without this check it would be dropped.
    if pos != len(s):
        raise ValueError('tokenization error')
def tree_to_widget(s, canvas):
    """
    Parse a tree string, and return a corresponding widget.  See the
    module docstring for the format of C{s}.

    @param s: The tree string.  Leading and trailing whitespace is
        ignored.
    @param canvas: The canvas on which the widgets are created.
    @return: The widget for the single top-level element of C{s}.
    @raise ValueError: If C{s} cannot be tokenized, if a closing
        bracket has no matching opening bracket, if a constituent is
        left open, or if C{s} does not contain exactly one top-level
        element.
    """
    WORD = r'(\\\\|\\[^\\\n]|[^\\\s()<>])+'
    TOKEN_RE = re.compile(r'\(\s*%s|<\s*%s|\)|>|%s|\s+' % (WORD, WORD, WORD))

    # stack[0] collects the top-level elements.  Every constituent that
    # is currently open contributes one more list, which collects that
    # constituent's children until its closing bracket is seen.
    stack = [[]]
    for tok in tokenize(s.strip(), TOKEN_RE):
        if tok.strip() == '':
            # Whitespace only separates tokens.
            continue
        if tok[:1] in '(<':
            # Opening bracket plus node label.  Remember the keyword
            # arguments for the segment widget; the widget itself can
            # only be built once all of its children are known.
            node = word_to_widget(tok[1:].strip(), canvas,
                                  color='#004080', bold=True)
            roof = (tok[:1] == '<')
            stack[-1].append(dict(canvas=canvas, node=node, roof=roof))
            stack.append([])
        elif tok[:1] in ')>':
            # A closing bracket needs an open constituent to close.
            if len(stack) < 2:
                raise ValueError('unbalanced parens?')
            subtrees = stack.pop()
            # Replace the pending keyword arguments by the real widget.
            node_kwargs = stack[-1][-1]
            stack[-1][-1] = TreeSegmentWidget(subtrees=subtrees,
                                              **node_kwargs)
        else:
            leaf = word_to_widget(tok.strip(), canvas, color='#008040')
            stack[-1].append(leaf)

    # Everything must be closed, and exactly one tree must remain.
    if not (len(stack) == 1 and len(stack[0]) == 1):
        raise ValueError('unbalanced parens?')
    return stack[0][0]
def parse_word(s):
    """
    Split a node or leaf string into runs of uniformly styled text.

    The markup understood here is C{*} (toggle italics), C{_x}
    (subscript the single character C{x}), C{_{...}} (subscript a
    longer string) and backslash escapes, which are passed through
    unchanged as part of the text.

    @param s: The node or leaf string.
    @return: A generator over C{(italic, subscript, text)} tuples, where
        C{italic} and C{subscript} are booleans.  C{text} may be the
        empty string.
    @raise ValueError: If C{s} cannot be tokenized, if subscripts are
        nested, if a C{'}'} appears outside of a subscript, or if
        italics or a subscript are still open at the end of C{s}.
    """
    italic = False
    subscript = False
    piece = ''
    for tok in tokenize(s, r'\*|_{|}|_[^{]|\\.|[^\\_\*]'):
        if tok == '*':
            # Emit what was collected in the old style, then toggle.
            yield italic, subscript, piece
            italic = not italic
            piece = ''
        elif tok == '_{':
            yield italic, subscript, piece
            if subscript:
                raise ValueError('nested subscripts?')
            subscript = True
            piece = ''
        elif tok.startswith('_'):
            # '_x': a one-character subscript, emitted immediately.
            yield italic, subscript, piece
            if subscript:
                raise ValueError('nested subscripts?')
            yield italic, True, tok[1]
            piece = ''
        elif tok == '}':
            yield italic, subscript, piece
            if not subscript:
                raise ValueError('} needs backslash')
            subscript = False
            piece = ''
        else:
            # Ordinary character or backslash escape.
            piece += tok
    if italic:
        raise ValueError('expected * to close italics')
    if subscript:
        raise ValueError('expected }')
    yield italic, subscript, piece
# Font metrics of every font created by word_to_widget, keyed by
# (family, size, weight, slant).
metrics = {}

def word_to_widget(s, canvas, basefont='helvetica', fontsize=12,
                   color='black', bold=False):
    """
    Parse a node or leaf substring from a tree string, and return a
    corresponding widget.

    @param s: The node or leaf substring (see L{parse_word} for the
        markup).
    @param canvas: The canvas on which the widget is created.
    @param basefont: The font family used for the text.
    @param fontsize: The font size for normal text; subscripts use two
        thirds of it, rounded down.
    @param color: The text color.
    @param bold: If true, use a bold font.
    @return: A zero-sized space widget if C{s} contains no text, a
        single text widget if all of the text has the same style, and
        otherwise a sequence widget joining one text widget per style
        run.
    @raise ValueError: If the markup in C{s} is malformed.
    """
    textwidgets = []

    # Convert each piece of the word to a widget.
    for (italic, subscript, text) in parse_word(s):
        if not text:
            continue

        # Strip remaining backslashes
        text = re.sub(r'\\(.)', r'\1', text)

        # Decide on a font.  Floor division keeps the size integral
        # whichever division semantics are in effect.
        size = fontsize
        if subscript:
            size = size * 2 // 3
        slant = 'italic' if italic else 'roman'
        weight = 'bold' if bold else 'normal'
        font = tkFont.Font(family=basefont, size=size, weight=weight,
                           slant=slant)

        # Create the widget.
        textwidgets.append(TextWidget(canvas, text, font=font, color=color))

        # Record the metrics of the font that was used.
        metrics[basefont, size, weight, slant] = font.metrics()

    # If there's only one widget, return it; otherwise, use a
    # sequencewidget to join them.  Use align=bottom to make
    # subscripting work.
    if not textwidgets:
        w = SpaceWidget(canvas, 1, 1)
        w.set_width(0)
        w.set_height(0)
        return w
    if len(textwidgets) == 1:
        return textwidgets[0]
    return SequenceWidget(canvas, align='bottom', space=-2, *textwidgets)
# The canvas frame is created lazily and shared by all calls to
# tree_to_ps.  If the name is already bound when this code runs (e.g.
# because the module is being reloaded), the existing value is kept.
try:
    _canvas_frame
except NameError:
    _canvas_frame = None

def tree_to_ps(s, outfile):
    """
    Render the tree string C{s} as postscript, and write it to the file
    named C{outfile}.

    @param s: The tree string (see the module docstring for the format).
    @param outfile: The name of the postscript file to write.
    @return: The widget's bounding box from index 2 onwards
        (presumably the right and bottom coordinates, i.e. the size of
        the rendered tree).
    @raise ValueError: If C{s} is not a well-formed tree string.
    """
    global _canvas_frame
    if _canvas_frame is None:
        _canvas_frame = CanvasFrame()

    # May throw ValueError:
    widget = tree_to_widget(s, _canvas_frame.canvas())

    _canvas_frame.canvas()['scrollregion'] = (0, 0, 1, 1)
    _canvas_frame.add_widget(widget)
    try:
        _canvas_frame.print_to_file(outfile)
        bbox = widget.bbox()
    finally:
        # The frame is shared between calls, so the widget must be
        # removed even if printing failed; otherwise it would show up
        # in the next tree that is rendered.
        _canvas_frame.destroy_widget(widget)
    return bbox[2:]
# # hmm.. a bit of a hack.. e.g., no backslashing allowed:
# from nltk.parse.tree import bracket_parse, Tree
# def tree_to_qtree(s):
# return '\Tree %s' % _tree_to_qtree(bracket_parse(s))
# def _tree_to_qtree(tree):
# if isinstance(tree, Tree):
# return ('[.{%s} %s ]' %
# (tree.node, ' '.join([_tree_to_qtree(c) for c in tree])))
# elif tree.startswith('<') and tree.endswith('>'):
# return '\qroof{%s}' % ' '.join([_tree_to_qtree(c) for c in tree])
# else:
# return tree
def run(cmd):
    """
    Run an external command and wait for it to finish.

    @param cmd: The command, as a list consisting of the program name
        followed by its arguments.
    @return: The command's exit code, as returned by
        C{subprocess.call}.  If C{subprocess} is not available, the
        value returned by C{os.system} is returned instead.
    """
    # Only the import is guarded, so that an ImportError raised for any
    # other reason is not mistaken for a missing subprocess module.
    try:
        import subprocess # requires python 2.4
    except ImportError:
        # NOTE: the arguments are joined without any quoting, so this
        # fallback breaks on arguments that contain spaces or shell
        # metacharacters.
        return os.system(' '.join(cmd))
    return subprocess.call(cmd)
def tree_to_image(s, outfile, density=72):
    """
    Render the treebank-like tree C{s}, and write the rendered image
    to an image file using the given filename (C{outfile}).  Output
    files are generated using Imagemagick 'convert', so the output
    file format will be automatically determined from C{outfile}'s
    extension.

    A pickled cache file (C{treecache.pickle}, stored in the same
    directory as C{outfile}) records the tree string and density that
    each output file was rendered from.  If C{outfile} exists and was
    rendered from the same C{s} and C{density}, nothing is done.

    @param s: The tree string (see the module docstring for the format).
    @param outfile: The name of the image file to write.
    @param density: The value passed to C{convert}'s C{-density} option.
    @raise ValueError: If C{s} is not a well-formed tree string.
    """
    # Check the cache.  If it's already there, do nothing.
    cachefile = os.path.join(os.path.split(outfile)[0],
                             'treecache.pickle')
    cache = {}
    if os.path.exists(cachefile):
        # NOTE: unpickling executes whatever the file asks for, so the
        # cache file must come from a trusted location.
        try:
            infile = open(cachefile, 'rb')
            try:
                cache = pickle.load(infile)
            finally:
                infile.close()
        except Exception:
            # An unreadable or corrupt cache is simply rebuilt.
            cache = {}
        if not isinstance(cache, dict):
            cache = {}
        # A cache entry is only trusted if the output file still exists.
        if (cache.get(outfile, None) == (s, density) and
                os.path.exists(outfile)):
            return

    # mkstemp creates the file atomically (mktemp only picks a name).
    # The descriptor is not needed, because the file is written by name.
    fd, psfile = tempfile.mkstemp(suffix='.ps')
    os.close(fd)
    try:
        # Draw the tree to postscript
        tree_to_ps(s, psfile)

        # Convert to the desired format.
        if outfile.endswith('.ps'):
            cmd = ['cp', psfile, outfile]
        else:
            cmd = [CONVERT,
                   '-density', str(density),
                   '-crop', '0x0',
                   psfile, outfile]
        status = run(cmd)
    finally:
        if os.path.exists(psfile):
            os.remove(psfile)

    # A non-zero exit status means that the output was not produced;
    # leave the cache untouched, so that the next call tries again.  (A
    # status of None, from a run() that does not report one, is treated
    # as success.)
    if status:
        return

    # Update the cache
    cache[outfile] = (s, density)
    out = open(cachefile, 'wb')
    try:
        pickle.dump(cache, out)
    finally:
        out.close()
def cli():
    """
    Command-line entry point: read a tree string from the file named by
    the first argument, and render it to the image file named by the
    second argument.

    If the number of arguments is wrong, print a usage message and
    exit with status -1.
    """
    if len(sys.argv) != 3:
        # A single parenthesized argument prints the same text whether
        # print is a statement or a function.
        print('Usage: %s <infile> <outfile>' % sys.argv[0])
        sys.exit(-1)
    print('%s -> %s' % (sys.argv[1], sys.argv[2]))

    # Close the input file explicitly instead of leaking the handle.
    infile = open(sys.argv[1])
    try:
        tree_string = infile.read()
    finally:
        infile.close()
    tree_to_image(tree_string, sys.argv[2])

if __name__ == '__main__':
    cli()
Jump to Line
Something went wrong with that request. Please try again.