Permalink
Browse files

More stuff working including treewalkers, parts of parse.py dom, (c)ElementTree

--HG--
branch : svgmathml
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401266
  • Loading branch information...
1 parent 495b92b commit aa58129690b28d5e97ec68f63230e26fde9d7ac1 jgraham.html committed Feb 2, 2009
View
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3.0
"""usage: %prog [options] filename
Parse a document to a simpletree tree, with optional profiling
@@ -9,11 +9,16 @@
import os
from optparse import OptionParser
+print(sys.stdout.encoding)
+
#RELEASE remove
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
#END RELEASE
-from html5lib import html5parser, liberalxmlparser, sanitizer
+print(sys.path)
+import html5lib
+import html5lib.html5parser as html5parser
from html5lib.tokenizer import HTMLTokenizer
+from html5lib import treebuilders
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
@@ -27,8 +32,8 @@ def parse():
# Try opening from the internet
if f.startswith('http://'):
try:
- import urllib, cgi
- f = urllib.urlopen(f)
+ from urllib import request
+ f = request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
@@ -39,7 +44,7 @@ def parse():
else:
try:
# Try opening from file system
- f = open(f)
+ f = open(f, "rb")
except IOError: pass
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
@@ -64,16 +69,16 @@ def parse():
if opts.profile:
#XXX should import cProfile instead and use that
- import hotshot
- import hotshot.stats
- prof = hotshot.Profile('stats.prof')
- prof.runcall(parseMethod, f, encoding=encoding)
+ try:
+ import cProfile as profile
+ except ImportError:
+ import profile
+ import pstats
+ prof = profile.run('parseMethod(f, encoding=encoding)', 'prof.out')
prof.close()
# XXX - We should use a temp file here
- stats = hotshot.stats.load('stats.prof')
- stats.strip_dirs()
- stats.sort_stats('time')
- stats.print_stats()
+ stats = pstats.stats('prof.out')
+ stats.strip_dirs().sort_stats('time').print_stats()
elif opts.time:
import time
t0 = time.time()
@@ -88,13 +93,14 @@ def parse():
def printOutput(parser, document, opts):
if opts.encoding:
- print "Encoding:", parser.tokenizer.stream.charEncoding
+ print("Encoding:", parser.tokenizer.stream.charEncoding)
if opts.xml:
sys.stdout.write(document.toxml("utf-8"))
elif opts.tree:
if not hasattr(document,'__getitem__'): document = [document]
for fragment in document:
- print parser.tree.testSerializer(fragment).encode("utf-8")
+ sys.stdout.write(parser.tree.testSerializer(fragment))
+ sys.stdout.write("\n")
elif opts.hilite:
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
@@ -103,7 +109,7 @@ def printOutput(parser, document, opts):
kwargs[opt] = getattr(opts,opt)
if not kwargs['quote_char']: del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
- for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
+ for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
View
@@ -10,10 +10,9 @@
f = open("my_document.html")
tree = html5lib.parse(f)
"""
-print(__path__)
-#from .html5parser import HTMLParser, parse
-#from treebuilders import getTreeBuilder
+from .html5parser import HTMLParser, parse
+from .treebuilders import getTreeBuilder
#from .liberalxmlparser import XMLParser, XHTMLParser
@@ -4,10 +4,10 @@
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
-import _base
+from . import _base
from html5lib.constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
@@ -29,13 +29,13 @@ def __iter__(self):
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
- if n == u"name":
+ if n == "name":
field_name = v.strip(spaceCharacters)
- elif n == u"type":
+ elif n == "type":
field_type = v.strip(spaceCharacters)
- elif n == u"checked":
+ elif n == "checked":
input_checked_index = i
- elif n == u"value":
+ elif n == "value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
@@ -45,20 +45,20 @@ def __iter__(self):
else:
value = ""
- if field_type in (u"checkbox", u"radio"):
+ if field_type in ("checkbox", "radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
- token["data"].append((u"checked", u""))
+ token["data"].append(("checked", ""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
- elif field_type not in (u"button", u"submit", u"reset"):
+ elif field_type not in ("button", "submit", "reset"):
if input_value_index >= 0:
- token["data"][input_value_index] = (u"value", value)
+ token["data"][input_value_index] = ("value", value)
else:
- token["data"].append((u"value", value))
+ token["data"].append(("value", value))
field_indices[field_name] = field_index + 1
field_type = None
@@ -96,7 +96,7 @@ def __iter__(self):
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
- token["data"].append((u"selected", u""))
+ token["data"].append(("selected", ""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
@@ -1,4 +1,4 @@
-import _base
+from . import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
@@ -23,7 +23,7 @@ def __iter__(self):
content_index = -1
for i,(name,value) in enumerate(token["data"]):
if name.lower() == 'charset':
- token["data"][i] = (u'charset', self.encoding)
+ token["data"][i] = ('charset', self.encoding)
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
@@ -32,7 +32,7 @@ def __iter__(self):
content_index = i
else:
if has_http_equiv_content_type and content_index >= 0:
- token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
+ token["data"][content_index] = ('content', 'text/html; charset=%s' % self.encoding)
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
@@ -746,4 +746,4 @@ def isValidLangCode(value):
lang, sublang = value.split('-', 1)
else:
lang = value
- return isoLang.has_key(unicode.lower(unicode(lang)))
+ return str.lower(str(lang)) in isoLang
@@ -1,11 +1,11 @@
from gettext import gettext
_ = gettext
-import _base
+from . import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
class LintError(Exception): pass
@@ -19,22 +19,22 @@ def __iter__(self):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty tag name"))
+ raise LintError(_("Empty tag name"))
if type == "StartTag" and name in voidElements:
- raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+ raise LintError(_("Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
- raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+ raise LintError(_("Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
- if not isinstance(name, unicode):
+ if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty attribute name"))
- if not isinstance(value, unicode):
+ raise LintError(_("Empty attribute name"))
+ if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
@@ -45,15 +45,15 @@ def __iter__(self):
elif type == "EndTag":
name = token["name"]
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty tag name"))
+ raise LintError(_("Empty tag name"))
if name in voidElements:
- raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+ raise LintError(_("Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
- raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+ raise LintError(_("EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
@@ -62,27 +62,27 @@ def __iter__(self):
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
- if not isinstance(data, unicode):
+ if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
- raise LintError(_(u"%s token with empty data") % type)
+ raise LintError(_("%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
- raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
+ raise LintError(_("Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
- raise LintError(_(u"Unknown token type: %s") % type)
+ raise LintError(_("Unknown token type: %s") % type)
yield token
@@ -1,4 +1,4 @@
-import _base
+from . import _base
class Filter(_base.Filter):
def slider(self):
@@ -1,4 +1,4 @@
-import _base
+from . import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
Oops, something went wrong.

0 comments on commit aa58129

Please sign in to comment.