Permalink
Browse files

Attempt at merging svgmathml branch to the default branch

--HG--
branch : svgmathml
rename : python/parse.py => python3/parse.py
rename : python/src/html5lib/__init__.py => python3/src/html5lib/__init__.py
rename : python/src/html5lib/constants.py => python3/src/html5lib/constants.py
rename : python/src/html5lib/filters/optionaltags.py => python3/src/html5lib/filters/optionaltags.py
rename : python/src/html5lib/html5parser.py => python3/src/html5lib/html5parser.py
rename : python/src/html5lib/inputstream.py => python3/src/html5lib/inputstream.py
rename : python/src/html5lib/sanitizer.py => python3/src/html5lib/sanitizer.py
rename : python/src/html5lib/serializer/__init__.py => python3/src/html5lib/serializer/__init__.py
rename : python/src/html5lib/tokenizer.py => python3/src/html5lib/tokenizer.py
rename : python/src/html5lib/treebuilders/etree_lxml.py => python3/src/html5lib/treebuilders/etree_lxml.py
rename : python/src/html5lib/treebuilders/simpletree.py => python3/src/html5lib/treebuilders/simpletree.py
rename : python/tests/test_encoding.py => python3/tests/test_encoding.py
rename : python/tests/test_parser.py => python3/tests/test_parser.py
rename : python/tests/test_tokenizer.py => python3/tests/test_tokenizer.py
  • Loading branch information...
1 parent aa58129 commit 76f5cd32b8e86d9d15f5b0ad7bf7f19e19cfc0a6 @jgraham jgraham committed May 30, 2009
View
@@ -57,7 +57,7 @@ def parse():
else:
tokenizer = HTMLTokenizer
- if opts.xml:
+ if opts.liberalxml:
p = liberalxmlparser.XHTMLParser(tree=treebuilder, tokenizer=tokenizer)
else:
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
@@ -1070,7 +1070,6 @@
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
- 'utf7': 'utf-7',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
@@ -31,7 +31,11 @@ def is_optional_start(self, tagname, previous, next):
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
- return type == "StartTag"
+ # XXX: we also omit the start tag if the head element is empty
+ if type in ("StartTag", "EmptyTag"):
+ return True
+ elif type == "EndTag":
+ return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
@@ -52,7 +56,7 @@ def is_optional_start(self, tagname, previous, next):
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
- if type == "StartTag":
+ if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
@@ -114,7 +118,7 @@ def is_optional_end(self, tagname, next):
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
- if type == "StartTag":
+ if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside', \
'blockquote', 'datagrid', 'dialog', 'dir', 'div', \
'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', \
@@ -108,7 +108,6 @@ def _parse(self, stream, innerHTML=False, container="div",
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
self.lastPhase = None
-
self.beforeRCDataPhase = None
CharactersToken = tokenTypes["Characters"]
@@ -120,6 +119,8 @@ def _parse(self, stream, innerHTML=False, container="div",
for token in self.normalizedTokens():
+ #print self.phase.__class__.__name__
+ #print token
type = token["type"]
if type == CharactersToken:
self.phase.processCharacters(token)
@@ -271,18 +272,6 @@ def __init__(self, parser, tree):
def processEOF(self):
raise NotImplementedError
- self.tree.generateImpliedEndTags()
- if len(self.tree.openElements) > 2:
- self.parser.parseError("expected-closing-tag-but-got-eof")
- elif len(self.tree.openElements) == 2 and\
- self.tree.openElements[1].name != "body":
- # This happens for framesets or something?
- self.parser.parseError("expected-closing-tag-but-got-eof")
- elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
- # XXX This is not what the specification says. Not sure what to do
- # here.
- self.parser.parseError("eof-in-innerhtml")
- # Betting ends.
def processComment(self, token):
# For most phases the following is correct. Where it's not it will be
@@ -318,7 +307,7 @@ class InitialPhase(Phase):
# this.
def processEOF(self):
self.parser.parseError("expected-doctype-but-got-eof")
- self.compatMode = "quirks"
+ self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
self.parser.phase.processEOF()
@@ -346,8 +335,9 @@ def processDoctype(self, token):
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
- if (not correct or token["name"] != "html"
- or publicId in
+
+ if ((not correct) or nameLower != "html"
+ or publicId in
("+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
@@ -419,19 +409,18 @@ def processDoctype(self, token):
"html")
or (publicId in
("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)
+ "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)
or (systemId != None and
- systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
- self.compatMode = "quirks"
+ systemId ==
+ "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+ self.parser.compatMode = "quirks"
elif (publicId in
- ("-//w3c//dtd xhtml 1.0 frameset//EN",
- "-//w3c//dtd xhtml 1.0 transitional//EN")
+ ("-//w3c//dtd xhtml 1.0 frameset//EN",
+ "-//w3c//dtd xhtml 1.0 transitional//EN")
or (publicId in
("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)):
- self.compatMode = "limited quirks"
+ "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)):
+ self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
@@ -440,7 +429,7 @@ def processSpaceCharacters(self, token):
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
- self.compatMode = "quirks"
+ self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
self.parser.phase.processCharacters(token)
@@ -595,7 +584,8 @@ def startTagMeta(self, token):
codec = inputstream.codecName(attributes["charset"])
self.parser.tokenizer.stream.changeEncoding(codec)
elif "content" in attributes:
- data = inputstream.EncodingBytes(attributes["content"])
+ data = inputstream.EncodingBytes(
+ attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
parser = inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
@@ -1,6 +1,7 @@
import codecs
import re
import types
+import sys
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
@@ -188,7 +189,8 @@ def openStream(self, source):
import io
stream = io.BytesIO(bytes(source))
- if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
+ if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
+ stream is sys.stdin):
stream = BufferedStream(stream)
return stream
@@ -452,6 +454,9 @@ class EncodingBytes(bytes):
"""Bytes-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
+ def __new__(self, value):
+ return str.__new__(self, value)
+
def __init__(self, value):
self._position = -1
@@ -152,7 +152,7 @@ def sanitize_token(self, token):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
- if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) or
+ if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
@@ -142,7 +142,7 @@ def consumeNumberEntity(self, isHex):
# Certain characters get replaced with U+FFFD
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
or (0x007F <= charAsInt <= 0x009F)
- or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
+ or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
or (0x10FFFF < charAsInt)):
char = "\uFFFD"
@@ -142,7 +142,8 @@ def buildTestSuite():
def testFunc(self, innerHTML=innerHTML, input=input,
expected=expected, errors=errors, treeCls=treeCls):
return self.runParserTest(innerHTML, input, expected, errors, treeCls)
- setattr(TestCase, "test_%s_%d_%s" % (testName,index+1,treeName),
+ testFunc.__name__ = "test_%s_%d_%s" % (testName,index+1,treeName)
+ setattr(TestCase, testFunc.__name__,
testFunc)
return unittest.TestLoader().loadTestsFromTestCase(TestCase)

0 comments on commit 76f5cd3

Please sign in to comment.