Skip to content
Permalink
Browse files Browse the repository at this point in the history
Resolve entities defined in DOCTYPE recursively
Entity expansion loops are detected and avoided.

In addition, there is a limit on the length of an
entity expansion (to guard against attacks like the
billion laughs attack). This can be adjusted using
the new exported field `psEntityExpansionSizeLimit`
on ParseSettings. By default it is set at 8192.

Add tests, including a test for the billion laughs attack.
  • Loading branch information
jgm committed Feb 26, 2021
1 parent adad7c7 commit 4be1021
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 7 deletions.
52 changes: 45 additions & 7 deletions xml-conduit/src/Text/XML/Stream/Parse.hs
Expand Up @@ -81,6 +81,7 @@ module Text.XML.Stream.Parse
, psDecodeEntities
, psDecodeIllegalCharacters
, psRetainNamespaces
, psEntityExpansionSizeLimit
-- *** Entity decoding
, decodeXmlEntities
, decodeHtmlEntities
Expand Down Expand Up @@ -196,7 +197,7 @@ tokenToEvent ps es n (TokenBeginElement name as isClosed _) =

addNS
| not (psRetainNamespaces ps) && (isPrefixed || isUnprefixed) = id
| otherwise = (((tname, map resolve val):) .)
| otherwise = (((tname, resolveEntities ps es val):) .)
where
tname
| isPrefixed = TName Nothing ("xmlns:" `T.append` kname)
Expand All @@ -212,9 +213,6 @@ tokenToEvent ps es n (TokenBeginElement name as isClosed _) =
else Just $ contentsToText val }
| otherwise = l

resolve (ContentEntity e)
| Just t <- lookup e es = ContentText t
resolve c = c
n' = if isClosed then n else l' : n
fixAttName (name', val) = (tnameToName True l' name', val)
elementName = tnameToName False l' name
Expand All @@ -227,13 +225,44 @@ tokenToEvent _ es n (TokenEndElement name) =
case n of
[] -> (NSLevel Nothing Map.empty, [])
x:xs -> (x, xs)
tokenToEvent _ es n (TokenContent (ContentEntity e))
| Just t <- lookup e es = (es, n, [EventContent $ ContentText t])
tokenToEvent ps es n (TokenContent (ContentEntity e))
= (es, n, map EventContent (resolveEntities ps es [ContentEntity e]))
tokenToEvent _ es n (TokenContent c) = (es, n, [EventContent c])
tokenToEvent _ es n (TokenComment c) = (es, n, [EventComment c])
tokenToEvent _ es n (TokenDoctype t eid es') = (es ++ es', n, [EventBeginDoctype t eid, EventEndDoctype])
tokenToEvent _ es n (TokenCDATA t) = (es, n, [EventCDATA t])

resolveEntities :: ParseSettings
-> [(Text, Text)] -- entity table
-> [Content]
-> [Content]
resolveEntities ps entities = foldr go []
where
go c@(ContentEntity e) cs
= case expandEntity entities e of
Just xs -> foldr go cs xs
Nothing -> c : cs
go c cs = c:cs
expandEntity es e
| Just t <- lookup e es =
case AT.parseOnly (manyTill
(parseContent ps False False :: Parser Content)
AT.endOfInput) t of
Left _ -> Nothing
Right xs -> let es' = filter (\(x,_) -> x /= e) es
in fst <$> foldr (goent es') (Just ([], 0)) xs
-- we delete e from the entity map in resolving its contents,
-- to avoid infinite loops in recursive expansion.
| otherwise = Nothing
goent _ _ Nothing = Nothing
goent es (ContentEntity e) (Just (cs, size))
= expandEntity es e >>= foldr (goent es) (Just (cs, size))
goent _ c@(ContentText t) (Just (cs, size)) =
case size + T.length t of
n | n > psEntityExpansionSizeLimit ps -> Nothing
| otherwise -> Just (c:cs, size + T.length t)


tnameToName :: Bool -> NSLevel -> TName -> Name
tnameToName _ _ (TName (Just "xml") name) =
Name name (Just "http://www.w3.org/XML/1998/namespace") (Just "xml")
Expand Down Expand Up @@ -407,13 +436,22 @@ data ParseSettings = ParseSettings
-- Default: @const Nothing@
--
-- Since 1.7.1
, psEntityExpansionSizeLimit :: Int
-- ^ Maximum number of characters allowed in expanding an
-- internal entity. This is intended to protect against the
-- billion laughs attack.
--
-- Default: @8192@
--
-- Since 1.9.1
}

instance Default ParseSettings where
def = ParseSettings
{ psDecodeEntities = decodeXmlEntities
, psRetainNamespaces = False
, psDecodeIllegalCharacters = const Nothing
, psEntityExpansionSizeLimit = 8192
}

conduitToken :: MonadThrow m => ParseSettings -> ConduitT T.Text (PositionRange, Token) m ()
Expand Down Expand Up @@ -555,7 +593,7 @@ parseContent :: ParseSettings
-> Bool -- break on double quote
-> Bool -- break on single quote
-> Parser Content
parseContent (ParseSettings decodeEntities _ decodeIllegalCharacters) breakDouble breakSingle = parseReference <|> parseTextContent where
parseContent (ParseSettings decodeEntities _ decodeIllegalCharacters _) breakDouble breakSingle = parseReference <|> parseTextContent where
parseReference = do
char' '&'
t <- parseEntityRef <|> parseHexCharRef <|> parseDecCharRef
Expand Down
35 changes: 35 additions & 0 deletions xml-conduit/test/unit.hs
Expand Up @@ -81,6 +81,10 @@ main = hspec $ do
it "works for resolvable entities" resolvedAllGood
it "merges adjacent content nodes" resolvedMergeContent
it "understands inline entity declarations" resolvedInline
it "can expand inline entities recursively" resolvedInlineRecursive
it "doesn't explode with an inline entity loop" resolvedInlineLoop
it "doesn't explode with the billion laughs attack" billionLaughs
it "allows entity expansion size limit to be adjusted" thousandLaughs
it "doesn't break on [] in doctype comments" doctypeComment
it "skips element declarations in doctype" doctypeElements
it "skips processing instructions in doctype" doctypePI
Expand Down Expand Up @@ -736,6 +740,37 @@ resolvedInline = do
Res.Document _ root2 _ <- return $ Res.parseLBS_ Res.def "<!DOCTYPE foo [<!ENTITY bar \"baz\">]><foo bar='&bar;'/>"
root2 @?= Res.Element "foo" (Map.singleton "bar" "baz") []

resolvedInlineRecursive :: Assertion
resolvedInlineRecursive = do
Res.Document _ root _ <- return $ Res.parseLBS_ Res.def
"<!DOCTYPE foo [<!ENTITY bim \"baz\"><!ENTITY bar \"&bim;&#73;&amp;\">]><foo>&bar;</foo>"
root @?= Res.Element "foo" Map.empty [Res.NodeContent "bazI&"]

resolvedInlineLoop :: Assertion
resolvedInlineLoop = do
res <- return $ Res.parseLBS Res.def
"<!DOCTYPE foo [<!ENTITY bim \"&bim;\">]><foo>&bim;</foo>"
Left (toException $ Res.UnresolvedEntityException (Set.fromList ["bim"]))
`showEq` res

billionLaughs :: Assertion
billionLaughs = do
res <- return $ Res.parseLBS Res.def
"<?xml version=\"1.0\"?><!DOCTYPE lolz [<!ENTITY lol \"lol\"><!ELEMENT lolz (#PCDATA)><!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\"><!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\"><!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\"><!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\"><!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\"><!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\"><!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\"><!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\"><!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">]><lolz>&lol9;</lolz>"
Left (toException $ Res.UnresolvedEntityException (Set.fromList ["lol9"]))
`showEq` res

thousandLaughs :: Assertion
thousandLaughs = do
res <- return $ Res.parseLBS Res.def{ P.psEntityExpansionSizeLimit = 2999 }
"<?xml version=\"1.0\"?><!DOCTYPE lolz [<!ENTITY lol \"lol\"><!ELEMENT lolz (#PCDATA)><!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\"><!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\"><!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">]><lolz>&lol3;</lolz>"
Left (toException $ Res.UnresolvedEntityException (Set.fromList ["lol3"]))
`showEq` res
-- Raise the entity expansion limit and it should work:
Right (Res.Document {Res.documentRoot = Res.Element{ Res.elementNodes = [Res.NodeContent t] }}) <- return $ Res.parseLBS Res.def{ P.psEntityExpansionSizeLimit = 3001 } "<?xml version=\"1.0\"?><!DOCTYPE lolz [<!ENTITY lol \"lol\"><!ELEMENT lolz (#PCDATA)><!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\"><!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\"><!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">]><lolz>&lol3;</lolz>"
t @?= T.replicate 1000 "lol"


doctypeComment :: Assertion
doctypeComment = do
Res.Document _ root _ <- return $ Res.parseLBS_
Expand Down

0 comments on commit 4be1021

Please sign in to comment.