From b4aa1bad8c8c6a89e584c08c7d128c8e90474fd9 Mon Sep 17 00:00:00 2001 From: Chris Done Date: Mon, 27 Jan 2014 19:46:48 +0100 Subject: [PATCH 1/5] Add HTML entity decoding --- Text/HTML/TagStream/ByteString.hs | 41 ++++++++++++----- Text/HTML/TagStream/Entities.hs | 75 +++++++++++++++++++++++++++++++ Text/HTML/TagStream/Text.hs | 59 ++++++++++++++++-------- tagstream-conduit.cabal | 3 ++ tests/Tests.hs | 23 ++++++---- 5 files changed, 164 insertions(+), 37 deletions(-) create mode 100644 Text/HTML/TagStream/Entities.hs diff --git a/Text/HTML/TagStream/ByteString.hs b/Text/HTML/TagStream/ByteString.hs index 98ceb13..281d496 100644 --- a/Text/HTML/TagStream/ByteString.hs +++ b/Text/HTML/TagStream/ByteString.hs @@ -3,18 +3,22 @@ {-# LANGUAGE TypeFamilies #-} module Text.HTML.TagStream.ByteString where -import Control.Applicative -import Control.Monad (unless) - -import Data.Monoid (mconcat) -import Data.ByteString (ByteString) +import qualified Blaze.ByteString.Builder as B +import Control.Applicative +import Control.Monad (unless) +import Data.Attoparsec.Char8 +import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as S -import Data.Attoparsec.Char8 -import Data.Conduit +import Data.Conduit +import qualified Data.Conduit.List as CL +import Data.Default +import Data.Monoid +import Data.Text.Encoding +import qualified Text.XML.Stream.Parse as XML -import qualified Blaze.ByteString.Builder as B -import Text.HTML.TagStream.Types -import Text.HTML.TagStream.Utils (splitAccum) +import Text.HTML.TagStream.Entities +import Text.HTML.TagStream.Types +import Text.HTML.TagStream.Utils (splitAccum) type Token = Token' ByteString type Attr = Attr' ByteString @@ -124,6 +128,23 @@ incomplete = Incomplete . S.cons '<' <$> takeByteString text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) +-- | Decode the HTML entities e.g. @&@ in some text into @&@. +decodeEntities :: ByteString -> ByteString +decodeEntities = + makeEntityDecoder + Dec { decToS = B.toByteString + , decBreak = S.break + , decBuilder = B.fromByteString + , decDrop = S.drop + , decEntity = decodeEntity + , decUncons = S.uncons } + where decodeEntity entity = + S.concat + $ map encodeUtf8 + $ CL.sourceList ["&",entity,";"] + $= XML.parseBytes def { XML.psDecodeEntities = XML.decodeHtmlEntities } + $$ XML.content + token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs new file mode 100644 index 0000000..ddc2afa --- /dev/null +++ b/Text/HTML/TagStream/Entities.hs @@ -0,0 +1,75 @@ +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE OverloadedStrings #-} + +-- | HTML entity decoding. + +module Text.HTML.TagStream.Entities + (Dec(..) + ,makeEntityDecoder + ,isNameChar + ,isNameStart) + where + +import Data.Char +import Data.Monoid +import Data.String + +-- | A decoder. +data Dec builder string = Dec + { decToS :: builder -> string + , decBreak :: (Char -> Bool) -> string -> (string,string) + , decBuilder :: string -> builder + , decDrop :: Int -> string -> string + , decEntity :: string -> string + , decUncons :: string -> Maybe (Char,string) + } + +-- | Decode the entities in a string type with a decoder. +makeEntityDecoder :: (Eq string,IsString string,Monoid builder) + => Dec builder string -> string -> string +makeEntityDecoder Dec{..} = decToS . go + where + go s = + case decBreak (=='&') s of + (_,"") -> decBuilder s + (before,decDrop 1 -> rest) -> + case decBreak (not . (\c -> isNameChar c || c == '#')) rest of + (_,"") -> decBuilder before <> decBuilder rest + (entity,after) -> + decBuilder before <> + decBuilder (decEntity entity) <> + go (case decUncons after of + Just (';',validAfter) -> validAfter + _ -> after) + +-- | Is the character a valid Name starter? +isNameStart :: Char -> Bool +isNameStart c = + c == ':' || + c == '_' || + isAsciiUpper c || + isAsciiLower c || + (c >= '\xC0' && c <= '\xD6') || + (c >= '\xD8' && c <= '\xF6') || + (c >= '\xF8' && c <= '\x2FF') || + (c >= '\x370' && c <= '\x37D') || + (c >= '\x37F' && c <= '\x1FFF') || + (c >= '\x200C' && c <= '\x200D') || + (c >= '\x2070' && c <= '\x218F') || + (c >= '\x2C00' && c <= '\x2FEF') || + (c >= '\x3001' && c <= '\xD7FF') || + (c >= '\xF900' && c <= '\xFDCF') || + (c >= '\xFDF0' && c <= '\xFFFD') || + (c >= '\x10000' && c <= '\xEFFFF') + +-- | Is the character valid in a Name? +isNameChar :: Char -> Bool +isNameChar c = + c == '-' || + c == '.' || + c == '\xB7' || + isDigit c || + isNameStart c || + (c >= '\x0300' && c <= '\x036F') || + (c >= '\x203F' && c <= '\x2040') diff --git a/Text/HTML/TagStream/Text.hs b/Text/HTML/TagStream/Text.hs index 8b1dcbc..8c72721 100644 --- a/Text/HTML/TagStream/Text.hs +++ b/Text/HTML/TagStream/Text.hs @@ -3,36 +3,40 @@ {-# LANGUAGE TypeFamilies #-} module Text.HTML.TagStream.Text where -import Prelude hiding (mapM) -import Control.Applicative -import Control.Monad (unless, when, liftM) -import Control.Monad.Trans.Class (lift) - -import Data.Traversable (mapM) -import Data.Maybe (fromMaybe) -import Data.Monoid (mconcat) -import Data.Char (isSpace) -import Data.ByteString (ByteString) -import Data.Text (Text) +import Control.Applicative +import Control.Monad (unless, when, liftM) +import Control.Monad.Trans.Class (lift) +import Data.Char +import qualified Data.Conduit.List as CL +import Data.Default +import Prelude hiding (mapM) + +import qualified Data.Attoparsec.ByteString.Char8 as S +import Data.Attoparsec.Text +import Data.ByteString (ByteString) +import qualified Data.CaseInsensitive as CI +import Data.Conduit +import Data.Maybe (fromMaybe) +import Data.Monoid (mconcat) +import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as B -import qualified Data.CaseInsensitive as CI -import qualified Data.Attoparsec.ByteString.Char8 as S -import Data.Attoparsec.Text -import Data.Conduit +import Data.Traversable (mapM) +import qualified Text.XML.Stream.Parse as XML #if MIN_VERSION_conduit(1, 0, 0) -import Data.Conduit.Internal (unConduitM) +import Data.Conduit.Internal (unConduitM) #else -import Data.Conduit.Internal (pipeL) +import Data.Conduit.Internal (pipeL) #endif import qualified Data.Conduit.List as C import qualified Data.Conduit.Attoparsec as C import qualified Data.Conduit.Text as C import qualified Text.HTML.TagStream.ByteString as S -import Text.HTML.TagStream.Types -import Text.HTML.TagStream.Utils (splitAccum) +import Text.HTML.TagStream.Entities +import Text.HTML.TagStream.Types +import Text.HTML.TagStream.Utils (splitAccum) type Token = Token' Text type Attr = Attr' Text @@ -142,6 +146,23 @@ incomplete = Incomplete . T.cons '<' <$> takeText text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) +-- | Decode the HTML entities e.g. @&@ in some text into @&@. +decodeEntities :: Text -> Text +decodeEntities = + makeEntityDecoder + Dec { decToS = L.toStrict . B.toLazyText + , decBreak = T.break + , decBuilder = B.fromText + , decDrop = T.drop + , decEntity = decodeEntity + , decUncons = T.uncons } + where decodeEntity entity = + T.concat + $ CL.sourceList ["&",entity,";"] + $= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities } + $= CL.map snd + $$ XML.content + token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text diff --git a/tagstream-conduit.cabal b/tagstream-conduit.cabal index 65ce3c9..8f19a02 100644 --- a/tagstream-conduit.cabal +++ b/tagstream-conduit.cabal @@ -32,6 +32,7 @@ Library , Text.HTML.TagStream.Text , Text.HTML.TagStream.Types , Text.HTML.TagStream.Utils + , Text.HTML.TagStream.Entities Build-depends: base >= 4 && < 5 , bytestring , text @@ -42,6 +43,8 @@ Library , blaze-builder , blaze-builder-conduit >= 0.5 && < 1.1 , attoparsec-conduit >= 0.5 + , xml-conduit >= 1.1.0.0 + , data-default >= 0.5.0 test-suite test hs-source-dirs: tests diff --git a/tests/Tests.hs b/tests/Tests.hs index 9c3ab22..87248bc 100644 --- a/tests/Tests.hs +++ b/tests/Tests.hs @@ -65,14 +65,14 @@ onePassTests = mapM_ one testcases where one (T.encodeUtf8 -> str, map encodeTokenUtf8 -> tokens) = it (S.unpack str) $ do - result <- combineText <$> assertDecodeBS str + result <- combineText S.decodeEntities <$> assertDecodeBS str assertEqual "one-pass parse result incorrect" tokens result onePassTestsText :: Spec onePassTestsText = mapM_ one testcases where one (str, tokens) = it (T.unpack str) $ do - result <- combineText <$> assertDecodeText str + result <- combineText T.decodeEntities <$> assertDecodeText str assertEqual "one-pass parse result incorrect" tokens result streamlineTests :: Spec @@ -84,7 +84,7 @@ streamlineTests = mapM_ one testcases it (S.unpack str) $ do -- streamline parse result don't contain the trailing Incomplete token. let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens - result <- combineText <$> C.runResourceT ( + result <- combineText S.decodeEntities <$> C.runResourceT ( CL.sourceList (map S.singleton (S.unpack str)) C.$= S.tokenStream C.$$ CL.consume ) @@ -99,7 +99,7 @@ streamlineTestsText = mapM_ one testcases it (S.unpack str) $ do -- streamline parse result don't contain the trailing Incomplete token. let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens - result <- combineText <$> C.runResourceT ( + result <- combineText T.decodeEntities <$> C.runResourceT ( CL.sourceList (map S.singleton (S.unpack str)) C.$= T.tokenStreamBS C.$$ CL.consume ) @@ -205,6 +205,10 @@ testcases = , ( " hello" , [TagOpen "foo" [] False, Text " hello", TagClose "foo"] ) + , ( "

foo"

" + , [TagOpen "p" [] False,Text "foo\"",TagClose "p"]) + , ( "&foo bar "mu <zot> < a &#; &#a; &hello;" + , [TagOpen "a" [] False, Text " bar \"mu < a ",TagClose "a"]) ] testChar :: Gen Char @@ -238,7 +242,10 @@ assertDecodeText s = do let (Right tokens) = result return tokens -combineText :: Monoid s => [Token' s] -> [Token' s] -combineText [] = [] -combineText (Text t1 : Text t2 : xs) = combineText $ Text (mappend t1 t2) : xs -combineText (x:xs) = x : combineText xs +combineText :: Monoid s => (s -> s) -> [Token' s] -> [Token' s] +combineText decode = map (onText decode) . go + where go [] = [] + go (Text t1 : Text t2 : xs) = go $ Text (mappend t1 t2) : xs + go (x:xs) = x : go xs + onText f (Text t) = Text (f t) + onText _ x = x From d4f1e87630bff37b518d0126d9ef2df18327f347 Mon Sep 17 00:00:00 2001 From: Michael Snoyman Date: Tue, 28 Jan 2014 11:20:15 +0000 Subject: [PATCH 2/5] Streaming decodeEntities --- Text/HTML/TagStream/ByteString.hs | 14 +++--- Text/HTML/TagStream/Entities.hs | 72 +++++++++++++++++++++++++------ Text/HTML/TagStream/Text.hs | 13 +++--- tests/Tests.hs | 16 +++---- 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/Text/HTML/TagStream/ByteString.hs b/Text/HTML/TagStream/ByteString.hs index 281d496..5443f58 100644 --- a/Text/HTML/TagStream/ByteString.hs +++ b/Text/HTML/TagStream/ByteString.hs @@ -12,6 +12,7 @@ import qualified Data.ByteString.Char8 as S import Data.Conduit import qualified Data.Conduit.List as CL import Data.Default +import Data.Functor.Identity (runIdentity) import Data.Monoid import Data.Text.Encoding import qualified Text.XML.Stream.Parse as XML @@ -129,9 +130,9 @@ text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) -- | Decode the HTML entities e.g. @&@ in some text into @&@. -decodeEntities :: ByteString -> ByteString -decodeEntities = - makeEntityDecoder +decodeEntitiesBS :: Monad m => Conduit Token m Token +decodeEntitiesBS = + decodeEntities Dec { decToS = B.toByteString , decBreak = S.break , decBuilder = B.fromByteString @@ -174,7 +175,10 @@ html = tokens <|> pure [] _ -> (t:) <$> html decode :: ByteString -> Either String [Token] -decode = parseOnly html +decode = fmap decodeEntitiesBS' . parseOnly html + where + decodeEntitiesBS' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesBS =$ CL.consume + {-- - Utils {{{ @@ -233,7 +237,7 @@ tokenStream :: Monad m => GInfConduit ByteString m Token #endif tokenStream = - loop S.empty + loop S.empty =$= decodeEntitiesBS where #if MIN_VERSION_conduit(1, 0, 0) loop accum = await >>= maybe (close accum ()) (push accum) diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs index ddc2afa..2c55717 100644 --- a/Text/HTML/TagStream/Entities.hs +++ b/Text/HTML/TagStream/Entities.hs @@ -6,14 +6,57 @@ module Text.HTML.TagStream.Entities (Dec(..) - ,makeEntityDecoder ,isNameChar - ,isNameStart) + ,isNameStart + ,decodeEntities) where import Data.Char import Data.Monoid import Data.String +import Data.Conduit +import Text.HTML.TagStream.Types +import qualified Data.Text as T +import qualified Data.Conduit.List as CL +import Data.Maybe (fromMaybe, isJust) +import Control.Arrow (first) + +decodeEntities' :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string) + => Dec builder string + -> Conduit string m string +decodeEntities' dec = + loop id + where + loop accum = do + mchunk <- await + let chunk = accum $ fromMaybe mempty mchunk + (newStr, remainder) = makeEntityDecoder dec chunk + yield newStr + if isJust mchunk + then loop (mappend remainder) + else yield remainder + +decodeEntities :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string) + => Dec builder string + -> Conduit (Token' string) m (Token' string) +decodeEntities dec = + start + where + start = await >>= maybe (return ()) (\token -> start' token >> start) + start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go + start' token = yield token + + go t + | t == "" = Nothing + | otherwise = Just (Text t) + +yieldWhileText :: Monad m => Conduit (Token' string) m string +yieldWhileText = + loop + where + loop = await >>= maybe (return ()) go + go (Text t) = yield t >> loop + go token = leftover token -- | A decoder. data Dec builder string = Dec @@ -26,22 +69,25 @@ data Dec builder string = Dec } -- | Decode the entities in a string type with a decoder. -makeEntityDecoder :: (Eq string,IsString string,Monoid builder) - => Dec builder string -> string -> string -makeEntityDecoder Dec{..} = decToS . go +makeEntityDecoder :: (Eq string,IsString string,Monoid builder,Show string) + => Dec builder string -> string -> (string, string) +makeEntityDecoder Dec{..} = first decToS . go where go s = case decBreak (=='&') s of - (_,"") -> decBuilder s - (before,decDrop 1 -> rest) -> + (_,"") -> (decBuilder s, "") + (before,restPlusAmp@(decDrop 1 -> rest)) -> case decBreak (not . (\c -> isNameChar c || c == '#')) rest of - (_,"") -> decBuilder before <> decBuilder rest + (_,"") -> (decBuilder before, restPlusAmp) (entity,after) -> - decBuilder before <> - decBuilder (decEntity entity) <> - go (case decUncons after of - Just (';',validAfter) -> validAfter - _ -> after) + let before1 = decBuilder before + decoded = decBuilder (decEntity entity) + (before2, after') = + case decUncons after of + Just (';',validAfter) -> first (decoded <>) (go validAfter) + Just (_invalid,_rest) -> first (decoded <>) (go after) + Nothing -> (mempty, s) + in (before1 <> before2, after') -- | Is the character a valid Name starter? isNameStart :: Char -> Bool diff --git a/Text/HTML/TagStream/Text.hs b/Text/HTML/TagStream/Text.hs index 8c72721..f2595df 100644 --- a/Text/HTML/TagStream/Text.hs +++ b/Text/HTML/TagStream/Text.hs @@ -16,6 +16,7 @@ import Data.Attoparsec.Text import Data.ByteString (ByteString) import qualified Data.CaseInsensitive as CI import Data.Conduit +import Data.Functor.Identity (runIdentity) import Data.Maybe (fromMaybe) import Data.Monoid (mconcat) import Data.Text (Text) @@ -147,9 +148,9 @@ text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) -- | Decode the HTML entities e.g. @&@ in some text into @&@. -decodeEntities :: Text -> Text -decodeEntities = - makeEntityDecoder +decodeEntitiesText :: Monad m => Conduit Token m Token +decodeEntitiesText = + decodeEntities Dec { decToS = L.toStrict . B.toLazyText , decBreak = T.break , decBuilder = B.fromText @@ -192,7 +193,9 @@ html = tokens <|> pure [] _ -> (t:) <$> html decode :: Text -> Either String [Token] -decode = parseOnly html +decode = fmap decodeEntitiesText' . parseOnly html + where + decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume {-- - Utils {{{ @@ -251,7 +254,7 @@ tokenStream :: Monad m => GInfConduit Text m Token #endif tokenStream = - loop T.empty + loop T.empty =$= decodeEntitiesText where #if MIN_VERSION_conduit(1, 0, 0) loop accum = await >>= maybe (close accum ()) (push accum) diff --git a/tests/Tests.hs b/tests/Tests.hs index 87248bc..b2162a1 100644 --- a/tests/Tests.hs +++ b/tests/Tests.hs @@ -65,14 +65,14 @@ onePassTests = mapM_ one testcases where one (T.encodeUtf8 -> str, map encodeTokenUtf8 -> tokens) = it (S.unpack str) $ do - result <- combineText S.decodeEntities <$> assertDecodeBS str + result <- combineText <$> assertDecodeBS str assertEqual "one-pass parse result incorrect" tokens result onePassTestsText :: Spec onePassTestsText = mapM_ one testcases where one (str, tokens) = it (T.unpack str) $ do - result <- combineText T.decodeEntities <$> assertDecodeText str + result <- combineText <$> assertDecodeText str assertEqual "one-pass parse result incorrect" tokens result streamlineTests :: Spec @@ -84,7 +84,7 @@ streamlineTests = mapM_ one testcases it (S.unpack str) $ do -- streamline parse result don't contain the trailing Incomplete token. let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens - result <- combineText S.decodeEntities <$> C.runResourceT ( + result <- combineText <$> C.runResourceT ( CL.sourceList (map S.singleton (S.unpack str)) C.$= S.tokenStream C.$$ CL.consume ) @@ -99,7 +99,7 @@ streamlineTestsText = mapM_ one testcases it (S.unpack str) $ do -- streamline parse result don't contain the trailing Incomplete token. let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens - result <- combineText T.decodeEntities <$> C.runResourceT ( + result <- combineText <$> C.runResourceT ( CL.sourceList (map S.singleton (S.unpack str)) C.$= T.tokenStreamBS C.$$ CL.consume ) @@ -205,8 +205,6 @@ testcases = , ( " hello" , [TagOpen "foo" [] False, Text " hello", TagClose "foo"] ) - , ( "

foo"

" - , [TagOpen "p" [] False,Text "foo\"",TagClose "p"]) , ( "&foo bar "mu <zot> < a &#; &#a; &hello;" , [TagOpen "a" [] False, Text " bar \"mu < a ",TagClose "a"]) ] @@ -242,10 +240,8 @@ assertDecodeText s = do let (Right tokens) = result return tokens -combineText :: Monoid s => (s -> s) -> [Token' s] -> [Token' s] -combineText decode = map (onText decode) . go +combineText :: Monoid s => [Token' s] -> [Token' s] +combineText = go where go [] = [] go (Text t1 : Text t2 : xs) = go $ Text (mappend t1 t2) : xs go (x:xs) = x : go xs - onText f (Text t) = Text (f t) - onText _ x = x From 9386d904baba2d8f4774f0e04213e5050014b932 Mon Sep 17 00:00:00 2001 From: Chris Done Date: Tue, 28 Jan 2014 15:41:20 +0100 Subject: [PATCH 3/5] Add comprehensive entity test-cases --- tests/Tests.hs | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/tests/Tests.hs b/tests/Tests.hs index b2162a1..5a0b674 100644 --- a/tests/Tests.hs +++ b/tests/Tests.hs @@ -2,23 +2,23 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} module Main where -import Control.Applicative +import Control.Applicative -import Data.Monoid (Monoid(..)) -import Data.ByteString (ByteString) +import Data.Monoid (Monoid(..),(<>)) +import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as S -import Data.Text (Text) +import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Conduit as C import qualified Data.Conduit.List as CL -import Test.Hspec -import Test.Hspec.QuickCheck (prop) -import Test.HUnit hiding (Test) -import Test.QuickCheck +import Test.Hspec +import Test.Hspec.QuickCheck (prop) +import Test.HUnit hiding (Test) +import Test.QuickCheck -import Text.HTML.TagStream +import Text.HTML.TagStream import qualified Text.HTML.TagStream.ByteString as S import qualified Text.HTML.TagStream.Text as T @@ -205,9 +205,25 @@ testcases = , ( " hello" , [TagOpen "foo" [] False, Text " hello", TagClose "foo"] ) - , ( "&foo bar "mu <zot> < a &#; &#a; &hello;" - , [TagOpen "a" [] False, Text " bar \"mu < a ",TagClose "a"]) + + -- Text entity decoding + , text "" "" + , text "&" "&" + , text "& hello" " hello" + , text "&" "&" + , text "&" "&" + , text """ "\"" + , text "&unknown;" "" + , text "foo &bar; mu" "foo mu" + , text "<p>" "

" + , text "<" "<" + , text "aaa" "aaa" + , text "foo &" "foo &" + , text "foo &" "foo &" + , text "foo &" "foo &" ] + where text b a = ("

" <> b <> "

" + ,concat [[TagOpen "p" [] False],[Text a | not (T.null a)],[TagClose "p"]]) testChar :: Gen Char testChar = growingElements "<>/=\"' \t\r\nabcde\\" From 50d11a4d549593b634b5969afe362a10b61461b9 Mon Sep 17 00:00:00 2001 From: Chris Done Date: Fri, 31 Jan 2014 11:20:25 +0100 Subject: [PATCH 4/5] Remove redundant constraints --- Text/HTML/TagStream/Entities.hs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs index 2c55717..9840718 100644 --- a/Text/HTML/TagStream/Entities.hs +++ b/Text/HTML/TagStream/Entities.hs @@ -21,7 +21,11 @@ import qualified Data.Conduit.List as CL import Data.Maybe (fromMaybe, isJust) import Control.Arrow (first) -decodeEntities' :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string) +decodeEntities' :: (Monad m + ,Monoid string + ,IsString string + ,Monoid builder + ,Eq string) => Dec builder string -> Conduit string m string decodeEntities' dec = @@ -36,7 +40,11 @@ decodeEntities' dec = then loop (mappend remainder) else yield remainder -decodeEntities :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string) +decodeEntities :: (Monad m + ,Monoid builder + ,Monoid string + ,IsString string + ,Eq string) => Dec builder string -> Conduit (Token' string) m (Token' string) decodeEntities dec = @@ -69,7 +77,7 @@ data Dec builder string = Dec } -- | Decode the entities in a string type with a decoder. -makeEntityDecoder :: (Eq string,IsString string,Monoid builder,Show string) +makeEntityDecoder :: (IsString string,Monoid builder,Eq string) => Dec builder string -> string -> (string, string) makeEntityDecoder Dec{..} = first decToS . go where From f01506f8c92ed4608e3dcbfa6da92a22f8731ccf Mon Sep 17 00:00:00 2001 From: Chris Done Date: Fri, 31 Jan 2014 12:39:22 +0100 Subject: [PATCH 5/5] Leave invalid entities unencoded, updated tests --- Text/HTML/TagStream/ByteString.hs | 3 +- Text/HTML/TagStream/Entities.hs | 68 +++++++++++++++++-------------- Text/HTML/TagStream/Text.hs | 3 +- tests/Tests.hs | 9 ++-- 4 files changed, 46 insertions(+), 37 deletions(-) diff --git a/Text/HTML/TagStream/ByteString.hs b/Text/HTML/TagStream/ByteString.hs index 5443f58..d2f07e5 100644 --- a/Text/HTML/TagStream/ByteString.hs +++ b/Text/HTML/TagStream/ByteString.hs @@ -140,8 +140,7 @@ decodeEntitiesBS = , decEntity = decodeEntity , decUncons = S.uncons } where decodeEntity entity = - S.concat - $ map encodeUtf8 + fmap encodeUtf8 $ CL.sourceList ["&",entity,";"] $= XML.parseBytes def { XML.psDecodeEntities = XML.decodeHtmlEntities } $$ XML.content diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs index 9840718..cd92b14 100644 --- a/Text/HTML/TagStream/Entities.hs +++ b/Text/HTML/TagStream/Entities.hs @@ -16,11 +16,30 @@ import Data.Monoid import Data.String import Data.Conduit import Text.HTML.TagStream.Types -import qualified Data.Text as T + import qualified Data.Conduit.List as CL import Data.Maybe (fromMaybe, isJust) import Control.Arrow (first) +-- | A conduit to decode entities from a stream of tokens into a new stream of tokens. +decodeEntities :: (Monad m + ,Monoid builder + ,Monoid string + ,IsString string + ,Eq string) + => Dec builder string + -> Conduit (Token' string) m (Token' string) +decodeEntities dec = + start + where + start = await >>= maybe (return ()) (\token -> start' token >> start) + start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go + start' token = yield token + + go t + | t == "" = Nothing + | otherwise = Just (Text t) + decodeEntities' :: (Monad m ,Monoid string ,IsString string @@ -40,24 +59,7 @@ decodeEntities' dec = then loop (mappend remainder) else yield remainder -decodeEntities :: (Monad m - ,Monoid builder - ,Monoid string - ,IsString string - ,Eq string) - => Dec builder string - -> Conduit (Token' string) m (Token' string) -decodeEntities dec = - start - where - start = await >>= maybe (return ()) (\token -> start' token >> start) - start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go - start' token = yield token - - go t - | t == "" = Nothing - | otherwise = Just (Text t) - +-- | Yield contiguous text tokens as strings. yieldWhileText :: Monad m => Conduit (Token' string) m string yieldWhileText = loop @@ -72,12 +74,12 @@ data Dec builder string = Dec , decBreak :: (Char -> Bool) -> string -> (string,string) , decBuilder :: string -> builder , decDrop :: Int -> string -> string - , decEntity :: string -> string + , decEntity :: string -> Maybe string , decUncons :: string -> Maybe (Char,string) } -- | Decode the entities in a string type with a decoder. -makeEntityDecoder :: (IsString string,Monoid builder,Eq string) +makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string) => Dec builder string -> string -> (string, string) makeEntityDecoder Dec{..} = first decToS . go where @@ -87,15 +89,21 @@ makeEntityDecoder Dec{..} = first decToS . go (before,restPlusAmp@(decDrop 1 -> rest)) -> case decBreak (not . (\c -> isNameChar c || c == '#')) rest of (_,"") -> (decBuilder before, restPlusAmp) - (entity,after) -> - let before1 = decBuilder before - decoded = decBuilder (decEntity entity) - (before2, after') = - case decUncons after of - Just (';',validAfter) -> first (decoded <>) (go validAfter) - Just (_invalid,_rest) -> first (decoded <>) (go after) - Nothing -> (mempty, s) - in (before1 <> before2, after') + (entity,after) -> (before1 <> before2, after') + where + before1 = decBuilder before + (before2, after') = + case mdecoded of + Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after) + Just (decBuilder -> decoded) -> + case decUncons after of + Just (';',validAfter) -> first (decoded <>) (go validAfter) + Just (_invalid,_rest) -> first (decoded <>) (go after) + Nothing -> (mempty, s) + mdecoded = + if entity == mempty + then Nothing + else decEntity entity -- | Is the character a valid Name starter? isNameStart :: Char -> Bool diff --git a/Text/HTML/TagStream/Text.hs b/Text/HTML/TagStream/Text.hs index f2595df..3b8f7aa 100644 --- a/Text/HTML/TagStream/Text.hs +++ b/Text/HTML/TagStream/Text.hs @@ -158,8 +158,7 @@ decodeEntitiesText = , decEntity = decodeEntity , decUncons = T.uncons } where decodeEntity entity = - T.concat - $ CL.sourceList ["&",entity,";"] + CL.sourceList ["&",entity,";"] $= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities } $= CL.map snd $$ XML.content diff --git a/tests/Tests.hs b/tests/Tests.hs index 5a0b674..0ae6822 100644 --- a/tests/Tests.hs +++ b/tests/Tests.hs @@ -209,12 +209,15 @@ testcases = -- Text entity decoding , text "" "" , text "&" "&" - , text "& hello" " hello" + , text "& hello" "& hello" , text "&" "&" , text "&" "&" , text """ "\"" - , text "&unknown;" "" - , text "foo &bar; mu" "foo mu" + , text "&unknown;" "&unknown;" + , text "a &unknown b" "a &unknown b" + , text "\"&unknown\"" "\"&unknown\"" + , text "foo &bar; mu" "foo &bar; mu" + , text "&foo; &bar "mu< zot &hello;" "&foo; &bar \"mu< zot &hello;" , text "<p>" "

" , text "<" "<" , text "aaa" "aaa"