diff --git a/Text/HTML/TagStream/ByteString.hs b/Text/HTML/TagStream/ByteString.hs
index 98ceb13..d2f07e5 100644
--- a/Text/HTML/TagStream/ByteString.hs
+++ b/Text/HTML/TagStream/ByteString.hs
@@ -3,18 +3,23 @@
{-# LANGUAGE TypeFamilies #-}
module Text.HTML.TagStream.ByteString where
-import Control.Applicative
-import Control.Monad (unless)
-
-import Data.Monoid (mconcat)
-import Data.ByteString (ByteString)
-import qualified Data.ByteString.Char8 as S
-import Data.Attoparsec.Char8
-import Data.Conduit
-
import qualified Blaze.ByteString.Builder as B
-import Text.HTML.TagStream.Types
-import Text.HTML.TagStream.Utils (splitAccum)
+import Control.Applicative
+import Control.Monad (unless)
+import Data.Attoparsec.Char8
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Char8 as S
+import Data.Conduit
+import qualified Data.Conduit.List as CL
+import Data.Default
+import Data.Functor.Identity (runIdentity)
+import Data.Monoid
+import Data.Text.Encoding
+import qualified Text.XML.Stream.Parse as XML
+
+import Text.HTML.TagStream.Entities
+import Text.HTML.TagStream.Types
+import Text.HTML.TagStream.Utils (splitAccum)
type Token = Token' ByteString
type Attr = Attr' ByteString
@@ -124,6 +129,22 @@ incomplete = Incomplete . S.cons '<' <$> takeByteString
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
+-- | Decode the HTML entities e.g. @&@ in some text into @&@.
+decodeEntitiesBS :: Monad m => Conduit Token m Token
+decodeEntitiesBS =
+ decodeEntities
+ Dec { decToS = B.toByteString
+ , decBreak = S.break
+ , decBuilder = B.fromByteString
+ , decDrop = S.drop
+ , decEntity = decodeEntity
+ , decUncons = S.uncons }
+ where decodeEntity entity =
+ fmap encodeUtf8
+ $ CL.sourceList ["&",entity,";"]
+ $= XML.parseBytes def { XML.psDecodeEntities = XML.decodeHtmlEntities }
+ $$ XML.content
+
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
@@ -153,7 +174,10 @@ html = tokens <|> pure []
_ -> (t:) <$> html
decode :: ByteString -> Either String [Token]
-decode = parseOnly html
+decode = fmap decodeEntitiesBS' . parseOnly html
+ where
+ decodeEntitiesBS' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesBS =$ CL.consume
+
{--
- Utils {{{
@@ -212,7 +236,7 @@ tokenStream :: Monad m
=> GInfConduit ByteString m Token
#endif
tokenStream =
- loop S.empty
+ loop S.empty =$= decodeEntitiesBS
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs
new file mode 100644
index 0000000..cd92b14
--- /dev/null
+++ b/Text/HTML/TagStream/Entities.hs
@@ -0,0 +1,137 @@
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+-- | HTML entity decoding.
+
+module Text.HTML.TagStream.Entities
+ (Dec(..)
+ ,isNameChar
+ ,isNameStart
+ ,decodeEntities)
+ where
+
+import Data.Char
+import Data.Monoid
+import Data.String
+import Data.Conduit
+import Text.HTML.TagStream.Types
+
+import qualified Data.Conduit.List as CL
+import Data.Maybe (fromMaybe, isJust)
+import Control.Arrow (first)
+
+-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
+decodeEntities :: (Monad m
+ ,Monoid builder
+ ,Monoid string
+ ,IsString string
+ ,Eq string)
+ => Dec builder string
+ -> Conduit (Token' string) m (Token' string)
+decodeEntities dec =
+ start
+ where
+ start = await >>= maybe (return ()) (\token -> start' token >> start)
+ start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go
+ start' token = yield token
+
+ go t
+ | t == "" = Nothing
+ | otherwise = Just (Text t)
+
+decodeEntities' :: (Monad m
+ ,Monoid string
+ ,IsString string
+ ,Monoid builder
+ ,Eq string)
+ => Dec builder string
+ -> Conduit string m string
+decodeEntities' dec =
+ loop id
+ where
+ loop accum = do
+ mchunk <- await
+ let chunk = accum $ fromMaybe mempty mchunk
+ (newStr, remainder) = makeEntityDecoder dec chunk
+ yield newStr
+ if isJust mchunk
+ then loop (mappend remainder)
+ else yield remainder
+
+-- | Yield contiguous text tokens as strings.
+yieldWhileText :: Monad m => Conduit (Token' string) m string
+yieldWhileText =
+ loop
+ where
+ loop = await >>= maybe (return ()) go
+ go (Text t) = yield t >> loop
+ go token = leftover token
+
+-- | A decoder.
+data Dec builder string = Dec
+ { decToS :: builder -> string
+ , decBreak :: (Char -> Bool) -> string -> (string,string)
+ , decBuilder :: string -> builder
+ , decDrop :: Int -> string -> string
+ , decEntity :: string -> Maybe string
+ , decUncons :: string -> Maybe (Char,string)
+ }
+
+-- | Decode the entities in a string type with a decoder.
+makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
+ => Dec builder string -> string -> (string, string)
+makeEntityDecoder Dec{..} = first decToS . go
+ where
+ go s =
+ case decBreak (=='&') s of
+ (_,"") -> (decBuilder s, "")
+ (before,restPlusAmp@(decDrop 1 -> rest)) ->
+ case decBreak (not . (\c -> isNameChar c || c == '#')) rest of
+ (_,"") -> (decBuilder before, restPlusAmp)
+ (entity,after) -> (before1 <> before2, after')
+ where
+ before1 = decBuilder before
+ (before2, after') =
+ case mdecoded of
+ Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after)
+ Just (decBuilder -> decoded) ->
+ case decUncons after of
+ Just (';',validAfter) -> first (decoded <>) (go validAfter)
+ Just (_invalid,_rest) -> first (decoded <>) (go after)
+ Nothing -> (mempty, s)
+ mdecoded =
+ if entity == mempty
+ then Nothing
+ else decEntity entity
+
+-- | Is the character a valid Name starter?
+isNameStart :: Char -> Bool
+isNameStart c =
+ c == ':' ||
+ c == '_' ||
+ isAsciiUpper c ||
+ isAsciiLower c ||
+ (c >= '\xC0' && c <= '\xD6') ||
+ (c >= '\xD8' && c <= '\xF6') ||
+ (c >= '\xF8' && c <= '\x2FF') ||
+ (c >= '\x370' && c <= '\x37D') ||
+ (c >= '\x37F' && c <= '\x1FFF') ||
+ (c >= '\x200C' && c <= '\x200D') ||
+ (c >= '\x2070' && c <= '\x218F') ||
+ (c >= '\x2C00' && c <= '\x2FEF') ||
+ (c >= '\x3001' && c <= '\xD7FF') ||
+ (c >= '\xF900' && c <= '\xFDCF') ||
+ (c >= '\xFDF0' && c <= '\xFFFD') ||
+ (c >= '\x10000' && c <= '\xEFFFF')
+
+-- | Is the character valid in a Name?
+isNameChar :: Char -> Bool
+isNameChar c =
+ c == '-' ||
+ c == '.' ||
+ c == '\xB7' ||
+ isDigit c ||
+ isNameStart c ||
+ (c >= '\x0300' && c <= '\x036F') ||
+ (c >= '\x203F' && c <= '\x2040')
diff --git a/Text/HTML/TagStream/Text.hs b/Text/HTML/TagStream/Text.hs
index 8b1dcbc..3b8f7aa 100644
--- a/Text/HTML/TagStream/Text.hs
+++ b/Text/HTML/TagStream/Text.hs
@@ -3,36 +3,41 @@
{-# LANGUAGE TypeFamilies #-}
module Text.HTML.TagStream.Text where
-import Prelude hiding (mapM)
-import Control.Applicative
-import Control.Monad (unless, when, liftM)
-import Control.Monad.Trans.Class (lift)
-
-import Data.Traversable (mapM)
-import Data.Maybe (fromMaybe)
-import Data.Monoid (mconcat)
-import Data.Char (isSpace)
-import Data.ByteString (ByteString)
-import Data.Text (Text)
+import Control.Applicative
+import Control.Monad (unless, when, liftM)
+import Control.Monad.Trans.Class (lift)
+import Data.Char
+import qualified Data.Conduit.List as CL
+import Data.Default
+import Prelude hiding (mapM)
+
+import qualified Data.Attoparsec.ByteString.Char8 as S
+import Data.Attoparsec.Text
+import Data.ByteString (ByteString)
+import qualified Data.CaseInsensitive as CI
+import Data.Conduit
+import Data.Functor.Identity (runIdentity)
+import Data.Maybe (fromMaybe)
+import Data.Monoid (mconcat)
+import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
-import qualified Data.CaseInsensitive as CI
-import qualified Data.Attoparsec.ByteString.Char8 as S
-import Data.Attoparsec.Text
-import Data.Conduit
+import Data.Traversable (mapM)
+import qualified Text.XML.Stream.Parse as XML
#if MIN_VERSION_conduit(1, 0, 0)
-import Data.Conduit.Internal (unConduitM)
+import Data.Conduit.Internal (unConduitM)
#else
-import Data.Conduit.Internal (pipeL)
+import Data.Conduit.Internal (pipeL)
#endif
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C
import qualified Text.HTML.TagStream.ByteString as S
-import Text.HTML.TagStream.Types
-import Text.HTML.TagStream.Utils (splitAccum)
+import Text.HTML.TagStream.Entities
+import Text.HTML.TagStream.Types
+import Text.HTML.TagStream.Utils (splitAccum)
type Token = Token' Text
type Attr = Attr' Text
@@ -142,6 +147,22 @@ incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
+-- | Decode the HTML entities e.g. @&@ in some text into @&@.
+decodeEntitiesText :: Monad m => Conduit Token m Token
+decodeEntitiesText =
+ decodeEntities
+ Dec { decToS = L.toStrict . B.toLazyText
+ , decBreak = T.break
+ , decBuilder = B.fromText
+ , decDrop = T.drop
+ , decEntity = decodeEntity
+ , decUncons = T.uncons }
+ where decodeEntity entity =
+ CL.sourceList ["&",entity,";"]
+ $= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities }
+ $= CL.map snd
+ $$ XML.content
+
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
@@ -171,7 +192,9 @@ html = tokens <|> pure []
_ -> (t:) <$> html
decode :: Text -> Either String [Token]
-decode = parseOnly html
+decode = fmap decodeEntitiesText' . parseOnly html
+ where
+ decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume
{--
- Utils {{{
@@ -230,7 +253,7 @@ tokenStream :: Monad m
=> GInfConduit Text m Token
#endif
tokenStream =
- loop T.empty
+ loop T.empty =$= decodeEntitiesText
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
diff --git a/tagstream-conduit.cabal b/tagstream-conduit.cabal
index 65ce3c9..8f19a02 100644
--- a/tagstream-conduit.cabal
+++ b/tagstream-conduit.cabal
@@ -32,6 +32,7 @@ Library
, Text.HTML.TagStream.Text
, Text.HTML.TagStream.Types
, Text.HTML.TagStream.Utils
+ , Text.HTML.TagStream.Entities
Build-depends: base >= 4 && < 5
, bytestring
, text
@@ -42,6 +43,8 @@ Library
, blaze-builder
, blaze-builder-conduit >= 0.5 && < 1.1
, attoparsec-conduit >= 0.5
+ , xml-conduit >= 1.1.0.0
+ , data-default >= 0.5.0
test-suite test
hs-source-dirs: tests
diff --git a/tests/Tests.hs b/tests/Tests.hs
index 9c3ab22..0ae6822 100644
--- a/tests/Tests.hs
+++ b/tests/Tests.hs
@@ -2,23 +2,23 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Main where
-import Control.Applicative
+import Control.Applicative
-import Data.Monoid (Monoid(..))
-import Data.ByteString (ByteString)
+import Data.Monoid (Monoid(..),(<>))
+import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as S
-import Data.Text (Text)
+import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
-import Test.Hspec
-import Test.Hspec.QuickCheck (prop)
-import Test.HUnit hiding (Test)
-import Test.QuickCheck
+import Test.Hspec
+import Test.Hspec.QuickCheck (prop)
+import Test.HUnit hiding (Test)
+import Test.QuickCheck
-import Text.HTML.TagStream
+import Text.HTML.TagStream
import qualified Text.HTML.TagStream.ByteString as S
import qualified Text.HTML.TagStream.Text as T
@@ -205,7 +205,28 @@ testcases =
, ( "
" + , text "<" "<" + , text "aaa" "aaa" + , text "foo &" "foo &" + , text "foo &" "foo &" + , text "foo &" "foo &" ] + where text b a = ("
" <> b <> "
" + ,concat [[TagOpen "p" [] False],[Text a | not (T.null a)],[TagClose "p"]]) testChar :: Gen Char testChar = growingElements "<>/=\"' \t\r\nabcde\\" @@ -239,6 +260,7 @@ assertDecodeText s = do return tokens combineText :: Monoid s => [Token' s] -> [Token' s] -combineText [] = [] -combineText (Text t1 : Text t2 : xs) = combineText $ Text (mappend t1 t2) : xs -combineText (x:xs) = x : combineText xs +combineText = go + where go [] = [] + go (Text t1 : Text t2 : xs) = go $ Text (mappend t1 t2) : xs + go (x:xs) = x : go xs