diff --git a/Text/HTML/TagStream/ByteString.hs b/Text/HTML/TagStream/ByteString.hs index 98ceb13..d2f07e5 100644 --- a/Text/HTML/TagStream/ByteString.hs +++ b/Text/HTML/TagStream/ByteString.hs @@ -3,18 +3,23 @@ {-# LANGUAGE TypeFamilies #-} module Text.HTML.TagStream.ByteString where -import Control.Applicative -import Control.Monad (unless) - -import Data.Monoid (mconcat) -import Data.ByteString (ByteString) -import qualified Data.ByteString.Char8 as S -import Data.Attoparsec.Char8 -import Data.Conduit - import qualified Blaze.ByteString.Builder as B -import Text.HTML.TagStream.Types -import Text.HTML.TagStream.Utils (splitAccum) +import Control.Applicative +import Control.Monad (unless) +import Data.Attoparsec.Char8 +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as S +import Data.Conduit +import qualified Data.Conduit.List as CL +import Data.Default +import Data.Functor.Identity (runIdentity) +import Data.Monoid +import Data.Text.Encoding +import qualified Text.XML.Stream.Parse as XML + +import Text.HTML.TagStream.Entities +import Text.HTML.TagStream.Types +import Text.HTML.TagStream.Utils (splitAccum) type Token = Token' ByteString type Attr = Attr' ByteString @@ -124,6 +129,22 @@ incomplete = Incomplete . S.cons '<' <$> takeByteString text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) +-- | Decode the HTML entities e.g. @&@ in some text into @&@. +decodeEntitiesBS :: Monad m => Conduit Token m Token +decodeEntitiesBS = + decodeEntities + Dec { decToS = B.toByteString + , decBreak = S.break + , decBuilder = B.fromByteString + , decDrop = S.drop + , decEntity = decodeEntity + , decUncons = S.uncons } + where decodeEntity entity = + fmap encodeUtf8 + $ CL.sourceList ["&",entity,";"] + $= XML.parseBytes def { XML.psDecodeEntities = XML.decodeHtmlEntities } + $$ XML.content + token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text @@ -153,7 +174,10 @@ html = tokens <|> pure [] _ -> (t:) <$> html decode :: ByteString -> Either String [Token] -decode = parseOnly html +decode = fmap decodeEntitiesBS' . parseOnly html + where + decodeEntitiesBS' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesBS =$ CL.consume + {-- - Utils {{{ @@ -212,7 +236,7 @@ tokenStream :: Monad m => GInfConduit ByteString m Token #endif tokenStream = - loop S.empty + loop S.empty =$= decodeEntitiesBS where #if MIN_VERSION_conduit(1, 0, 0) loop accum = await >>= maybe (close accum ()) (push accum) diff --git a/Text/HTML/TagStream/Entities.hs b/Text/HTML/TagStream/Entities.hs new file mode 100644 index 0000000..cd92b14 --- /dev/null +++ b/Text/HTML/TagStream/Entities.hs @@ -0,0 +1,137 @@ +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE OverloadedStrings #-} + +-- | HTML entity decoding. + +module Text.HTML.TagStream.Entities + (Dec(..) + ,isNameChar + ,isNameStart + ,decodeEntities) + where + +import Data.Char +import Data.Monoid +import Data.String +import Data.Conduit +import Text.HTML.TagStream.Types + +import qualified Data.Conduit.List as CL +import Data.Maybe (fromMaybe, isJust) +import Control.Arrow (first) + +-- | A conduit to decode entities from a stream of tokens into a new stream of tokens. +decodeEntities :: (Monad m + ,Monoid builder + ,Monoid string + ,IsString string + ,Eq string) + => Dec builder string + -> Conduit (Token' string) m (Token' string) +decodeEntities dec = + start + where + start = await >>= maybe (return ()) (\token -> start' token >> start) + start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go + start' token = yield token + + go t + | t == "" = Nothing + | otherwise = Just (Text t) + +decodeEntities' :: (Monad m + ,Monoid string + ,IsString string + ,Monoid builder + ,Eq string) + => Dec builder string + -> Conduit string m string +decodeEntities' dec = + loop id + where + loop accum = do + mchunk <- await + let chunk = accum $ fromMaybe mempty mchunk + (newStr, remainder) = makeEntityDecoder dec chunk + yield newStr + if isJust mchunk + then loop (mappend remainder) + else yield remainder + +-- | Yield contiguous text tokens as strings. +yieldWhileText :: Monad m => Conduit (Token' string) m string +yieldWhileText = + loop + where + loop = await >>= maybe (return ()) go + go (Text t) = yield t >> loop + go token = leftover token + +-- | A decoder. +data Dec builder string = Dec + { decToS :: builder -> string + , decBreak :: (Char -> Bool) -> string -> (string,string) + , decBuilder :: string -> builder + , decDrop :: Int -> string -> string + , decEntity :: string -> Maybe string + , decUncons :: string -> Maybe (Char,string) + } + +-- | Decode the entities in a string type with a decoder. +makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string) + => Dec builder string -> string -> (string, string) +makeEntityDecoder Dec{..} = first decToS . go + where + go s = + case decBreak (=='&') s of + (_,"") -> (decBuilder s, "") + (before,restPlusAmp@(decDrop 1 -> rest)) -> + case decBreak (not . (\c -> isNameChar c || c == '#')) rest of + (_,"") -> (decBuilder before, restPlusAmp) + (entity,after) -> (before1 <> before2, after') + where + before1 = decBuilder before + (before2, after') = + case mdecoded of + Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after) + Just (decBuilder -> decoded) -> + case decUncons after of + Just (';',validAfter) -> first (decoded <>) (go validAfter) + Just (_invalid,_rest) -> first (decoded <>) (go after) + Nothing -> (mempty, s) + mdecoded = + if entity == mempty + then Nothing + else decEntity entity + +-- | Is the character a valid Name starter? +isNameStart :: Char -> Bool +isNameStart c = + c == ':' || + c == '_' || + isAsciiUpper c || + isAsciiLower c || + (c >= '\xC0' && c <= '\xD6') || + (c >= '\xD8' && c <= '\xF6') || + (c >= '\xF8' && c <= '\x2FF') || + (c >= '\x370' && c <= '\x37D') || + (c >= '\x37F' && c <= '\x1FFF') || + (c >= '\x200C' && c <= '\x200D') || + (c >= '\x2070' && c <= '\x218F') || + (c >= '\x2C00' && c <= '\x2FEF') || + (c >= '\x3001' && c <= '\xD7FF') || + (c >= '\xF900' && c <= '\xFDCF') || + (c >= '\xFDF0' && c <= '\xFFFD') || + (c >= '\x10000' && c <= '\xEFFFF') + +-- | Is the character valid in a Name? +isNameChar :: Char -> Bool +isNameChar c = + c == '-' || + c == '.' || + c == '\xB7' || + isDigit c || + isNameStart c || + (c >= '\x0300' && c <= '\x036F') || + (c >= '\x203F' && c <= '\x2040') diff --git a/Text/HTML/TagStream/Text.hs b/Text/HTML/TagStream/Text.hs index 8b1dcbc..3b8f7aa 100644 --- a/Text/HTML/TagStream/Text.hs +++ b/Text/HTML/TagStream/Text.hs @@ -3,36 +3,41 @@ {-# LANGUAGE TypeFamilies #-} module Text.HTML.TagStream.Text where -import Prelude hiding (mapM) -import Control.Applicative -import Control.Monad (unless, when, liftM) -import Control.Monad.Trans.Class (lift) - -import Data.Traversable (mapM) -import Data.Maybe (fromMaybe) -import Data.Monoid (mconcat) -import Data.Char (isSpace) -import Data.ByteString (ByteString) -import Data.Text (Text) +import Control.Applicative +import Control.Monad (unless, when, liftM) +import Control.Monad.Trans.Class (lift) +import Data.Char +import qualified Data.Conduit.List as CL +import Data.Default +import Prelude hiding (mapM) + +import qualified Data.Attoparsec.ByteString.Char8 as S +import Data.Attoparsec.Text +import Data.ByteString (ByteString) +import qualified Data.CaseInsensitive as CI +import Data.Conduit +import Data.Functor.Identity (runIdentity) +import Data.Maybe (fromMaybe) +import Data.Monoid (mconcat) +import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as B -import qualified Data.CaseInsensitive as CI -import qualified Data.Attoparsec.ByteString.Char8 as S -import Data.Attoparsec.Text -import Data.Conduit +import Data.Traversable (mapM) +import qualified Text.XML.Stream.Parse as XML #if MIN_VERSION_conduit(1, 0, 0) -import Data.Conduit.Internal (unConduitM) +import Data.Conduit.Internal (unConduitM) #else -import Data.Conduit.Internal (pipeL) +import Data.Conduit.Internal (pipeL) #endif import qualified Data.Conduit.List as C import qualified Data.Conduit.Attoparsec as C import qualified Data.Conduit.Text as C import qualified Text.HTML.TagStream.ByteString as S -import Text.HTML.TagStream.Types -import Text.HTML.TagStream.Utils (splitAccum) +import Text.HTML.TagStream.Entities +import Text.HTML.TagStream.Types +import Text.HTML.TagStream.Utils (splitAccum) type Token = Token' Text type Attr = Attr' Text @@ -142,6 +147,22 @@ incomplete = Incomplete . T.cons '<' <$> takeText text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) +-- | Decode the HTML entities e.g. @&@ in some text into @&@. +decodeEntitiesText :: Monad m => Conduit Token m Token +decodeEntitiesText = + decodeEntities + Dec { decToS = L.toStrict . B.toLazyText + , decBreak = T.break + , decBuilder = B.fromText + , decDrop = T.drop + , decEntity = decodeEntity + , decUncons = T.uncons } + where decodeEntity entity = + CL.sourceList ["&",entity,";"] + $= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities } + $= CL.map snd + $$ XML.content + token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text @@ -171,7 +192,9 @@ html = tokens <|> pure [] _ -> (t:) <$> html decode :: Text -> Either String [Token] -decode = parseOnly html +decode = fmap decodeEntitiesText' . parseOnly html + where + decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume {-- - Utils {{{ @@ -230,7 +253,7 @@ tokenStream :: Monad m => GInfConduit Text m Token #endif tokenStream = - loop T.empty + loop T.empty =$= decodeEntitiesText where #if MIN_VERSION_conduit(1, 0, 0) loop accum = await >>= maybe (close accum ()) (push accum) diff --git a/tagstream-conduit.cabal b/tagstream-conduit.cabal index 65ce3c9..8f19a02 100644 --- a/tagstream-conduit.cabal +++ b/tagstream-conduit.cabal @@ -32,6 +32,7 @@ Library , Text.HTML.TagStream.Text , Text.HTML.TagStream.Types , Text.HTML.TagStream.Utils + , Text.HTML.TagStream.Entities Build-depends: base >= 4 && < 5 , bytestring , text @@ -42,6 +43,8 @@ Library , blaze-builder , blaze-builder-conduit >= 0.5 && < 1.1 , attoparsec-conduit >= 0.5 + , xml-conduit >= 1.1.0.0 + , data-default >= 0.5.0 test-suite test hs-source-dirs: tests diff --git a/tests/Tests.hs b/tests/Tests.hs index 9c3ab22..0ae6822 100644 --- a/tests/Tests.hs +++ b/tests/Tests.hs @@ -2,23 +2,23 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} module Main where -import Control.Applicative +import Control.Applicative -import Data.Monoid (Monoid(..)) -import Data.ByteString (ByteString) +import Data.Monoid (Monoid(..),(<>)) +import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as S -import Data.Text (Text) +import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Conduit as C import qualified Data.Conduit.List as CL -import Test.Hspec -import Test.Hspec.QuickCheck (prop) -import Test.HUnit hiding (Test) -import Test.QuickCheck +import Test.Hspec +import Test.Hspec.QuickCheck (prop) +import Test.HUnit hiding (Test) +import Test.QuickCheck -import Text.HTML.TagStream +import Text.HTML.TagStream import qualified Text.HTML.TagStream.ByteString as S import qualified Text.HTML.TagStream.Text as T @@ -205,7 +205,28 @@ testcases = , ( " hello" , [TagOpen "foo" [] False, Text " hello", TagClose "foo"] ) + + -- Text entity decoding + , text "" "" + , text "&" "&" + , text "& hello" "& hello" + , text "&" "&" + , text "&" "&" + , text """ "\"" + , text "&unknown;" "&unknown;" + , text "a &unknown b" "a &unknown b" + , text "\"&unknown\"" "\"&unknown\"" + , text "foo &bar; mu" "foo &bar; mu" + , text "&foo; &bar "mu< zot &hello;" "&foo; &bar \"mu< zot &hello;" + , text "<p>" "

" + , text "<" "<" + , text "aaa" "aaa" + , text "foo &" "foo &" + , text "foo &" "foo &" + , text "foo &" "foo &" ] + where text b a = ("

" <> b <> "

" + ,concat [[TagOpen "p" [] False],[Text a | not (T.null a)],[TagClose "p"]]) testChar :: Gen Char testChar = growingElements "<>/=\"' \t\r\nabcde\\" @@ -239,6 +260,7 @@ assertDecodeText s = do return tokens combineText :: Monoid s => [Token' s] -> [Token' s] -combineText [] = [] -combineText (Text t1 : Text t2 : xs) = combineText $ Text (mappend t1 t2) : xs -combineText (x:xs) = x : combineText xs +combineText = go + where go [] = [] + go (Text t1 : Text t2 : xs) = go $ Text (mappend t1 t2) : xs + go (x:xs) = x : go xs