Skip to content

Commit

Permalink
Merge pull request yihuang#11 from chrisdone/entity_decoding
Browse files Browse the repository at this point in the history
Entity decoding in Text tokens
  • Loading branch information
yihuang committed Feb 1, 2014
2 parents 0e23e8e + f01506f commit d82a501
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 46 deletions.
50 changes: 37 additions & 13 deletions Text/HTML/TagStream/ByteString.hs
Expand Up @@ -3,18 +3,23 @@
{-# LANGUAGE TypeFamilies #-}
module Text.HTML.TagStream.ByteString where

import Control.Applicative
import Control.Monad (unless)

import Data.Monoid (mconcat)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as S
import Data.Attoparsec.Char8
import Data.Conduit

import qualified Blaze.ByteString.Builder as B
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
import Control.Applicative
import Control.Monad (unless)
import Data.Attoparsec.Char8
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as S
import Data.Conduit
import qualified Data.Conduit.List as CL
import Data.Default
import Data.Functor.Identity (runIdentity)
import Data.Monoid
import Data.Text.Encoding
import qualified Text.XML.Stream.Parse as XML

import Text.HTML.TagStream.Entities
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)

type Token = Token' ByteString
type Attr = Attr' ByteString
Expand Down Expand Up @@ -124,6 +129,22 @@ incomplete = Incomplete . S.cons '<' <$> takeByteString
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))

-- | Decode the HTML entities e.g. @&amp;@ in some text into @&@.
decodeEntitiesBS :: Monad m => Conduit Token m Token
decodeEntitiesBS =
decodeEntities
Dec { decToS = B.toByteString
, decBreak = S.break
, decBuilder = B.fromByteString
, decDrop = S.drop
, decEntity = decodeEntity
, decUncons = S.uncons }
where decodeEntity entity =
fmap encodeUtf8
$ CL.sourceList ["&",entity,";"]
$= XML.parseBytes def { XML.psDecodeEntities = XML.decodeHtmlEntities }
$$ XML.content

token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
Expand Down Expand Up @@ -153,7 +174,10 @@ html = tokens <|> pure []
_ -> (t:) <$> html

decode :: ByteString -> Either String [Token]
decode = parseOnly html
decode = fmap decodeEntitiesBS' . parseOnly html
where
decodeEntitiesBS' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesBS =$ CL.consume


{--
- Utils {{{
Expand Down Expand Up @@ -212,7 +236,7 @@ tokenStream :: Monad m
=> GInfConduit ByteString m Token
#endif
tokenStream =
loop S.empty
loop S.empty =$= decodeEntitiesBS
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
Expand Down
137 changes: 137 additions & 0 deletions Text/HTML/TagStream/Entities.hs
@@ -0,0 +1,137 @@
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- | HTML entity decoding.

module Text.HTML.TagStream.Entities
(Dec(..)
,isNameChar
,isNameStart
,decodeEntities)
where

import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types

import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first)

-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: (Monad m
,Monoid builder
,Monoid string
,IsString string
,Eq string)
=> Dec builder string
-> Conduit (Token' string) m (Token' string)
decodeEntities dec =
start
where
start = await >>= maybe (return ()) (\token -> start' token >> start)
start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go
start' token = yield token

go t
| t == "" = Nothing
| otherwise = Just (Text t)

decodeEntities' :: (Monad m
,Monoid string
,IsString string
,Monoid builder
,Eq string)
=> Dec builder string
-> Conduit string m string
decodeEntities' dec =
loop id
where
loop accum = do
mchunk <- await
let chunk = accum $ fromMaybe mempty mchunk
(newStr, remainder) = makeEntityDecoder dec chunk
yield newStr
if isJust mchunk
then loop (mappend remainder)
else yield remainder

-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText =
loop
where
loop = await >>= maybe (return ()) go
go (Text t) = yield t >> loop
go token = leftover token

-- | A decoder.
data Dec builder string = Dec
{ decToS :: builder -> string
, decBreak :: (Char -> Bool) -> string -> (string,string)
, decBuilder :: string -> builder
, decDrop :: Int -> string -> string
, decEntity :: string -> Maybe string
, decUncons :: string -> Maybe (Char,string)
}

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
=> Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = first decToS . go
where
go s =
case decBreak (=='&') s of
(_,"") -> (decBuilder s, "")
(before,restPlusAmp@(decDrop 1 -> rest)) ->
case decBreak (not . (\c -> isNameChar c || c == '#')) rest of
(_,"") -> (decBuilder before, restPlusAmp)
(entity,after) -> (before1 <> before2, after')
where
before1 = decBuilder before
(before2, after') =
case mdecoded of
Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after)
Just (decBuilder -> decoded) ->
case decUncons after of
Just (';',validAfter) -> first (decoded <>) (go validAfter)
Just (_invalid,_rest) -> first (decoded <>) (go after)
Nothing -> (mempty, s)
mdecoded =
if entity == mempty
then Nothing
else decEntity entity

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart c =
c == ':' ||
c == '_' ||
isAsciiUpper c ||
isAsciiLower c ||
(c >= '\xC0' && c <= '\xD6') ||
(c >= '\xD8' && c <= '\xF6') ||
(c >= '\xF8' && c <= '\x2FF') ||
(c >= '\x370' && c <= '\x37D') ||
(c >= '\x37F' && c <= '\x1FFF') ||
(c >= '\x200C' && c <= '\x200D') ||
(c >= '\x2070' && c <= '\x218F') ||
(c >= '\x2C00' && c <= '\x2FEF') ||
(c >= '\x3001' && c <= '\xD7FF') ||
(c >= '\xF900' && c <= '\xFDCF') ||
(c >= '\xFDF0' && c <= '\xFFFD') ||
(c >= '\x10000' && c <= '\xEFFFF')

-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar c =
c == '-' ||
c == '.' ||
c == '\xB7' ||
isDigit c ||
isNameStart c ||
(c >= '\x0300' && c <= '\x036F') ||
(c >= '\x203F' && c <= '\x2040')
65 changes: 44 additions & 21 deletions Text/HTML/TagStream/Text.hs
Expand Up @@ -3,36 +3,41 @@
{-# LANGUAGE TypeFamilies #-}
module Text.HTML.TagStream.Text where

import Prelude hiding (mapM)
import Control.Applicative
import Control.Monad (unless, when, liftM)
import Control.Monad.Trans.Class (lift)

import Data.Traversable (mapM)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Char (isSpace)
import Data.ByteString (ByteString)
import Data.Text (Text)
import Control.Applicative
import Control.Monad (unless, when, liftM)
import Control.Monad.Trans.Class (lift)
import Data.Char
import qualified Data.Conduit.List as CL
import Data.Default
import Prelude hiding (mapM)

import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
import Data.ByteString (ByteString)
import qualified Data.CaseInsensitive as CI
import Data.Conduit
import Data.Functor.Identity (runIdentity)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import qualified Data.CaseInsensitive as CI
import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
import Data.Conduit
import Data.Traversable (mapM)
import qualified Text.XML.Stream.Parse as XML
#if MIN_VERSION_conduit(1, 0, 0)
import Data.Conduit.Internal (unConduitM)
import Data.Conduit.Internal (unConduitM)
#else
import Data.Conduit.Internal (pipeL)
import Data.Conduit.Internal (pipeL)
#endif
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C

import qualified Text.HTML.TagStream.ByteString as S
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
import Text.HTML.TagStream.Entities
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)

type Token = Token' Text
type Attr = Attr' Text
Expand Down Expand Up @@ -142,6 +147,22 @@ incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))

-- | Decode the HTML entities e.g. @&amp;@ in some text into @&@.
decodeEntitiesText :: Monad m => Conduit Token m Token
decodeEntitiesText =
decodeEntities
Dec { decToS = L.toStrict . B.toLazyText
, decBreak = T.break
, decBuilder = B.fromText
, decDrop = T.drop
, decEntity = decodeEntity
, decUncons = T.uncons }
where decodeEntity entity =
CL.sourceList ["&",entity,";"]
$= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities }
$= CL.map snd
$$ XML.content

token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
Expand Down Expand Up @@ -171,7 +192,9 @@ html = tokens <|> pure []
_ -> (t:) <$> html

decode :: Text -> Either String [Token]
decode = parseOnly html
decode = fmap decodeEntitiesText' . parseOnly html
where
decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume

{--
- Utils {{{
Expand Down Expand Up @@ -230,7 +253,7 @@ tokenStream :: Monad m
=> GInfConduit Text m Token
#endif
tokenStream =
loop T.empty
loop T.empty =$= decodeEntitiesText
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
Expand Down
3 changes: 3 additions & 0 deletions tagstream-conduit.cabal
Expand Up @@ -32,6 +32,7 @@ Library
, Text.HTML.TagStream.Text
, Text.HTML.TagStream.Types
, Text.HTML.TagStream.Utils
, Text.HTML.TagStream.Entities
Build-depends: base >= 4 && < 5
, bytestring
, text
Expand All @@ -42,6 +43,8 @@ Library
, blaze-builder
, blaze-builder-conduit >= 0.5 && < 1.1
, attoparsec-conduit >= 0.5
, xml-conduit >= 1.1.0.0
, data-default >= 0.5.0

test-suite test
hs-source-dirs: tests
Expand Down

0 comments on commit d82a501

Please sign in to comment.