forked from yihuang/tagstream-conduit
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request yihuang#11 from chrisdone/entity_decoding
Entity decoding in Text tokens
- Loading branch information
Showing
5 changed files
with
255 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
{-# LANGUAGE ViewPatterns #-} | ||
{-# LANGUAGE RecordWildCards #-} | ||
{-# LANGUAGE OverloadedStrings #-} | ||
|
||
-- | HTML entity decoding. | ||
|
||
module Text.HTML.TagStream.Entities | ||
(Dec(..) | ||
,isNameChar | ||
,isNameStart | ||
,decodeEntities) | ||
where | ||
|
||
import Data.Char | ||
import Data.Monoid | ||
import Data.String | ||
import Data.Conduit | ||
import Text.HTML.TagStream.Types | ||
|
||
import qualified Data.Conduit.List as CL | ||
import Data.Maybe (fromMaybe, isJust) | ||
import Control.Arrow (first) | ||
|
||
-- | A conduit to decode entities from a stream of tokens into a new stream of tokens. | ||
decodeEntities :: (Monad m | ||
,Monoid builder | ||
,Monoid string | ||
,IsString string | ||
,Eq string) | ||
=> Dec builder string | ||
-> Conduit (Token' string) m (Token' string) | ||
decodeEntities dec = | ||
start | ||
where | ||
start = await >>= maybe (return ()) (\token -> start' token >> start) | ||
start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go | ||
start' token = yield token | ||
|
||
go t | ||
| t == "" = Nothing | ||
| otherwise = Just (Text t) | ||
|
||
decodeEntities' :: (Monad m | ||
,Monoid string | ||
,IsString string | ||
,Monoid builder | ||
,Eq string) | ||
=> Dec builder string | ||
-> Conduit string m string | ||
decodeEntities' dec = | ||
loop id | ||
where | ||
loop accum = do | ||
mchunk <- await | ||
let chunk = accum $ fromMaybe mempty mchunk | ||
(newStr, remainder) = makeEntityDecoder dec chunk | ||
yield newStr | ||
if isJust mchunk | ||
then loop (mappend remainder) | ||
else yield remainder | ||
|
||
-- | Yield contiguous text tokens as strings. | ||
yieldWhileText :: Monad m => Conduit (Token' string) m string | ||
yieldWhileText = | ||
loop | ||
where | ||
loop = await >>= maybe (return ()) go | ||
go (Text t) = yield t >> loop | ||
go token = leftover token | ||
|
||
-- | A decoder. | ||
data Dec builder string = Dec | ||
{ decToS :: builder -> string | ||
, decBreak :: (Char -> Bool) -> string -> (string,string) | ||
, decBuilder :: string -> builder | ||
, decDrop :: Int -> string -> string | ||
, decEntity :: string -> Maybe string | ||
, decUncons :: string -> Maybe (Char,string) | ||
} | ||
|
||
-- | Decode the entities in a string type with a decoder. | ||
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string) | ||
=> Dec builder string -> string -> (string, string) | ||
makeEntityDecoder Dec{..} = first decToS . go | ||
where | ||
go s = | ||
case decBreak (=='&') s of | ||
(_,"") -> (decBuilder s, "") | ||
(before,restPlusAmp@(decDrop 1 -> rest)) -> | ||
case decBreak (not . (\c -> isNameChar c || c == '#')) rest of | ||
(_,"") -> (decBuilder before, restPlusAmp) | ||
(entity,after) -> (before1 <> before2, after') | ||
where | ||
before1 = decBuilder before | ||
(before2, after') = | ||
case mdecoded of | ||
Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after) | ||
Just (decBuilder -> decoded) -> | ||
case decUncons after of | ||
Just (';',validAfter) -> first (decoded <>) (go validAfter) | ||
Just (_invalid,_rest) -> first (decoded <>) (go after) | ||
Nothing -> (mempty, s) | ||
mdecoded = | ||
if entity == mempty | ||
then Nothing | ||
else decEntity entity | ||
|
||
-- | Is the character a valid Name starter? | ||
isNameStart :: Char -> Bool | ||
isNameStart c = | ||
c == ':' || | ||
c == '_' || | ||
isAsciiUpper c || | ||
isAsciiLower c || | ||
(c >= '\xC0' && c <= '\xD6') || | ||
(c >= '\xD8' && c <= '\xF6') || | ||
(c >= '\xF8' && c <= '\x2FF') || | ||
(c >= '\x370' && c <= '\x37D') || | ||
(c >= '\x37F' && c <= '\x1FFF') || | ||
(c >= '\x200C' && c <= '\x200D') || | ||
(c >= '\x2070' && c <= '\x218F') || | ||
(c >= '\x2C00' && c <= '\x2FEF') || | ||
(c >= '\x3001' && c <= '\xD7FF') || | ||
(c >= '\xF900' && c <= '\xFDCF') || | ||
(c >= '\xFDF0' && c <= '\xFFFD') || | ||
(c >= '\x10000' && c <= '\xEFFFF') | ||
|
||
-- | Is the character valid in a Name? | ||
isNameChar :: Char -> Bool | ||
isNameChar c = | ||
c == '-' || | ||
c == '.' || | ||
c == '\xB7' || | ||
isDigit c || | ||
isNameStart c || | ||
(c >= '\x0300' && c <= '\x036F') || | ||
(c >= '\x203F' && c <= '\x2040') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.