Skip to content

Commit

Permalink
Merge pull request #66 from gbwey/addparsetext
Browse files Browse the repository at this point in the history
add strict and lazy text parsing
  • Loading branch information
snoyberg committed Nov 12, 2015
2 parents 53d0a57 + 19e9b1e commit e9fe420
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 7 deletions.
36 changes: 29 additions & 7 deletions html-conduit/Text/HTML/DOM.hs
Expand Up @@ -5,6 +5,10 @@ module Text.HTML.DOM
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where

import Control.Monad.Trans.Resource
Expand All @@ -15,10 +19,10 @@ import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import Data.Conduit
import Data.Conduit.Text (decodeUtf8Lenient)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***), second)
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Text.XML.Stream.Parse (decodeHtmlEntities)
Expand All @@ -33,8 +37,14 @@ import Data.Maybe (mapMaybe)
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => Conduit S.ByteString m XT.Event
eventConduit =
decodeUtf8Lenient =$= TS.tokenStream =$= go []
eventConduit = decodeUtf8Lenient =$= eventConduit'

eventConduitText :: Monad m => Conduit T.Text m XT.Event
eventConduitText = eventConduit'

eventConduit' :: Monad m => Conduit T.Text m XT.Event
eventConduit' =
TS.tokenStream =$= go []
where
go stack = do
mx <- await
Expand Down Expand Up @@ -92,8 +102,14 @@ eventConduit =
]

sinkDoc :: MonadThrow m => Sink S.ByteString m X.Document
sinkDoc =
fmap stripDummy $ mapOutput ((,) Nothing) eventConduit =$ addDummyWrapper =$ X.fromEvents
sinkDoc = sinkDoc' eventConduit

sinkDocText :: MonadThrow m => Sink T.Text m X.Document
sinkDocText = sinkDoc' eventConduitText

sinkDoc' :: (Monad m, MonadThrow m) => Conduit a m XT.Event -> Sink a m X.Document
sinkDoc' f =
fmap stripDummy $ mapOutput ((,) Nothing) f =$ addDummyWrapper =$ X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
Expand All @@ -115,5 +131,11 @@ parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks

parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks bss = runIdentity $ runExceptionT_ $ CL.sourceList bss $$ sinkDoc
parseBSChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDoc

parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks

parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDocText

6 changes: 6 additions & 0 deletions html-conduit/test/main.hs
Expand Up @@ -42,6 +42,12 @@ main = hspec $ do
it "split code-points" $
X.parseLBS_ X.def "<foo>&#xa0;</foo>" @=?
H.parseBSChunks ["<foo>\xc2", "\xa0</foo>"]
it "latin1 codes" $
X.parseText_ X.def "<foo>\232</foo>" @=?
H.parseSTChunks ["<foo>\232</foo>"]
it "latin1 codes strict vs lazy" $
H.parseLT "<foo>\232</foo>" @=?
H.parseSTChunks ["<foo>\232</foo>"]
describe "HTML parsing" $ do
it "XHTML" $
let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
Expand Down

0 comments on commit e9fe420

Please sign in to comment.