/
DOM.hs
148 lines (129 loc) · 4.48 KB
/
DOM.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where
import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map
-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit = decodeUtf8LenientC .| eventConduit'
eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText = eventConduit'
eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' =
TS.tokenStream .| go []
where
go stack = do
mx <- await
case mx of
Nothing -> closeStack stack
-- Ignore processing instructions (or pseudo-instructions)
Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack
Just (TS.TagOpen local attrs isClosed) -> do
let name = toName local
attrs' = map (toName *** return . XT.ContentText) $ Map.toList attrs
yield $ XT.EventBeginElement name attrs'
if isClosed || isVoid local
then yield (XT.EventEndElement name) >> go stack
else go $ name : stack
Just (TS.TagClose name)
| toName name `elem` stack ->
let loop [] = go []
loop (n:ns) = do
yield $ XT.EventEndElement n
if n == toName name
then go ns
else loop ns
in loop stack
| otherwise -> go stack
Just (TS.Text t) -> do
yield $ XT.EventContent $ XT.ContentText t
go stack
Just (TS.Comment t) -> do
yield $ XT.EventComment t
go stack
Just TS.Special{} -> go stack
Just TS.Incomplete{} -> go stack
toName l = XT.Name l Nothing Nothing
closeStack = mapM_ (yield . XT.EventEndElement)
isVoid name = Set.member (T.toLower name) voidSet
voidSet :: Set.Set T.Text
voidSet = Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]
sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc = sinkDoc' eventConduit
sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText = sinkDoc' eventConduitText
sinkDoc'
:: MonadThrow m
=> ConduitT a XT.Event m ()
-> ConduitT a o m X.Document
sinkDoc' f =
fmap stripDummy $ mapOutput ((,) Nothing) f .| addDummyWrapper .| X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
awaitForever yield
yield (Nothing, XT.EventEndElement "html")
stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
case mapMaybe toElement nodes of
[root] -> X.Document pro root epi
_ -> doc
toElement (X.NodeElement e) = Just e
toElement _ = Nothing
readFile :: FilePath -> IO X.Document
readFile fp = withSourceFile fp $ \src -> runConduit $ src .| sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks
parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks tss =
case runConduit $ CL.sourceList tss .| sinkDoc of
Left e -> error $ "Unexpected exception in parseBSChunks: " ++ show e
Right x -> x
parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks
parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss =
case runConduit $ CL.sourceList tss .| sinkDocText of
Left e -> error $ "Unexpected exception in parseSTChunks: " ++ show e
Right x -> x