Skip to content

Commit

Permalink
Support for iso-8859-1 #63
Browse files Browse the repository at this point in the history
  • Loading branch information
snoyberg committed Oct 1, 2015
1 parent aeaef7a commit 53d0a57
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 17 deletions.
2 changes: 1 addition & 1 deletion stack.yaml
Expand Up @@ -2,4 +2,4 @@ packages:
- xml-conduit/
- xml-hamlet/
- html-conduit/
resolver: lts-2.17
resolver: lts-3.7
4 changes: 4 additions & 0 deletions xml-conduit/ChangeLog.md
@@ -1,3 +1,7 @@
## 1.3.2

* Support for iso-8859-1 [#63](https://github.com/snoyberg/xml/issues/63)

## 1.3.1

* Add functions to ignore subtrees & result-streaming (yield) parsers [#58](https://github.com/snoyberg/xml/pull/58)
Expand Down
59 changes: 44 additions & 15 deletions xml-conduit/Text/XML/Stream/Parse.hs
Expand Up @@ -145,6 +145,7 @@ import Control.Monad.Trans.Resource (MonadResource, MonadThrow (..),
import Data.Attoparsec.Text (Parser, anyChar, char, manyTill,
skipWhile, string, takeWhile,
takeWhile1, try)
import qualified Data.Attoparsec.Text as AT
import Data.Conduit.Attoparsec (PositionRange, conduitParser)
import Data.XML.Types (Content (..), Event (..),
ExternalID (..),
Expand Down Expand Up @@ -172,8 +173,8 @@ import Data.Maybe (fromMaybe, isNothing)
import Data.Text (Text, pack)
import qualified Data.Text as T
import qualified Data.Text as TS
import Data.Text.Encoding (decodeUtf32BEWith)
import Data.Text.Encoding.Error (ignore)
import Data.Text.Encoding (decodeUtf32BEWith, decodeUtf8With)
import Data.Text.Encoding.Error (ignore, lenientDecode)
import Data.Text.Read (Reader, decimal, hexadecimal)
import Data.Typeable (Typeable)
import Data.Word (Word32)
Expand Down Expand Up @@ -259,30 +260,58 @@ detectUtf =
conduit front = await >>= maybe (return ()) (push front)

push front bss =
either conduit (\(bss', continue) -> leftover bss' >> continue)
either conduit
(uncurry checkXMLDecl)
(getEncoding front bss)

getEncoding front bs'
| S.length bs < 4 =
Left (bs `S.append`)
| otherwise =
Right (bsOut, CT.decode codec)
Right (bsOut, mcodec)
where
bs = front bs'
bsOut = S.append (S.drop toDrop x) y
(x, y) = S.splitAt 4 bs
(toDrop, codec) =
(toDrop, mcodec) =
case S.unpack x of
[0x00, 0x00, 0xFE, 0xFF] -> (4, CT.utf32_be)
[0xFF, 0xFE, 0x00, 0x00] -> (4, CT.utf32_le)
0xFE : 0xFF: _ -> (2, CT.utf16_be)
0xFF : 0xFE: _ -> (2, CT.utf16_le)
0xEF : 0xBB: 0xBF : _ -> (3, CT.utf8)
[0x00, 0x00, 0x00, 0x3C] -> (0, CT.utf32_be)
[0x3C, 0x00, 0x00, 0x00] -> (0, CT.utf32_le)
[0x00, 0x3C, 0x00, 0x3F] -> (0, CT.utf16_be)
[0x3C, 0x00, 0x3F, 0x00] -> (0, CT.utf16_le)
_ -> (0, CT.utf8) -- Assuming UTF-8
[0x00, 0x00, 0xFE, 0xFF] -> (4, Just $ CT.utf32_be)
[0xFF, 0xFE, 0x00, 0x00] -> (4, Just $ CT.utf32_le)
0xFE : 0xFF: _ -> (2, Just $ CT.utf16_be)
0xFF : 0xFE: _ -> (2, Just $ CT.utf16_le)
0xEF : 0xBB: 0xBF : _ -> (3, Just $ CT.utf8)
[0x00, 0x00, 0x00, 0x3C] -> (0, Just $ CT.utf32_be)
[0x3C, 0x00, 0x00, 0x00] -> (0, Just $ CT.utf32_le)
[0x00, 0x3C, 0x00, 0x3F] -> (0, Just $ CT.utf16_be)
[0x3C, 0x00, 0x3F, 0x00] -> (0, Just $ CT.utf16_le)
_ -> (0, Nothing) -- Assuming UTF-8

checkXMLDecl :: MonadThrow m
=> S.ByteString
-> Maybe CT.Codec
-> Conduit S.ByteString m TS.Text
checkXMLDecl bs (Just codec) = leftover bs >> CT.decode codec
checkXMLDecl bs0 Nothing =
loop [] (AT.parse (parseToken decodeXmlEntities)) bs0
where
loop chunks0 parser nextChunk =
case parser $ decodeUtf8With lenientDecode nextChunk of
AT.Fail _ _ _ -> fallback
AT.Partial f -> await >>= maybe fallback (loop chunks f)
AT.Done _ (TokenBeginDocument attrs) -> findEncoding attrs
AT.Done _ _ -> fallback
where
chunks = nextChunk : chunks0
fallback = complete CT.utf8
complete codec = mapM_ leftover chunks >> CT.decode codec

findEncoding [] = fallback
findEncoding ((TName _ "encoding", [ContentText enc]):_) =
case enc of
"iso-8859-1" -> complete CT.iso8859_1
"utf-8" -> complete CT.utf8
_ -> complete CT.utf8
findEncoding (_:xs) = findEncoding xs

type EventPos = (Maybe PositionRange, Event)

Expand Down
13 changes: 13 additions & 0 deletions xml-conduit/test/main.hs
Expand Up @@ -93,6 +93,7 @@ main = hspec $ do
it "works" caseOrderAttrs
it "parsing CDATA" caseParseCdata
it "retains namespaces when asked" caseRetainNamespaces
it "handles iso-8859-1" caseIso8859_1

documentParseRender :: IO ()
documentParseRender =
Expand Down Expand Up @@ -605,3 +606,15 @@ caseRetainNamespaces = do
[]
])
[]

caseIso8859_1 :: Assertion
caseIso8859_1 = do
let lbs = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?><foo>\232</foo>"
doc = Res.parseLBS_ def lbs
doc `shouldBe` Res.Document
(Res.Prologue [] Nothing [])
(Res.Element
"foo"
Map.empty
[Res.NodeContent "\232"])
[]
2 changes: 1 addition & 1 deletion xml-conduit/xml-conduit.cabal
@@ -1,5 +1,5 @@
name: xml-conduit
version: 1.3.1
version: 1.3.2
license: MIT
license-file: LICENSE
author: Michael Snoyman <michael@snoyman.com>, Aristid Breitkreuz <aristidb@googlemail.com>
Expand Down

0 comments on commit 53d0a57

Please sign in to comment.