Permalink
Browse files

UTF-8 support from Jean-Philippe Bernardy and Alan Zimmerman.

Alex will now, by default, parse a UTF-8-encoded byte sequence.  (to
disable this behaviour, there is a new flag --latin1, which we will be
using in GHC).

Because parsing UTF-8 generates a large state machine, I added DFA
minimisation (source file DFAMin.hs).  This makes Alex itself somewhat
slower, but the generated lexers are smaller - dramatically so for
UTF-8 lexers.
  • Loading branch information...
simonmar committed Jul 11, 2011
1 parent 5d6e749 commit 892688ff71bcc4b74313f7348ee5dace73dd8506
Showing with 1,889 additions and 202 deletions.
  1. +37 −0 .gitignore
  2. +9 −0 NOTE.txt
  3. +11 −9 alex.cabal
  4. +7 −2 src/AbsSyn.hs
  5. +128 −16 src/CharSet.hs
  6. +12 −14 src/DFA.hs
  7. +150 −0 src/DFAMin.hs
  8. +30 −0 src/Data/LICENSE.txt
  9. +9 −0 src/Data/Ranged.hs
  10. +229 −0 src/Data/Ranged/Boundaries.hs
  11. +486 −0 src/Data/Ranged/RangedSet.hs
  12. +360 −0 src/Data/Ranged/Ranges.hs
  13. +3 −3 src/Info.hs
  14. +27 −4 src/Main.hs
  15. +1 −0 src/Map.hs
  16. +64 −20 src/NFA.hs
  17. +12 −15 src/Output.hs
  18. +21 −8 src/ParseMonad.hs
  19. +79 −66 src/Scan.hs
  20. +4 −4 src/Scan.x
  21. +1 −0 src/Set.hs
  22. +36 −0 src/UTF8.hs
  23. +19 −11 templates/GenericTemplate.hs
  24. +64 −25 templates/wrappers.hs
  25. +8 −5 tests/Makefile
  26. +82 −0 tests/unicode.x
View
@@ -0,0 +1,37 @@
*.hi
*.prof
*~
*.o
*.info
/AlexTemplate-debug.bak
/AlexTemplate-ghc-debug.bak
/AlexTemplate-ghc.bak
/AlexTemplate.bak
/AlexWrapper-basic-bytestring.bak
/AlexWrapper-basic.bak
/AlexWrapper-gscan.bak
/AlexWrapper-monad-bytestring.bak
/AlexWrapper-monad.bak
/AlexWrapper-monadUserState-bytestring.bak
/AlexWrapper-monadUserState.bak
/AlexWrapper-posn-bytestring.bak
/AlexWrapper-posn.bak
/AlexWrapper-strict-bytestring.bak
/AlexTemplate
/AlexTemplate-debug
/AlexTemplate-ghc
/AlexTemplate-ghc-debug
/AlexWrapper-basic
/AlexWrapper-basic-bytestring
/AlexWrapper-gscan
/AlexWrapper-monad
/AlexWrapper-monad-bytestring
/AlexWrapper-monadUserState
/AlexWrapper-monadUserState-bytestring
/AlexWrapper-posn
/AlexWrapper-posn-bytestring
/AlexWrapper-strict-bytestring
/dist
/src/Data/Ranged/Boundaries.hs~
/src/Data/Ranged/RangedSet.hs~
/src/Data/Ranged/Ranges.hs~
View
@@ -0,0 +1,9 @@
Note:
The contents of package Ranged-sets-0.3.0 has been copied into this
package, in order to allow it to be part of the Haskell Platform,
without introducing additional dependencies.
The original license agreement has been included in the src/Data
subdirectory, as required by the package source.
View
@@ -10,15 +10,9 @@ stability: stable
homepage: http://www.haskell.org/alex/ homepage: http://www.haskell.org/alex/
synopsis: Alex is a tool for generating lexical analysers in Haskell synopsis: Alex is a tool for generating lexical analysers in Haskell
category: Development category: Development
cabal-version: >= 1.2 cabal-version: >= 1.6
-- Later, this isn't compatible with Cabal 1.2:
-- source-repository head
-- type: darcs
-- location: http://darcs.haskell.org/alex/
build-type: Custom build-type: Custom
extra-source-files: extra-source-files:
ANNOUNCE ANNOUNCE
README README
@@ -58,6 +52,10 @@ extra-source-files:
tests/tokens_gscan.x tests/tokens_gscan.x
tests/tokens_posn.x tests/tokens_posn.x
source-repository head
type: git
location: http://darcs.haskell.org/alex.git
flag small_base flag small_base
description: Choose the new smaller, split-up base package. description: Choose the new smaller, split-up base package.
@@ -71,13 +69,16 @@ executable alex
build-depends: base >= 1.0 build-depends: base >= 1.0
build-depends: base < 5 build-depends: base < 5
-- build-depends: Ranged-sets
build-depends: QuickCheck >=2
extensions: CPP extensions: CPP
ghc-options: -Wall ghc-options: -Wall -rtsopts
other-modules: other-modules:
AbsSyn AbsSyn
CharSet CharSet
DFA DFA
DFAMin
DFS DFS
Info Info
Main Main
@@ -90,3 +91,4 @@ executable alex
Set Set
Sort Sort
Util Util
UTF8
View
@@ -23,6 +23,7 @@ module AbsSyn (
import CharSet ( CharSet ) import CharSet ( CharSet )
import Map ( Map ) import Map ( Map )
import qualified Map hiding ( Map ) import qualified Map hiding ( Map )
import Data.IntMap (IntMap)
import Sort ( nub' ) import Sort ( nub' )
import Util ( str, nl ) import Util ( str, nl )
@@ -66,6 +67,7 @@ data RightContext r
= NoRightContext = NoRightContext
| RightContextRExp r | RightContextRExp r
| RightContextCode Code | RightContextCode Code
deriving (Eq,Ord)
instance Show RECtx where instance Show RECtx where
showsPrec _ (RECtx scs _ r rctx code) = showsPrec _ (RECtx scs _ r rctx code) =
@@ -95,16 +97,19 @@ data DFA s a = DFA
dfa_states :: Map s (State s a) dfa_states :: Map s (State s a)
} }
data State s a = State [Accept a] (Map Char s) data State s a = State { state_acc :: [Accept a],
state_out :: IntMap s -- 0..255 only
}
type SNum = Int type SNum = Int
data Accept a data Accept a
= Acc { accPrio :: Int, = Acc { accPrio :: Int,
accAction :: Maybe a, accAction :: Maybe a,
accLeftCtx :: Maybe CharSet, accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point.
accRightCtx :: RightContext SNum accRightCtx :: RightContext SNum
} }
deriving (Eq,Ord)
-- debug stuff -- debug stuff
instance Show (Accept a) where instance Show (Accept a) where
View
@@ -11,6 +11,16 @@
-- ----------------------------------------------------------------------------} -- ----------------------------------------------------------------------------}
module CharSet ( module CharSet (
setSingleton,
Encoding(..),
Byte,
ByteSet,
byteSetSingleton,
byteRanges,
byteSetRange,
CharSet, -- abstract CharSet, -- abstract
emptyCharSet, emptyCharSet,
charSetSingleton, charSetSingleton,
@@ -19,39 +29,141 @@ module CharSet (
charSetComplement, charSetComplement,
charSetRange, charSetRange,
charSetUnion, charSetUnion,
charSetToArray, charSetQuote,
charSetElems setUnions,
byteSetToArray,
byteSetElems,
byteSetElem
) where ) where
import Data.Array ( Array, array ) import Data.Array
import Data.Ranged
import Data.Word
import Data.Maybe (catMaybes)
import Data.Char (chr,ord)
import UTF8
type Byte = Word8
-- Implementation as functions -- Implementation as functions
type CharSet = Char -> Bool type CharSet = RSet Char
type ByteSet = RSet Byte
-- type Utf8Set = RSet [Byte]
type Utf8Range = Span [Byte]
data Encoding = Latin1 | UTF8
emptyCharSet :: CharSet emptyCharSet :: CharSet
emptyCharSet = const False emptyCharSet = rSetEmpty
byteSetElem :: ByteSet -> Byte -> Bool
byteSetElem = rSetHas
charSetSingleton :: Char -> CharSet charSetSingleton :: Char -> CharSet
charSetSingleton c = \x -> x == c charSetSingleton = rSingleton
setSingleton :: DiscreteOrdered a => a -> RSet a
setSingleton = rSingleton
charSet :: [Char] -> CharSet charSet :: [Char] -> CharSet
charSet s x = x `elem` s charSet = setUnions . fmap charSetSingleton
charSetMinus :: CharSet -> CharSet -> CharSet charSetMinus :: CharSet -> CharSet -> CharSet
charSetMinus s1 s2 x = s1 x && not (s2 x) charSetMinus = rSetDifference
charSetUnion :: CharSet -> CharSet -> CharSet charSetUnion :: CharSet -> CharSet -> CharSet
charSetUnion s1 s2 x = s1 x || s2 x charSetUnion = rSetUnion
setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
setUnions = foldr rSetUnion rSetEmpty
charSetComplement :: CharSet -> CharSet charSetComplement :: CharSet -> CharSet
charSetComplement s1 = not . s1 charSetComplement = rSetNegation
charSetRange :: Char -> Char -> CharSet charSetRange :: Char -> Char -> CharSet
charSetRange c1 c2 x = x >= c1 && x <= c2 charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
byteSetToArray :: ByteSet -> Array Byte Bool
byteSetToArray set = array (fst (head ass), fst (last ass)) ass
where ass = [(c,rSetHas set c) | c <- [0..0xff]]
byteSetElems :: ByteSet -> [Byte]
byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]
charToRanges :: Encoding -> CharSet -> [Utf8Range]
charToRanges Latin1 =
map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan False)
. rSetRanges
charToRanges UTF8 =
concat -- Span [Byte]
. fmap toUtfRange -- [Span [Byte]]
. fmap (fmap UTF8.encode) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan True)
. rSetRanges
-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
toUtfRange :: Span [Byte] -> [Span [Byte]]
toUtfRange (Span x y) = fix x y
fix :: [Byte] -> [Byte] -> [Span [Byte]]
fix x y
| length x == length y = [Span x y]
| length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y
| length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
| length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
| otherwise = error "fix: incorrect input given"
byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
byteRangeToBytePair (Span x y) = (x,y)
data Span a = Span a a -- lower bound inclusive, higher bound exclusive
-- (SDM: upper bound inclusive, surely??)
instance Functor Span where
fmap f (Span x y) = Span (f x) (f y)
charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
where l b = case b of
BoundaryBelowAll -> '\0'
BoundaryBelow a -> a
BoundaryAbove a -> succ a
BoundaryAboveAll -> error "panic: charRangeToCharSpan"
h b = case b of
BoundaryBelowAll -> error "panic: charRangeToCharSpan"
BoundaryBelow a -> pred a
BoundaryAbove a -> a
BoundaryAboveAll | uni -> chr 0x10ffff
| otherwise -> chr 0xff
byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
byteRanges enc = fmap byteRangeToBytePair . charToRanges enc
byteSetRange :: Byte -> Byte -> ByteSet
byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
byteSetSingleton :: Byte -> ByteSet
byteSetSingleton = rSingleton
instance DiscreteOrdered Word8 where
adjacent x y = x + 1 == y
adjacentBelow 0 = Nothing
adjacentBelow x = Just (x-1)
charSetToArray :: CharSet -> Array Char Bool -- TODO: More efficient generated code!
charSetToArray set = array (fst (head ass), fst (last ass)) ass charSetQuote :: CharSet -> String
where ass = [(c,set c) | c <- ['\0'..'\xff']] charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")"
where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
quoteL (BoundaryAbove a) = "c > " ++ show a
quoteL (BoundaryBelow a) = "c >= " ++ show a
quoteL (BoundaryAboveAll) = "False"
quoteL (BoundaryBelowAll) = "True"
quoteH (BoundaryAbove a) = "c <= " ++ show a
quoteH (BoundaryBelow a) = "c < " ++ show a
quoteH (BoundaryAboveAll) = "True"
quoteH (BoundaryBelowAll) = "False"
charSetElems :: CharSet -> [Char]
charSetElems set = [c | c <- ['\0'..'\xff'], set c]
View
@@ -17,6 +17,7 @@ module DFA(scanner2dfa) where
import AbsSyn import AbsSyn
import qualified Map import qualified Map
import qualified Data.IntMap as IntMap
import NFA import NFA
import Sort ( msort, nub' ) import Sort ( msort, nub' )
import CharSet import CharSet
@@ -88,8 +89,8 @@ type StartCode = Int
-- state of the partial DFA, until all possible state sets have been considered -- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'. -- The final DFA is then constructed with a `mk_dfa'.
scanner2dfa:: Scanner -> [StartCode] -> DFA SNum Code scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa scanner scs = nfa2dfa scs (scanner2nfa scanner scs) scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa)) nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
@@ -110,30 +111,27 @@ nfa2pdfa nfa pdfa (ss:umkd)
| ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd | ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd
| otherwise = nfa2pdfa nfa pdfa' umkd' | otherwise = nfa2pdfa nfa pdfa' umkd'
where where
pdfa' = add_pdfa ss (State accs (Map.fromList ss_outs)) pdfa pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
umkd' = rctx_sss ++ map snd ss_outs ++ umkd umkd' = rctx_sss ++ map snd ss_outs ++ umkd
-- for each character, the set of states that character would take -- for each character, the set of states that character would take
-- us to from the current set of states in the NFA. -- us to from the current set of states in the NFA.
ss_outs :: [(Char, StateSet)] ss_outs :: [(Int, StateSet)]
ss_outs = [ (ch, mk_ss nfa ss') ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
| ch <- dfa_alphabet, | ch <- byteSetElems $ setUnions [p | (p,_) <- outs],
let ss' = [ s' | (p,s') <- outs, p ch ], let ss' = [ s' | (p,s') <- outs, byteSetElem p ch ],
not (null ss') not (null ss')
] ]
rctx_sss = [ mk_ss nfa [s] rctx_sss = [ mk_ss nfa [s]
| Acc _ _ _ (RightContextRExp s) <- accs ] | Acc _ _ _ (RightContextRExp s) <- accs ]
outs :: [(CharSet,SNum)] outs :: [(ByteSet,SNum)]
outs = [ out | s <- ss, out <- nst_outs (nfa!s) ] outs = [ out | s <- ss, out <- nst_outs (nfa!s) ]
accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)] accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
dfa_alphabet:: [Char]
dfa_alphabet = ['\0'..'\255']
-- `sort_accs' sorts a list of accept values into decending order of priority, -- `sort_accs' sorts a list of accept values into decending order of priority,
-- eliminating any elements that follow an unconditional accept value. -- eliminating any elements that follow an unconditional accept value.
@@ -198,7 +196,7 @@ mk_int_dfa nfa (DFA start_states mp)
cnv :: State StateSet a -> State SNum a cnv :: State StateSet a -> State SNum a
cnv (State accs as) = State accs' as' cnv (State accs as) = State accs' as'
where where
as' = Map.mapWithKey (\_ch s -> lookup' s) as as' = IntMap.mapWithKey (\_ch s -> lookup' s) as
accs' = map cnv_acc accs accs' = map cnv_acc accs
cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx' cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
Oops, something went wrong.

0 comments on commit 892688f

Please sign in to comment.