Permalink
Browse files

UTF-8 support from Jean-Philippe Bernardy and Alan Zimmerman.

Alex will now, by default, parse a UTF-8-encoded byte sequence.  (to
disable this behaviour, there is a new flag --latin1, which we will be
using in GHC).

Because parsing UTF-8 generates a large state machine, I added DFA
minimisation (source file DFAMin.hs).  This makes Alex itself somewhat
slower, but the generated lexers are smaller - dramatically so for
UTF-8 lexers.
  • Loading branch information...
simonmar committed Jul 11, 2011
1 parent 5d6e749 commit 892688ff71bcc4b74313f7348ee5dace73dd8506
Showing with 1,889 additions and 202 deletions.
  1. +37 −0 .gitignore
  2. +9 −0 NOTE.txt
  3. +11 −9 alex.cabal
  4. +7 −2 src/AbsSyn.hs
  5. +128 −16 src/CharSet.hs
  6. +12 −14 src/DFA.hs
  7. +150 −0 src/DFAMin.hs
  8. +30 −0 src/Data/LICENSE.txt
  9. +9 −0 src/Data/Ranged.hs
  10. +229 −0 src/Data/Ranged/Boundaries.hs
  11. +486 −0 src/Data/Ranged/RangedSet.hs
  12. +360 −0 src/Data/Ranged/Ranges.hs
  13. +3 −3 src/Info.hs
  14. +27 −4 src/Main.hs
  15. +1 −0 src/Map.hs
  16. +64 −20 src/NFA.hs
  17. +12 −15 src/Output.hs
  18. +21 −8 src/ParseMonad.hs
  19. +79 −66 src/Scan.hs
  20. +4 −4 src/Scan.x
  21. +1 −0 src/Set.hs
  22. +36 −0 src/UTF8.hs
  23. +19 −11 templates/GenericTemplate.hs
  24. +64 −25 templates/wrappers.hs
  25. +8 −5 tests/Makefile
  26. +82 −0 tests/unicode.x
View
@@ -0,0 +1,37 @@
*.hi
*.prof
*~
*.o
*.info
/AlexTemplate-debug.bak
/AlexTemplate-ghc-debug.bak
/AlexTemplate-ghc.bak
/AlexTemplate.bak
/AlexWrapper-basic-bytestring.bak
/AlexWrapper-basic.bak
/AlexWrapper-gscan.bak
/AlexWrapper-monad-bytestring.bak
/AlexWrapper-monad.bak
/AlexWrapper-monadUserState-bytestring.bak
/AlexWrapper-monadUserState.bak
/AlexWrapper-posn-bytestring.bak
/AlexWrapper-posn.bak
/AlexWrapper-strict-bytestring.bak
/AlexTemplate
/AlexTemplate-debug
/AlexTemplate-ghc
/AlexTemplate-ghc-debug
/AlexWrapper-basic
/AlexWrapper-basic-bytestring
/AlexWrapper-gscan
/AlexWrapper-monad
/AlexWrapper-monad-bytestring
/AlexWrapper-monadUserState
/AlexWrapper-monadUserState-bytestring
/AlexWrapper-posn
/AlexWrapper-posn-bytestring
/AlexWrapper-strict-bytestring
/dist
/src/Data/Ranged/Boundaries.hs~
/src/Data/Ranged/RangedSet.hs~
/src/Data/Ranged/Ranges.hs~
View
@@ -0,0 +1,9 @@
Note:
The contents of package Ranged-sets-0.3.0 has been copied into this
package, in order to allow it to be part of the Haskell Platform,
without introducing additional dependencies.
The original license agreement has been included in the src/Data
subdirectory, as required by the package source.
View
@@ -10,15 +10,9 @@ stability: stable
homepage: http://www.haskell.org/alex/
synopsis: Alex is a tool for generating lexical analysers in Haskell
category: Development
cabal-version: >= 1.2
-- Later, this isn't compatible with Cabal 1.2:
-- source-repository head
-- type: darcs
-- location: http://darcs.haskell.org/alex/
cabal-version: >= 1.6
build-type: Custom
extra-source-files:
ANNOUNCE
README
@@ -58,6 +52,10 @@ extra-source-files:
tests/tokens_gscan.x
tests/tokens_posn.x
source-repository head
type: git
location: http://darcs.haskell.org/alex.git
flag small_base
description: Choose the new smaller, split-up base package.
@@ -71,13 +69,16 @@ executable alex
build-depends: base >= 1.0
build-depends: base < 5
-- build-depends: Ranged-sets
build-depends: QuickCheck >=2
extensions: CPP
ghc-options: -Wall
ghc-options: -Wall -rtsopts
other-modules:
AbsSyn
CharSet
DFA
DFAMin
DFS
Info
Main
@@ -90,3 +91,4 @@ executable alex
Set
Sort
Util
UTF8
View
@@ -23,6 +23,7 @@ module AbsSyn (
import CharSet ( CharSet )
import Map ( Map )
import qualified Map hiding ( Map )
import Data.IntMap (IntMap)
import Sort ( nub' )
import Util ( str, nl )
@@ -66,6 +67,7 @@ data RightContext r
= NoRightContext
| RightContextRExp r
| RightContextCode Code
deriving (Eq,Ord)
instance Show RECtx where
showsPrec _ (RECtx scs _ r rctx code) =
@@ -95,16 +97,19 @@ data DFA s a = DFA
dfa_states :: Map s (State s a)
}
data State s a = State [Accept a] (Map Char s)
data State s a = State { state_acc :: [Accept a],
state_out :: IntMap s -- 0..255 only
}
type SNum = Int
data Accept a
= Acc { accPrio :: Int,
accAction :: Maybe a,
accLeftCtx :: Maybe CharSet,
accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point.
accRightCtx :: RightContext SNum
}
deriving (Eq,Ord)
-- debug stuff
instance Show (Accept a) where
View
@@ -11,6 +11,16 @@
-- ----------------------------------------------------------------------------}
module CharSet (
setSingleton,
Encoding(..),
Byte,
ByteSet,
byteSetSingleton,
byteRanges,
byteSetRange,
CharSet, -- abstract
emptyCharSet,
charSetSingleton,
@@ -19,39 +29,141 @@ module CharSet (
charSetComplement,
charSetRange,
charSetUnion,
charSetToArray,
charSetElems
charSetQuote,
setUnions,
byteSetToArray,
byteSetElems,
byteSetElem
) where
import Data.Array ( Array, array )
import Data.Array
import Data.Ranged
import Data.Word
import Data.Maybe (catMaybes)
import Data.Char (chr,ord)
import UTF8
type Byte = Word8
-- Implementation as functions
type CharSet = Char -> Bool
type CharSet = RSet Char
type ByteSet = RSet Byte
-- type Utf8Set = RSet [Byte]
type Utf8Range = Span [Byte]
data Encoding = Latin1 | UTF8
emptyCharSet :: CharSet
emptyCharSet = const False
emptyCharSet = rSetEmpty
byteSetElem :: ByteSet -> Byte -> Bool
byteSetElem = rSetHas
charSetSingleton :: Char -> CharSet
charSetSingleton c = \x -> x == c
charSetSingleton = rSingleton
setSingleton :: DiscreteOrdered a => a -> RSet a
setSingleton = rSingleton
charSet :: [Char] -> CharSet
charSet s x = x `elem` s
charSet = setUnions . fmap charSetSingleton
charSetMinus :: CharSet -> CharSet -> CharSet
charSetMinus s1 s2 x = s1 x && not (s2 x)
charSetMinus = rSetDifference
charSetUnion :: CharSet -> CharSet -> CharSet
charSetUnion s1 s2 x = s1 x || s2 x
charSetUnion = rSetUnion
setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
setUnions = foldr rSetUnion rSetEmpty
charSetComplement :: CharSet -> CharSet
charSetComplement s1 = not . s1
charSetComplement = rSetNegation
charSetRange :: Char -> Char -> CharSet
charSetRange c1 c2 x = x >= c1 && x <= c2
charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
byteSetToArray :: ByteSet -> Array Byte Bool
byteSetToArray set = array (fst (head ass), fst (last ass)) ass
where ass = [(c,rSetHas set c) | c <- [0..0xff]]
byteSetElems :: ByteSet -> [Byte]
byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]
charToRanges :: Encoding -> CharSet -> [Utf8Range]
charToRanges Latin1 =
map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan False)
. rSetRanges
charToRanges UTF8 =
concat -- Span [Byte]
. fmap toUtfRange -- [Span [Byte]]
. fmap (fmap UTF8.encode) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan True)
. rSetRanges
-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
toUtfRange :: Span [Byte] -> [Span [Byte]]
toUtfRange (Span x y) = fix x y
fix :: [Byte] -> [Byte] -> [Span [Byte]]
fix x y
| length x == length y = [Span x y]
| length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y
| length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
| length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
| otherwise = error "fix: incorrect input given"
byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
byteRangeToBytePair (Span x y) = (x,y)
data Span a = Span a a -- lower bound inclusive, higher bound exclusive
-- (SDM: upper bound inclusive, surely??)
instance Functor Span where
fmap f (Span x y) = Span (f x) (f y)
charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
where l b = case b of
BoundaryBelowAll -> '\0'
BoundaryBelow a -> a
BoundaryAbove a -> succ a
BoundaryAboveAll -> error "panic: charRangeToCharSpan"
h b = case b of
BoundaryBelowAll -> error "panic: charRangeToCharSpan"
BoundaryBelow a -> pred a
BoundaryAbove a -> a
BoundaryAboveAll | uni -> chr 0x10ffff
| otherwise -> chr 0xff
byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
byteRanges enc = fmap byteRangeToBytePair . charToRanges enc
byteSetRange :: Byte -> Byte -> ByteSet
byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
byteSetSingleton :: Byte -> ByteSet
byteSetSingleton = rSingleton
instance DiscreteOrdered Word8 where
adjacent x y = x + 1 == y
adjacentBelow 0 = Nothing
adjacentBelow x = Just (x-1)
charSetToArray :: CharSet -> Array Char Bool
charSetToArray set = array (fst (head ass), fst (last ass)) ass
where ass = [(c,set c) | c <- ['\0'..'\xff']]
-- TODO: More efficient generated code!
charSetQuote :: CharSet -> String
charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")"
where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
quoteL (BoundaryAbove a) = "c > " ++ show a
quoteL (BoundaryBelow a) = "c >= " ++ show a
quoteL (BoundaryAboveAll) = "False"
quoteL (BoundaryBelowAll) = "True"
quoteH (BoundaryAbove a) = "c <= " ++ show a
quoteH (BoundaryBelow a) = "c < " ++ show a
quoteH (BoundaryAboveAll) = "True"
quoteH (BoundaryBelowAll) = "False"
charSetElems :: CharSet -> [Char]
charSetElems set = [c | c <- ['\0'..'\xff'], set c]
View
@@ -17,6 +17,7 @@ module DFA(scanner2dfa) where
import AbsSyn
import qualified Map
import qualified Data.IntMap as IntMap
import NFA
import Sort ( msort, nub' )
import CharSet
@@ -88,8 +89,8 @@ type StartCode = Int
-- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'.
scanner2dfa:: Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa scanner scs = nfa2dfa scs (scanner2nfa scanner scs)
scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
@@ -110,30 +111,27 @@ nfa2pdfa nfa pdfa (ss:umkd)
| ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd
| otherwise = nfa2pdfa nfa pdfa' umkd'
where
pdfa' = add_pdfa ss (State accs (Map.fromList ss_outs)) pdfa
pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
umkd' = rctx_sss ++ map snd ss_outs ++ umkd
-- for each character, the set of states that character would take
-- us to from the current set of states in the NFA.
ss_outs :: [(Char, StateSet)]
ss_outs = [ (ch, mk_ss nfa ss')
| ch <- dfa_alphabet,
let ss' = [ s' | (p,s') <- outs, p ch ],
not (null ss')
]
ss_outs :: [(Int, StateSet)]
ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
| ch <- byteSetElems $ setUnions [p | (p,_) <- outs],
let ss' = [ s' | (p,s') <- outs, byteSetElem p ch ],
not (null ss')
]
rctx_sss = [ mk_ss nfa [s]
| Acc _ _ _ (RightContextRExp s) <- accs ]
outs :: [(CharSet,SNum)]
outs :: [(ByteSet,SNum)]
outs = [ out | s <- ss, out <- nst_outs (nfa!s) ]
accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
dfa_alphabet:: [Char]
dfa_alphabet = ['\0'..'\255']
-- `sort_accs' sorts a list of accept values into decending order of priority,
-- eliminating any elements that follow an unconditional accept value.
@@ -198,7 +196,7 @@ mk_int_dfa nfa (DFA start_states mp)
cnv :: State StateSet a -> State SNum a
cnv (State accs as) = State accs' as'
where
as' = Map.mapWithKey (\_ch s -> lookup' s) as
as' = IntMap.mapWithKey (\_ch s -> lookup' s) as
accs' = map cnv_acc accs
cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
Oops, something went wrong.

0 comments on commit 892688f

Please sign in to comment.