UTF-8 support from Jean-Philippe Bernardy and Alan Zimmerman.

Alex will now, by default, parse a UTF-8-encoded byte sequence. (to disable this behaviour, there is a new flag --latin1, which we will be using in GHC). Because parsing UTF-8 generates a large state machine, I added DFA minimisation (source file DFAMin.hs). This makes Alex itself somewhat slower, but the generated lexers are smaller - dramatically so for UTF-8 lexers.
haskell · Jul 11, 2011 · 892688f · 892688f
1 parent 5d6e749
commit 892688f
Show file tree

Hide file tree

Showing 26 changed files with 1,889 additions and 202 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,37 @@
+*.hi
+*.prof
+*~
+*.o
+*.info
+/AlexTemplate-debug.bak
+/AlexTemplate-ghc-debug.bak
+/AlexTemplate-ghc.bak
+/AlexTemplate.bak
+/AlexWrapper-basic-bytestring.bak
+/AlexWrapper-basic.bak
+/AlexWrapper-gscan.bak
+/AlexWrapper-monad-bytestring.bak
+/AlexWrapper-monad.bak
+/AlexWrapper-monadUserState-bytestring.bak
+/AlexWrapper-monadUserState.bak
+/AlexWrapper-posn-bytestring.bak
+/AlexWrapper-posn.bak
+/AlexWrapper-strict-bytestring.bak
+/AlexTemplate
+/AlexTemplate-debug
+/AlexTemplate-ghc
+/AlexTemplate-ghc-debug
+/AlexWrapper-basic
+/AlexWrapper-basic-bytestring
+/AlexWrapper-gscan
+/AlexWrapper-monad
+/AlexWrapper-monad-bytestring
+/AlexWrapper-monadUserState
+/AlexWrapper-monadUserState-bytestring
+/AlexWrapper-posn
+/AlexWrapper-posn-bytestring
+/AlexWrapper-strict-bytestring
+/dist
+/src/Data/Ranged/Boundaries.hs~
+/src/Data/Ranged/RangedSet.hs~
+/src/Data/Ranged/Ranges.hs~
diff --git a/NOTE.txt b/NOTE.txt
@@ -0,0 +1,9 @@
+Note:
+
+The contents of package Ranged-sets-0.3.0 has been copied into this
+package, in order to allow it to be part of the Haskell Platform,
+without introducing additional dependencies.
+
+The original license agreement has been included in the src/Data
+subdirectory, as required by the package source.
+
diff --git a/alex.cabal b/alex.cabal
@@ -10,15 +10,9 @@ stability: stable
 homepage: http://www.haskell.org/alex/
 synopsis: Alex is a tool for generating lexical analysers in Haskell
 category: Development
-cabal-version: >= 1.2
-
--- Later, this isn't compatible with Cabal 1.2:
--- source-repository head
---   type:     darcs
---   location: http://darcs.haskell.org/alex/
-
-
+cabal-version: >= 1.6
 build-type: Custom
+
 extra-source-files:
 	ANNOUNCE
 	README
@@ -58,6 +52,10 @@ extra-source-files:
 	tests/tokens_gscan.x
 	tests/tokens_posn.x
 
+source-repository head
+    type:     git
+    location: http://darcs.haskell.org/alex.git
+
 flag small_base
   description: Choose the new smaller, split-up base package.
 
@@ -71,13 +69,16 @@ executable alex
     build-depends: base >= 1.0
 
   build-depends: base < 5
+  -- build-depends: Ranged-sets
+  build-depends: QuickCheck >=2
 
   extensions: CPP
-  ghc-options: -Wall
+  ghc-options: -Wall -rtsopts
   other-modules:
         AbsSyn
         CharSet
         DFA
+        DFAMin
         DFS
         Info
         Main
@@ -90,3 +91,4 @@ executable alex
         Set
         Sort
         Util
+        UTF8
diff --git a/src/AbsSyn.hs b/src/AbsSyn.hs
@@ -23,6 +23,7 @@ module AbsSyn (
 import CharSet ( CharSet )
 import Map ( Map )
 import qualified Map hiding ( Map )
+import Data.IntMap (IntMap)
 import Sort ( nub' )
 import Util ( str, nl )
 
@@ -66,6 +67,7 @@ data RightContext r
   = NoRightContext 
   | RightContextRExp r
   | RightContextCode Code
+  deriving (Eq,Ord)
 
 instance Show RECtx where
   showsPrec _ (RECtx scs _ r rctx code) = 
@@ -95,16 +97,19 @@ data DFA s a = DFA
     dfa_states       :: Map s (State s a)
   }
 
-data State s a = State [Accept a] (Map Char s)
+data State s a = State { state_acc :: [Accept a],
+                         state_out :: IntMap s -- 0..255 only
+                       }
 
 type SNum = Int
 
 data Accept a
   = Acc { accPrio       :: Int,
 	  accAction     :: Maybe a,
-	  accLeftCtx    :: Maybe CharSet,
+	  accLeftCtx    :: Maybe CharSet, -- cannot be converted to byteset at this point.
 	  accRightCtx   :: RightContext SNum
     }
+    deriving (Eq,Ord)
 
 -- debug stuff
 instance Show (Accept a) where

diff --git a/src/CharSet.hs b/src/CharSet.hs
@@ -11,6 +11,16 @@
 -- ----------------------------------------------------------------------------}
 
 module CharSet (
+  setSingleton,
+
+  Encoding(..),
+
+  Byte,
+  ByteSet,
+  byteSetSingleton,
+  byteRanges,
+  byteSetRange,
+
   CharSet, -- abstract
   emptyCharSet,
   charSetSingleton,
@@ -19,39 +29,141 @@ module CharSet (
   charSetComplement,
   charSetRange,
   charSetUnion,
-  charSetToArray,
-  charSetElems
+  charSetQuote,
+  setUnions,
+  byteSetToArray,
+  byteSetElems,
+  byteSetElem
   ) where
 
-import Data.Array ( Array, array )
+import Data.Array
+import Data.Ranged
+import Data.Word
+import Data.Maybe (catMaybes)
+import Data.Char (chr,ord)
+import UTF8
 
+type Byte = Word8
 -- Implementation as functions
-type CharSet = Char -> Bool
+type CharSet = RSet Char
+type ByteSet = RSet Byte
+-- type Utf8Set = RSet [Byte]
+type Utf8Range = Span [Byte]
+
+data Encoding = Latin1 | UTF8
 
 emptyCharSet :: CharSet
-emptyCharSet = const False
+emptyCharSet = rSetEmpty
+
+byteSetElem :: ByteSet -> Byte -> Bool
+byteSetElem = rSetHas
 
 charSetSingleton :: Char -> CharSet
-charSetSingleton c = \x -> x == c
+charSetSingleton = rSingleton
+
+setSingleton :: DiscreteOrdered a => a -> RSet a
+setSingleton = rSingleton
 
 charSet :: [Char] -> CharSet
-charSet s x = x `elem` s
+charSet = setUnions . fmap charSetSingleton
 
 charSetMinus :: CharSet -> CharSet -> CharSet
-charSetMinus s1 s2 x = s1 x && not (s2 x)
+charSetMinus = rSetDifference
 
 charSetUnion :: CharSet -> CharSet -> CharSet
-charSetUnion s1 s2 x = s1 x || s2 x
+charSetUnion = rSetUnion
+
+setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
+setUnions = foldr rSetUnion rSetEmpty
 
 charSetComplement :: CharSet -> CharSet
-charSetComplement s1 = not . s1
+charSetComplement = rSetNegation
 
 charSetRange :: Char -> Char -> CharSet
-charSetRange c1 c2 x = x >= c1 && x <= c2
+charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
+
+byteSetToArray :: ByteSet -> Array Byte Bool
+byteSetToArray set = array (fst (head ass), fst (last ass)) ass
+  where ass = [(c,rSetHas set c) | c <- [0..0xff]]
+
+byteSetElems :: ByteSet -> [Byte]
+byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]
+
+charToRanges :: Encoding -> CharSet -> [Utf8Range]
+charToRanges Latin1 =
+    map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
+  . catMaybes
+  . fmap (charRangeToCharSpan False)
+  . rSetRanges
+charToRanges UTF8 =
+    concat                  -- Span [Byte]
+  . fmap toUtfRange         -- [Span [Byte]]
+  . fmap (fmap UTF8.encode) -- Span [Byte]
+  . catMaybes
+  . fmap (charRangeToCharSpan True)
+  . rSetRanges
+
+-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
+toUtfRange :: Span [Byte] -> [Span [Byte]]
+toUtfRange (Span x y) = fix x y
+
+fix :: [Byte] -> [Byte] -> [Span [Byte]]
+fix x y 
+    | length x == length y = [Span x y]
+    | length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y    
+    | length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
+    | length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
+    | otherwise = error "fix: incorrect input given"
+
+
+byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
+byteRangeToBytePair (Span x y) = (x,y)
+
+data Span a = Span a a -- lower bound inclusive, higher bound exclusive
+                       -- (SDM: upper bound inclusive, surely??)
+instance Functor Span where
+    fmap f (Span x y) = Span (f x) (f y)
+
+charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
+charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
+charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
+charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
+    where l b = case b of
+            BoundaryBelowAll -> '\0'
+            BoundaryBelow a  -> a
+            BoundaryAbove a  -> succ a
+            BoundaryAboveAll -> error "panic: charRangeToCharSpan"
+          h b = case b of
+            BoundaryBelowAll -> error "panic: charRangeToCharSpan"
+            BoundaryBelow a  -> pred a
+            BoundaryAbove a  -> a
+            BoundaryAboveAll | uni -> chr 0x10ffff
+                             | otherwise -> chr 0xff
+
+byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
+byteRanges enc =  fmap byteRangeToBytePair . charToRanges enc
+
+byteSetRange :: Byte -> Byte -> ByteSet
+byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
+
+byteSetSingleton :: Byte -> ByteSet
+byteSetSingleton = rSingleton
+
+instance DiscreteOrdered Word8 where
+    adjacent x y = x + 1 == y
+    adjacentBelow 0 = Nothing
+    adjacentBelow x = Just (x-1)
 
-charSetToArray :: CharSet -> Array Char Bool
-charSetToArray set = array (fst (head ass), fst (last ass)) ass
-  where ass = [(c,set c) | c <- ['\0'..'\xff']]
+-- TODO: More efficient generated code!
+charSetQuote :: CharSet -> String
+charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")" 
+    where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
+          quoteL (BoundaryAbove a) = "c > " ++ show a
+          quoteL (BoundaryBelow a) = "c >= " ++ show a
+          quoteL (BoundaryAboveAll) = "False"
+          quoteL (BoundaryBelowAll) = "True"
+          quoteH (BoundaryAbove a) = "c <= " ++ show a
+          quoteH (BoundaryBelow a) = "c < " ++ show a
+          quoteH (BoundaryAboveAll) = "True"
+          quoteH (BoundaryBelowAll) = "False"
 
-charSetElems :: CharSet -> [Char]
-charSetElems set = [c | c <- ['\0'..'\xff'], set c]
diff --git a/src/DFA.hs b/src/DFA.hs
@@ -17,6 +17,7 @@ module DFA(scanner2dfa) where
 
 import AbsSyn
 import qualified Map
+import qualified Data.IntMap as IntMap
 import NFA
 import Sort ( msort, nub' )
 import CharSet
@@ -88,8 +89,8 @@ type StartCode = Int
 -- state of the partial DFA, until all possible state sets have been considered
 -- The final DFA is then constructed with a `mk_dfa'.
 
-scanner2dfa:: Scanner -> [StartCode] -> DFA SNum Code
-scanner2dfa scanner scs = nfa2dfa scs (scanner2nfa scanner scs)
+scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
+scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
 
 nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
 nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
@@ -110,30 +111,27 @@ nfa2pdfa nfa pdfa (ss:umkd)
   |  ss `in_pdfa` pdfa =  nfa2pdfa nfa pdfa  umkd
   |  otherwise         =  nfa2pdfa nfa pdfa' umkd'
   where
-	pdfa' = add_pdfa ss (State accs (Map.fromList ss_outs)) pdfa
+        pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
 
 	umkd' = rctx_sss ++ map snd ss_outs ++ umkd
 
         -- for each character, the set of states that character would take
         -- us to from the current set of states in the NFA.
-        ss_outs :: [(Char, StateSet)]
-	ss_outs =  [ (ch, mk_ss nfa ss')
-		   | ch  <- dfa_alphabet,
-		     let ss'  = [ s' | (p,s') <- outs, p ch ],
-		     not (null ss')
-		   ]
+        ss_outs :: [(Int, StateSet)]
+        ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
+		  | ch  <- byteSetElems $ setUnions [p | (p,_) <- outs],
+		    let ss'  = [ s' | (p,s') <- outs, byteSetElem p ch ],
+		    not (null ss')
+		  ]
 
 	rctx_sss = [ mk_ss nfa [s]
 		   | Acc _ _ _ (RightContextRExp s) <- accs ]
 
-        outs :: [(CharSet,SNum)]
+        outs :: [(ByteSet,SNum)]
 	outs =  [ out | s <- ss, out <- nst_outs (nfa!s) ]
 
 	accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
 
-dfa_alphabet:: [Char]
-dfa_alphabet = ['\0'..'\255']
-
 -- `sort_accs' sorts a list of accept values into decending order of priority,
 -- eliminating any elements that follow an unconditional accept value.
 
@@ -198,7 +196,7 @@ mk_int_dfa nfa (DFA start_states mp)
 	cnv :: State StateSet a -> State SNum a
 	cnv (State accs as) = State accs' as'
 		where
-		as'   = Map.mapWithKey (\_ch s -> lookup' s) as
+                as'   = IntMap.mapWithKey (\_ch s -> lookup' s) as
 
 		accs' = map cnv_acc accs
 		cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'