Permalink
Browse files

UTF-8 support from Jean-Philippe Bernardy and Alan Zimmerman.

Alex will now, by default, parse a UTF-8-encoded byte sequence.  (to
disable this behaviour, there is a new flag --latin1, which we will be
using in GHC).

Because parsing UTF-8 generates a large state machine, I added DFA
minimisation (source file DFAMin.hs).  This makes Alex itself somewhat
slower, but the generated lexers are smaller - dramatically so for
UTF-8 lexers.
  • Loading branch information...
1 parent 5d6e749 commit 892688ff71bcc4b74313f7348ee5dace73dd8506 @simonmar committed Jul 11, 2011
Showing with 1,889 additions and 202 deletions.
  1. +37 −0 .gitignore
  2. +9 −0 NOTE.txt
  3. +11 −9 alex.cabal
  4. +7 −2 src/AbsSyn.hs
  5. +128 −16 src/CharSet.hs
  6. +12 −14 src/DFA.hs
  7. +150 −0 src/DFAMin.hs
  8. +30 −0 src/Data/LICENSE.txt
  9. +9 −0 src/Data/Ranged.hs
  10. +229 −0 src/Data/Ranged/Boundaries.hs
  11. +486 −0 src/Data/Ranged/RangedSet.hs
  12. +360 −0 src/Data/Ranged/Ranges.hs
  13. +3 −3 src/Info.hs
  14. +27 −4 src/Main.hs
  15. +1 −0 src/Map.hs
  16. +64 −20 src/NFA.hs
  17. +12 −15 src/Output.hs
  18. +21 −8 src/ParseMonad.hs
  19. +79 −66 src/Scan.hs
  20. +4 −4 src/Scan.x
  21. +1 −0 src/Set.hs
  22. +36 −0 src/UTF8.hs
  23. +19 −11 templates/GenericTemplate.hs
  24. +64 −25 templates/wrappers.hs
  25. +8 −5 tests/Makefile
  26. +82 −0 tests/unicode.x
View
@@ -0,0 +1,37 @@
+*.hi
+*.prof
+*~
+*.o
+*.info
+/AlexTemplate-debug.bak
+/AlexTemplate-ghc-debug.bak
+/AlexTemplate-ghc.bak
+/AlexTemplate.bak
+/AlexWrapper-basic-bytestring.bak
+/AlexWrapper-basic.bak
+/AlexWrapper-gscan.bak
+/AlexWrapper-monad-bytestring.bak
+/AlexWrapper-monad.bak
+/AlexWrapper-monadUserState-bytestring.bak
+/AlexWrapper-monadUserState.bak
+/AlexWrapper-posn-bytestring.bak
+/AlexWrapper-posn.bak
+/AlexWrapper-strict-bytestring.bak
+/AlexTemplate
+/AlexTemplate-debug
+/AlexTemplate-ghc
+/AlexTemplate-ghc-debug
+/AlexWrapper-basic
+/AlexWrapper-basic-bytestring
+/AlexWrapper-gscan
+/AlexWrapper-monad
+/AlexWrapper-monad-bytestring
+/AlexWrapper-monadUserState
+/AlexWrapper-monadUserState-bytestring
+/AlexWrapper-posn
+/AlexWrapper-posn-bytestring
+/AlexWrapper-strict-bytestring
+/dist
+/src/Data/Ranged/Boundaries.hs~
+/src/Data/Ranged/RangedSet.hs~
+/src/Data/Ranged/Ranges.hs~
View
@@ -0,0 +1,9 @@
+Note:
+
+The contents of package Ranged-sets-0.3.0 has been copied into this
+package, in order to allow it to be part of the Haskell Platform,
+without introducing additional dependencies.
+
+The original license agreement has been included in the src/Data
+subdirectory, as required by the package source.
+
View
@@ -10,15 +10,9 @@ stability: stable
homepage: http://www.haskell.org/alex/
synopsis: Alex is a tool for generating lexical analysers in Haskell
category: Development
-cabal-version: >= 1.2
-
--- Later, this isn't compatible with Cabal 1.2:
--- source-repository head
--- type: darcs
--- location: http://darcs.haskell.org/alex/
-
-
+cabal-version: >= 1.6
build-type: Custom
+
extra-source-files:
ANNOUNCE
README
@@ -58,6 +52,10 @@ extra-source-files:
tests/tokens_gscan.x
tests/tokens_posn.x
+source-repository head
+ type: git
+ location: http://darcs.haskell.org/alex.git
+
flag small_base
description: Choose the new smaller, split-up base package.
@@ -71,13 +69,16 @@ executable alex
build-depends: base >= 1.0
build-depends: base < 5
+ -- build-depends: Ranged-sets
+ build-depends: QuickCheck >=2
extensions: CPP
- ghc-options: -Wall
+ ghc-options: -Wall -rtsopts
other-modules:
AbsSyn
CharSet
DFA
+ DFAMin
DFS
Info
Main
@@ -90,3 +91,4 @@ executable alex
Set
Sort
Util
+ UTF8
View
@@ -23,6 +23,7 @@ module AbsSyn (
import CharSet ( CharSet )
import Map ( Map )
import qualified Map hiding ( Map )
+import Data.IntMap (IntMap)
import Sort ( nub' )
import Util ( str, nl )
@@ -66,6 +67,7 @@ data RightContext r
= NoRightContext
| RightContextRExp r
| RightContextCode Code
+ deriving (Eq,Ord)
instance Show RECtx where
showsPrec _ (RECtx scs _ r rctx code) =
@@ -95,16 +97,19 @@ data DFA s a = DFA
dfa_states :: Map s (State s a)
}
-data State s a = State [Accept a] (Map Char s)
+data State s a = State { state_acc :: [Accept a],
+ state_out :: IntMap s -- 0..255 only
+ }
type SNum = Int
data Accept a
= Acc { accPrio :: Int,
accAction :: Maybe a,
- accLeftCtx :: Maybe CharSet,
+ accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point.
accRightCtx :: RightContext SNum
}
+ deriving (Eq,Ord)
-- debug stuff
instance Show (Accept a) where
View
@@ -11,6 +11,16 @@
-- ----------------------------------------------------------------------------}
module CharSet (
+ setSingleton,
+
+ Encoding(..),
+
+ Byte,
+ ByteSet,
+ byteSetSingleton,
+ byteRanges,
+ byteSetRange,
+
CharSet, -- abstract
emptyCharSet,
charSetSingleton,
@@ -19,39 +29,141 @@ module CharSet (
charSetComplement,
charSetRange,
charSetUnion,
- charSetToArray,
- charSetElems
+ charSetQuote,
+ setUnions,
+ byteSetToArray,
+ byteSetElems,
+ byteSetElem
) where
-import Data.Array ( Array, array )
+import Data.Array
+import Data.Ranged
+import Data.Word
+import Data.Maybe (catMaybes)
+import Data.Char (chr,ord)
+import UTF8
+type Byte = Word8
-- Implementation as functions
-type CharSet = Char -> Bool
+type CharSet = RSet Char
+type ByteSet = RSet Byte
+-- type Utf8Set = RSet [Byte]
+type Utf8Range = Span [Byte]
+
+data Encoding = Latin1 | UTF8
emptyCharSet :: CharSet
-emptyCharSet = const False
+emptyCharSet = rSetEmpty
+
+byteSetElem :: ByteSet -> Byte -> Bool
+byteSetElem = rSetHas
charSetSingleton :: Char -> CharSet
-charSetSingleton c = \x -> x == c
+charSetSingleton = rSingleton
+
+setSingleton :: DiscreteOrdered a => a -> RSet a
+setSingleton = rSingleton
charSet :: [Char] -> CharSet
-charSet s x = x `elem` s
+charSet = setUnions . fmap charSetSingleton
charSetMinus :: CharSet -> CharSet -> CharSet
-charSetMinus s1 s2 x = s1 x && not (s2 x)
+charSetMinus = rSetDifference
charSetUnion :: CharSet -> CharSet -> CharSet
-charSetUnion s1 s2 x = s1 x || s2 x
+charSetUnion = rSetUnion
+
+setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
+setUnions = foldr rSetUnion rSetEmpty
charSetComplement :: CharSet -> CharSet
-charSetComplement s1 = not . s1
+charSetComplement = rSetNegation
charSetRange :: Char -> Char -> CharSet
-charSetRange c1 c2 x = x >= c1 && x <= c2
+charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
+
+byteSetToArray :: ByteSet -> Array Byte Bool
+byteSetToArray set = array (fst (head ass), fst (last ass)) ass
+ where ass = [(c,rSetHas set c) | c <- [0..0xff]]
+
+byteSetElems :: ByteSet -> [Byte]
+byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]
+
+charToRanges :: Encoding -> CharSet -> [Utf8Range]
+charToRanges Latin1 =
+ map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
+ . catMaybes
+ . fmap (charRangeToCharSpan False)
+ . rSetRanges
+charToRanges UTF8 =
+ concat -- Span [Byte]
+ . fmap toUtfRange -- [Span [Byte]]
+ . fmap (fmap UTF8.encode) -- Span [Byte]
+ . catMaybes
+ . fmap (charRangeToCharSpan True)
+ . rSetRanges
+
+-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
+toUtfRange :: Span [Byte] -> [Span [Byte]]
+toUtfRange (Span x y) = fix x y
+
+fix :: [Byte] -> [Byte] -> [Span [Byte]]
+fix x y
+ | length x == length y = [Span x y]
+ | length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y
+ | length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
+ | length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
+ | otherwise = error "fix: incorrect input given"
+
+
+byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
+byteRangeToBytePair (Span x y) = (x,y)
+
+data Span a = Span a a -- lower bound inclusive, higher bound exclusive
+ -- (SDM: upper bound inclusive, surely??)
+instance Functor Span where
+ fmap f (Span x y) = Span (f x) (f y)
+
+charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
+charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
+charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
+charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
+ where l b = case b of
+ BoundaryBelowAll -> '\0'
+ BoundaryBelow a -> a
+ BoundaryAbove a -> succ a
+ BoundaryAboveAll -> error "panic: charRangeToCharSpan"
+ h b = case b of
+ BoundaryBelowAll -> error "panic: charRangeToCharSpan"
+ BoundaryBelow a -> pred a
+ BoundaryAbove a -> a
+ BoundaryAboveAll | uni -> chr 0x10ffff
+ | otherwise -> chr 0xff
+
+byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
+byteRanges enc = fmap byteRangeToBytePair . charToRanges enc
+
+byteSetRange :: Byte -> Byte -> ByteSet
+byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
+
+byteSetSingleton :: Byte -> ByteSet
+byteSetSingleton = rSingleton
+
+instance DiscreteOrdered Word8 where
+ adjacent x y = x + 1 == y
+ adjacentBelow 0 = Nothing
+ adjacentBelow x = Just (x-1)
-charSetToArray :: CharSet -> Array Char Bool
-charSetToArray set = array (fst (head ass), fst (last ass)) ass
- where ass = [(c,set c) | c <- ['\0'..'\xff']]
+-- TODO: More efficient generated code!
+charSetQuote :: CharSet -> String
+charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")"
+ where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
+ quoteL (BoundaryAbove a) = "c > " ++ show a
+ quoteL (BoundaryBelow a) = "c >= " ++ show a
+ quoteL (BoundaryAboveAll) = "False"
+ quoteL (BoundaryBelowAll) = "True"
+ quoteH (BoundaryAbove a) = "c <= " ++ show a
+ quoteH (BoundaryBelow a) = "c < " ++ show a
+ quoteH (BoundaryAboveAll) = "True"
+ quoteH (BoundaryBelowAll) = "False"
-charSetElems :: CharSet -> [Char]
-charSetElems set = [c | c <- ['\0'..'\xff'], set c]
View
@@ -17,6 +17,7 @@ module DFA(scanner2dfa) where
import AbsSyn
import qualified Map
+import qualified Data.IntMap as IntMap
import NFA
import Sort ( msort, nub' )
import CharSet
@@ -88,8 +89,8 @@ type StartCode = Int
-- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'.
-scanner2dfa:: Scanner -> [StartCode] -> DFA SNum Code
-scanner2dfa scanner scs = nfa2dfa scs (scanner2nfa scanner scs)
+scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
+scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
@@ -110,30 +111,27 @@ nfa2pdfa nfa pdfa (ss:umkd)
| ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd
| otherwise = nfa2pdfa nfa pdfa' umkd'
where
- pdfa' = add_pdfa ss (State accs (Map.fromList ss_outs)) pdfa
+ pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
umkd' = rctx_sss ++ map snd ss_outs ++ umkd
-- for each character, the set of states that character would take
-- us to from the current set of states in the NFA.
- ss_outs :: [(Char, StateSet)]
- ss_outs = [ (ch, mk_ss nfa ss')
- | ch <- dfa_alphabet,
- let ss' = [ s' | (p,s') <- outs, p ch ],
- not (null ss')
- ]
+ ss_outs :: [(Int, StateSet)]
+ ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
+ | ch <- byteSetElems $ setUnions [p | (p,_) <- outs],
+ let ss' = [ s' | (p,s') <- outs, byteSetElem p ch ],
+ not (null ss')
+ ]
rctx_sss = [ mk_ss nfa [s]
| Acc _ _ _ (RightContextRExp s) <- accs ]
- outs :: [(CharSet,SNum)]
+ outs :: [(ByteSet,SNum)]
outs = [ out | s <- ss, out <- nst_outs (nfa!s) ]
accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
-dfa_alphabet:: [Char]
-dfa_alphabet = ['\0'..'\255']
-
-- `sort_accs' sorts a list of accept values into decending order of priority,
-- eliminating any elements that follow an unconditional accept value.
@@ -198,7 +196,7 @@ mk_int_dfa nfa (DFA start_states mp)
cnv :: State StateSet a -> State SNum a
cnv (State accs as) = State accs' as'
where
- as' = Map.mapWithKey (\_ch s -> lookup' s) as
+ as' = IntMap.mapWithKey (\_ch s -> lookup' s) as
accs' = map cnv_acc accs
cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
Oops, something went wrong.

0 comments on commit 892688f

Please sign in to comment.