Skip to content

Commit

Permalink
UTF-8 support from Jean-Philippe Bernardy and Alan Zimmerman.
Browse files Browse the repository at this point in the history
Alex will now, by default, parse a UTF-8-encoded byte sequence.  (to
disable this behaviour, there is a new flag --latin1, which we will be
using in GHC).

Because parsing UTF-8 generates a large state machine, I added DFA
minimisation (source file DFAMin.hs).  This makes Alex itself somewhat
slower, but the generated lexers are smaller - dramatically so for
UTF-8 lexers.
  • Loading branch information
simonmar committed Jul 11, 2011
1 parent 5d6e749 commit 892688f
Show file tree
Hide file tree
Showing 26 changed files with 1,889 additions and 202 deletions.
37 changes: 37 additions & 0 deletions .gitignore
@@ -0,0 +1,37 @@
*.hi
*.prof
*~
*.o
*.info
/AlexTemplate-debug.bak
/AlexTemplate-ghc-debug.bak
/AlexTemplate-ghc.bak
/AlexTemplate.bak
/AlexWrapper-basic-bytestring.bak
/AlexWrapper-basic.bak
/AlexWrapper-gscan.bak
/AlexWrapper-monad-bytestring.bak
/AlexWrapper-monad.bak
/AlexWrapper-monadUserState-bytestring.bak
/AlexWrapper-monadUserState.bak
/AlexWrapper-posn-bytestring.bak
/AlexWrapper-posn.bak
/AlexWrapper-strict-bytestring.bak
/AlexTemplate
/AlexTemplate-debug
/AlexTemplate-ghc
/AlexTemplate-ghc-debug
/AlexWrapper-basic
/AlexWrapper-basic-bytestring
/AlexWrapper-gscan
/AlexWrapper-monad
/AlexWrapper-monad-bytestring
/AlexWrapper-monadUserState
/AlexWrapper-monadUserState-bytestring
/AlexWrapper-posn
/AlexWrapper-posn-bytestring
/AlexWrapper-strict-bytestring
/dist
/src/Data/Ranged/Boundaries.hs~
/src/Data/Ranged/RangedSet.hs~
/src/Data/Ranged/Ranges.hs~
9 changes: 9 additions & 0 deletions NOTE.txt
@@ -0,0 +1,9 @@
Note:

The contents of package Ranged-sets-0.3.0 has been copied into this
package, in order to allow it to be part of the Haskell Platform,
without introducing additional dependencies.

The original license agreement has been included in the src/Data
subdirectory, as required by the package source.

20 changes: 11 additions & 9 deletions alex.cabal
Expand Up @@ -10,15 +10,9 @@ stability: stable
homepage: http://www.haskell.org/alex/
synopsis: Alex is a tool for generating lexical analysers in Haskell
category: Development
cabal-version: >= 1.2

-- Later, this isn't compatible with Cabal 1.2:
-- source-repository head
-- type: darcs
-- location: http://darcs.haskell.org/alex/


cabal-version: >= 1.6
build-type: Custom

extra-source-files:
ANNOUNCE
README
Expand Down Expand Up @@ -58,6 +52,10 @@ extra-source-files:
tests/tokens_gscan.x
tests/tokens_posn.x

source-repository head
type: git
location: http://darcs.haskell.org/alex.git

flag small_base
description: Choose the new smaller, split-up base package.

Expand All @@ -71,13 +69,16 @@ executable alex
build-depends: base >= 1.0

build-depends: base < 5
-- build-depends: Ranged-sets
build-depends: QuickCheck >=2

extensions: CPP
ghc-options: -Wall
ghc-options: -Wall -rtsopts
other-modules:
AbsSyn
CharSet
DFA
DFAMin
DFS
Info
Main
Expand All @@ -90,3 +91,4 @@ executable alex
Set
Sort
Util
UTF8
9 changes: 7 additions & 2 deletions src/AbsSyn.hs
Expand Up @@ -23,6 +23,7 @@ module AbsSyn (
import CharSet ( CharSet )
import Map ( Map )
import qualified Map hiding ( Map )
import Data.IntMap (IntMap)
import Sort ( nub' )
import Util ( str, nl )

Expand Down Expand Up @@ -66,6 +67,7 @@ data RightContext r
= NoRightContext
| RightContextRExp r
| RightContextCode Code
deriving (Eq,Ord)

instance Show RECtx where
showsPrec _ (RECtx scs _ r rctx code) =
Expand Down Expand Up @@ -95,16 +97,19 @@ data DFA s a = DFA
dfa_states :: Map s (State s a)
}

data State s a = State [Accept a] (Map Char s)
data State s a = State { state_acc :: [Accept a],
state_out :: IntMap s -- 0..255 only
}

type SNum = Int

data Accept a
= Acc { accPrio :: Int,
accAction :: Maybe a,
accLeftCtx :: Maybe CharSet,
accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point.
accRightCtx :: RightContext SNum
}
deriving (Eq,Ord)

-- debug stuff
instance Show (Accept a) where
Expand Down
144 changes: 128 additions & 16 deletions src/CharSet.hs
Expand Up @@ -11,6 +11,16 @@
-- ----------------------------------------------------------------------------}

module CharSet (
setSingleton,

Encoding(..),

Byte,
ByteSet,
byteSetSingleton,
byteRanges,
byteSetRange,

CharSet, -- abstract
emptyCharSet,
charSetSingleton,
Expand All @@ -19,39 +29,141 @@ module CharSet (
charSetComplement,
charSetRange,
charSetUnion,
charSetToArray,
charSetElems
charSetQuote,
setUnions,
byteSetToArray,
byteSetElems,
byteSetElem
) where

import Data.Array ( Array, array )
import Data.Array
import Data.Ranged
import Data.Word
import Data.Maybe (catMaybes)
import Data.Char (chr,ord)
import UTF8

type Byte = Word8
-- Implementation as functions
type CharSet = Char -> Bool
type CharSet = RSet Char
type ByteSet = RSet Byte
-- type Utf8Set = RSet [Byte]
type Utf8Range = Span [Byte]

data Encoding = Latin1 | UTF8

emptyCharSet :: CharSet
emptyCharSet = const False
emptyCharSet = rSetEmpty

byteSetElem :: ByteSet -> Byte -> Bool
byteSetElem = rSetHas

charSetSingleton :: Char -> CharSet
charSetSingleton c = \x -> x == c
charSetSingleton = rSingleton

setSingleton :: DiscreteOrdered a => a -> RSet a
setSingleton = rSingleton

charSet :: [Char] -> CharSet
charSet s x = x `elem` s
charSet = setUnions . fmap charSetSingleton

charSetMinus :: CharSet -> CharSet -> CharSet
charSetMinus s1 s2 x = s1 x && not (s2 x)
charSetMinus = rSetDifference

charSetUnion :: CharSet -> CharSet -> CharSet
charSetUnion s1 s2 x = s1 x || s2 x
charSetUnion = rSetUnion

setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
setUnions = foldr rSetUnion rSetEmpty

charSetComplement :: CharSet -> CharSet
charSetComplement s1 = not . s1
charSetComplement = rSetNegation

charSetRange :: Char -> Char -> CharSet
charSetRange c1 c2 x = x >= c1 && x <= c2
charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]

byteSetToArray :: ByteSet -> Array Byte Bool
byteSetToArray set = array (fst (head ass), fst (last ass)) ass
where ass = [(c,rSetHas set c) | c <- [0..0xff]]

byteSetElems :: ByteSet -> [Byte]
byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]

charToRanges :: Encoding -> CharSet -> [Utf8Range]
charToRanges Latin1 =
map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan False)
. rSetRanges
charToRanges UTF8 =
concat -- Span [Byte]
. fmap toUtfRange -- [Span [Byte]]
. fmap (fmap UTF8.encode) -- Span [Byte]
. catMaybes
. fmap (charRangeToCharSpan True)
. rSetRanges

-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
toUtfRange :: Span [Byte] -> [Span [Byte]]
toUtfRange (Span x y) = fix x y

fix :: [Byte] -> [Byte] -> [Span [Byte]]
fix x y
| length x == length y = [Span x y]
| length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y
| length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
| length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
| otherwise = error "fix: incorrect input given"


byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
byteRangeToBytePair (Span x y) = (x,y)

data Span a = Span a a -- lower bound inclusive, higher bound exclusive
-- (SDM: upper bound inclusive, surely??)
instance Functor Span where
fmap f (Span x y) = Span (f x) (f y)

charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
where l b = case b of
BoundaryBelowAll -> '\0'
BoundaryBelow a -> a
BoundaryAbove a -> succ a
BoundaryAboveAll -> error "panic: charRangeToCharSpan"
h b = case b of
BoundaryBelowAll -> error "panic: charRangeToCharSpan"
BoundaryBelow a -> pred a
BoundaryAbove a -> a
BoundaryAboveAll | uni -> chr 0x10ffff
| otherwise -> chr 0xff

byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
byteRanges enc = fmap byteRangeToBytePair . charToRanges enc

byteSetRange :: Byte -> Byte -> ByteSet
byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]

byteSetSingleton :: Byte -> ByteSet
byteSetSingleton = rSingleton

instance DiscreteOrdered Word8 where
adjacent x y = x + 1 == y
adjacentBelow 0 = Nothing
adjacentBelow x = Just (x-1)

charSetToArray :: CharSet -> Array Char Bool
charSetToArray set = array (fst (head ass), fst (last ass)) ass
where ass = [(c,set c) | c <- ['\0'..'\xff']]
-- TODO: More efficient generated code!
charSetQuote :: CharSet -> String
charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")"
where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
quoteL (BoundaryAbove a) = "c > " ++ show a
quoteL (BoundaryBelow a) = "c >= " ++ show a
quoteL (BoundaryAboveAll) = "False"
quoteL (BoundaryBelowAll) = "True"
quoteH (BoundaryAbove a) = "c <= " ++ show a
quoteH (BoundaryBelow a) = "c < " ++ show a
quoteH (BoundaryAboveAll) = "True"
quoteH (BoundaryBelowAll) = "False"

charSetElems :: CharSet -> [Char]
charSetElems set = [c | c <- ['\0'..'\xff'], set c]
26 changes: 12 additions & 14 deletions src/DFA.hs
Expand Up @@ -17,6 +17,7 @@ module DFA(scanner2dfa) where

import AbsSyn
import qualified Map
import qualified Data.IntMap as IntMap
import NFA
import Sort ( msort, nub' )
import CharSet
Expand Down Expand Up @@ -88,8 +89,8 @@ type StartCode = Int
-- state of the partial DFA, until all possible state sets have been considered
-- The final DFA is then constructed with a `mk_dfa'.

scanner2dfa:: Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa scanner scs = nfa2dfa scs (scanner2nfa scanner scs)
scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)

nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
Expand All @@ -110,30 +111,27 @@ nfa2pdfa nfa pdfa (ss:umkd)
| ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd
| otherwise = nfa2pdfa nfa pdfa' umkd'
where
pdfa' = add_pdfa ss (State accs (Map.fromList ss_outs)) pdfa
pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa

umkd' = rctx_sss ++ map snd ss_outs ++ umkd

-- for each character, the set of states that character would take
-- us to from the current set of states in the NFA.
ss_outs :: [(Char, StateSet)]
ss_outs = [ (ch, mk_ss nfa ss')
| ch <- dfa_alphabet,
let ss' = [ s' | (p,s') <- outs, p ch ],
not (null ss')
]
ss_outs :: [(Int, StateSet)]
ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
| ch <- byteSetElems $ setUnions [p | (p,_) <- outs],
let ss' = [ s' | (p,s') <- outs, byteSetElem p ch ],
not (null ss')
]

rctx_sss = [ mk_ss nfa [s]
| Acc _ _ _ (RightContextRExp s) <- accs ]

outs :: [(CharSet,SNum)]
outs :: [(ByteSet,SNum)]
outs = [ out | s <- ss, out <- nst_outs (nfa!s) ]

accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]

dfa_alphabet:: [Char]
dfa_alphabet = ['\0'..'\255']

-- `sort_accs' sorts a list of accept values into decending order of priority,
-- eliminating any elements that follow an unconditional accept value.

Expand Down Expand Up @@ -198,7 +196,7 @@ mk_int_dfa nfa (DFA start_states mp)
cnv :: State StateSet a -> State SNum a
cnv (State accs as) = State accs' as'
where
as' = Map.mapWithKey (\_ch s -> lookup' s) as
as' = IntMap.mapWithKey (\_ch s -> lookup' s) as

accs' = map cnv_acc accs
cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
Expand Down

0 comments on commit 892688f

Please sign in to comment.