/
Matching.hs
97 lines (82 loc) · 3.79 KB
/
Matching.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts #-}
module Text.RegExp.Matching where
import Data.Semiring
import Text.RegExp.Data
import Text.RegExp.Matching.Leftmost.Type
import Text.RegExp.Matching.Longest.Type
import Text.RegExp.Matching.LeftLong.Type
-- |
-- Checks whether a regular expression matches the given word. For
-- example, @acceptFull (fromString \"b|abc\") \"b\"@ yields @True@
-- because the first alternative of @b|abc@ matches the string
-- @\"b\"@.
--
acceptFull :: RegExp c -> [c] -> Bool
acceptFull r = fullMatch r
-- |
-- Checks whether a regular expression matches a subword of the given
-- word. For example, @acceptPartial (fromString \"b\") \"abc\"@
-- yields @True@ because @\"abc\"@ contains the substring @\"b\"@.
--
acceptPartial :: RegExp c -> [c] -> Bool
acceptPartial r = partialMatch r
-- |
-- Computes in how many ways a word can be matched against a regular
-- expression.
--
matchingCount :: Num a => RegExp c -> [c] -> a
matchingCount r = getNumeric . fullMatch r
{-# SPECIALIZE matchingCount :: RegExp c -> [c] -> Int #-}
-- |
-- Matches a regular expression against a word computing a weight in
-- an arbitrary semiring.
--
-- The symbols can have associated weights associated by the
-- 'symWeight' function of the 'Weight' class. This function also
-- allows to adjust the type of the used alphabet such that, for
-- example, positional information can be taken into account by
-- 'zip'ping the word with positions.
--
fullMatch :: Weight a b w => RegExp a -> [b] -> w
fullMatch (RegExp r) = matchW (weighted r)
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Bool #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Numeric Int #-}
{-# SPECIALIZE fullMatch :: Num a => RegExp c -> [c] -> Numeric a #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Longest #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [(Int,c)] -> LeftLong #-}
-- |
-- Matches a regular expression against substrings of a word computing
-- a weight in an arbitrary semiring. Similar to 'fullMatch' the
-- 'Weight' class is used to associate weights to the symbols of the
-- regular expression.
--
partialMatch :: Weight a b w => RegExp a -> [b] -> w
partialMatch (RegExp r) = matchW (arb `seqW` weighted r `seqW` arb)
where RegExp arb = rep anySym
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Bool #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Numeric Int #-}
{-# SPECIALIZE partialMatch :: Num a => RegExp c -> [c] -> Numeric a #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Longest #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [(Int,c)] -> LeftLong #-}
matchW :: Semiring w => RegW w c -> [c] -> w
matchW r [] = empty r
matchW r (c:cs) = final (foldl (shiftW zero) (shiftW one r c) cs)
{-# SPECIALIZE matchW :: RegW Bool c -> [c] -> Bool #-}
{-# SPECIALIZE matchW :: RegW (Numeric Int) c -> [c] -> Numeric Int #-}
{-# SPECIALIZE matchW :: Num a => RegW (Numeric a) c -> [c] -> Numeric a #-}
{-# SPECIALIZE matchW :: RegW Leftmost (Int,c) -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE matchW :: RegW Longest c -> [c] -> Longest #-}
{-# SPECIALIZE matchW :: RegW LeftLong (Int,c) -> [(Int,c)] -> LeftLong #-}
shiftW :: Semiring w => w -> RegW w c -> c -> RegW w c
shiftW w r c | active r || w /= zero = shift w (reg r) c
| otherwise = r
shift :: Semiring w => w -> Reg w c -> c -> RegW w c
shift _ Eps _ = epsW
shift w (Sym s f) c = let w' = w .*. f c
in (symW s f) { active = w' /= zero, final_ = w' }
shift w (Alt p q) c = altW (shiftW w p c) (shiftW w q c)
shift w (Seq p q) c = seqW (shiftW w p c)
(shiftW (w .*. empty p .+. final p) q c)
shift w (Rep r) c = repW (shiftW (w .+. final r) r c)