In [2]:
from collections import defaultdict, deque
from random import choice, randint
from string import ascii_letters, digits, whitespace

In [3]:
class FA():
    r"""Finite automaton that searches text for a simplified regular expression.

    Syntax handled by `preprocess` and `build_table`: literal characters,
    `.` (any character), character classes `[...]`, groups `(...)`, the
    modifiers `?`, `+`, `*` applied to the preceding element or group, and
    the escapes `\d`, `\w`, `\s`.

    Attributes:
        pattern: the pattern as returned by `preprocess` (nested lists,
            characters and sets).
        table: list of transition rows, one per state.  A row maps an input
            character to the next state (0 means "no transition") and may
            hold a "skip" entry: the state that can be entered from this one
            without consuming a character.
    """

    def __init__(self, pattern):
        # Represent the pattern as lists of lists, characters and sets.
        self.pattern = self.preprocess(pattern)
        self.build_table(self.pattern)

    def build_table(self, pattern):
        """Build the transition table `self.table` from a preprocessed pattern.

        Every character, `.` or character class adds one state.  A modifier
        is remembered together with the state at which its group starts and
        is applied once the group (a nested list) has been fully consumed.
        """
        self.table = [defaultdict(lambda: 0)]
        iterators = deque()  # suspended iterators of the enclosing lists
        modifiers = deque()  # pending (modifier, first state of its group)
        current_iter = iter(pattern)

        i = 0  # state currently being filled in
        while True:
            try:
                element = next(current_iter)
            except StopIteration:
                if not iterators:
                    break
                current_iter = iterators.pop()

                modifier, index = modifiers.pop()
                if index == i:
                    # Empty group: there is nothing to repeat or to skip, and
                    # a skip pointing at its own state could never advance.
                    continue
                first_row = self.table[index]
                if modifier in ("+", "*"):
                    # Loop back: after the group, its first symbols lead to
                    # the state that follows the group's first state.  A
                    # group starting with `.` has no explicit keys to copy.
                    for key in list(first_row):
                        self.table[i][key] = index + 1
                if modifier in ("?", "*"):
                    # The whole group may be skipped without reading input.
                    first_row["skip"] = i
                continue

            if isinstance(element, list):
                iterators.append(current_iter)
                current_iter = iter(element)
                continue
            if isinstance(element, set):
                for letter in element:
                    self.table[i][letter] = i + 1
            elif element in "?+*":
                modifiers.append((element, i))
                continue
            elif element == ".":
                # Bind the target now: a plain `lambda: i + 1` would read `i`
                # when called, i.e. after the table has been completed.
                # Entries already on this row (loop-backs) are kept.
                self.table[i] = defaultdict(
                    lambda target=i + 1: target, self.table[i])
            else:  # normal character
                self.table[i][element] = i + 1
            i += 1
            self.table.append(defaultdict(lambda: 0))

    def _step(self, state, symbol):
        """Return the state reached from `state` on `symbol` (0 if none).

        Reads the row without inserting missing keys, so searching never
        modifies the table.
        """
        row = self.table[state]
        if symbol in row:
            return row[symbol]
        return row.default_factory()

    def find(self, text):
        """Find the pattern in `text`.

        Returns a list of `(start, end)` tuples with the indices of the first
        and the last character of every match (both inclusive).  The text is
        scanned left to right without backtracking: after a failed attempt
        the search continues with the next character.  A match is also
        reported as soon as the state before the final one can skip to the
        end, so trailing optional elements are not consumed.
        """
        final = len(self.table) - 1
        matches = []
        start = -1  # first character of the current attempt, -1 when idle
        state = 0
        i = 0
        while i < len(text):
            pos = i
            next_state = self._step(state, text[pos])
            if next_state == 0:
                next_state = self.table[state].get("skip", 0)
                if next_state != 0 and next_state != final:
                    # Nothing was consumed: read text[pos] again from the
                    # state reached through the skip.
                    i -= 1
            if start == -1 and next_state != 0:
                # Use `pos`, not `i`: `i` may just have been decremented.
                start = pos
            state = next_state

            if state == final or (
                    state == final - 1
                    and self.table[state].get("skip", 0) != 0):
                if start != -1 and start <= i:
                    matches.append((start, i))
                else:
                    # Empty match (or empty pattern): report nothing and step
                    # over text[pos] so the scan moves on.
                    i = pos
                state = 0
            if state == 0:
                start = -1
            i += 1

        return matches

    @staticmethod
    def preprocess(pattern):
        r"""Change the representation of `pattern` from a string to nested lists.

        The result contains single characters, sets (character classes and
        the escapes `\d`, `\w`, `\s`) and lists.  A modifier (`?`, `*`, `+`)
        is placed directly BEFORE the list holding the element or group it
        applies to.  Parentheses that are not followed by a modifier are
        flattened.  Escapes other than `\d`, `\w`, `\s` are dropped.

        Raises IndexError for a leading modifier, an unbalanced `)` or `]`,
        or a trailing backslash.
        """
        divided_pattern = []
        stack = deque()  # elements collected inside open `(` / `[`
        i = 0
        while i < len(pattern):
            letter = pattern[i]
            if letter in "?*+":
                # Swap the modifier in front of the element it follows.
                if not stack:
                    tmp = divided_pattern[-1]
                    if not isinstance(tmp, list):
                        tmp = [tmp]
                    divided_pattern[-1] = letter
                    divided_pattern.append(tmp)
                else:
                    tmp = stack.pop()
                    if not isinstance(tmp, list):
                        tmp = [tmp]
                    stack.append(letter)
                    stack.append(tmp)
            elif letter in '([':
                stack.append(letter)
            elif letter == ')':
                letter = stack.pop()
                letters = []
                while letter != '(':
                    letters.append(letter)
                    letter = stack.pop()
                letters.reverse()
                if i < len(pattern) - 1 and pattern[i+1] in "?*+":
                    # The group is followed by a modifier: keep it as a block.
                    if not stack:
                        divided_pattern.append(letters)
                    else:
                        stack.append(letters)
                else:
                    # No modifier follows: the parentheses change nothing.
                    if not stack:
                        divided_pattern += letters
                    else:
                        stack.extend(letters)
            elif letter == ']':
                letter = stack.pop()
                letters = []
                while letter != '[':
                    letters.append(letter)
                    letter = stack.pop()
                if letters:
                    if not stack:
                        divided_pattern.append(set(letters))
                    else:
                        stack.append(set(letters))
            elif letter == '\\':
                i += 1
                letter = pattern[i]
                appendto = stack if stack else divided_pattern
                if letter == 'd':
                    appendto.append(set(digits))
                elif letter == 'w':
                    appendto.append(set(ascii_letters + digits + "_"))
                elif letter == 's':
                    appendto.append(set(whitespace))
            else:
                if not stack:
                    divided_pattern.append(letter)
                else:
                    stack.append(letter)
            i += 1

        return divided_pattern

# Przykłady działania

Rezultatem jest lista krotek – każda z nich zawiera indeks początku i indeks końca (włącznie) znalezionego wystąpienia wzorca.

In [11]:
def test(pattern, texts):
    """Build an automaton for `pattern` and print the matches in every text.

    Each text is shown truncated to its first 10 characters (followed by
    "..." when it is longer) next to the list returned by `FA.find`.
    """
    automaton = FA(pattern)
    for text in texts:
        suffix = "..." if len(text) > 10 else ""
        print(f"{text[:10]}{suffix} | Result: {automaton.find(text)}")

## Nawiasy \[\]

In [12]:
# Character class: 'a', then one character out of a/b/c, then 'q'.
test("a[abc]q", ["qaaqqqq", "abqa", "aabc", "aq", "aaqabq"])

qaaqqqq | Result: [(1, 3)]
abqa | Result: [(0, 2)]
aabc | Result: []
aq | Result: []
aaqabq | Result: [(0, 2), (3, 5)]


## Modyfikatory + ? *

In [13]:
# '*' lets the preceding character be skipped or repeated.
test("ab*cd*", ["abcd", "aadacacabbbbcdd"])

abcd | Result: [(0, 2)]
aadacacabb... | Result: [(3, 4), (5, 6), (7, 12)]


In [14]:
# '+' lets the preceding character repeat but not be skipped.
test("ab+c", ["bbacacq", "qqabbbc"])

bbacacq | Result: []
qqabbbc | Result: [(2, 6)]


In [15]:
# '?' lets the preceding character be skipped.
test("ab?c", ["acabcabbc"])

acabcabbc | Result: [(0, 1), (2, 4)]


In [16]:
# A modifier placed after ')' applies to the whole parenthesised group.
test("a(bcd)+c", ["ac" + "abcdc" + "abcdbcdc"])

acabcdcabc... | Result: [(2, 6), (7, 14)]


## Grupy znaków \d \s \w

In [17]:
# Raw string: \d, \s and \w are not valid Python escapes, so a plain literal
# only works by accident (and warns).  The texts keep "\t" as a real tab.
test(r"\dq\s\sq\w", ["3q \tqr", "0q\t\tqs"])

3q 	qr | Result: [(0, 5)]
0q		qs | Result: [(0, 5)]


## Różne

In [18]:
# Combination: an optional character followed by a repeated character class.
test("aa?[cde]+b", ["aaedeb", "ee" + "aaedeb" + "aaedeb", 
                    "ee" + "aaedeb" + "aaededddb" + "aaa"])

aaedeb | Result: [(0, 5)]
eeaaedebaa... | Result: [(2, 7), (8, 13)]
eeaaedebaa... | Result: [(2, 7), (8, 16)]


In [19]:
# Nested constructs: an optional character class inside a starred group.
test("a(b[cde]?b)*", ["abb" + "abbb" + "cde",  "bb" + "abdb" + "ee", "abcb",
                      "abcb" + "abeb" + "abeeb", ])

abbabbbcde | Result: [(0, 2), (3, 5)]
bbabdbee | Result: [(2, 5)]
abcb | Result: [(0, 3)]
abcbabebab... | Result: [(0, 3), (4, 7)]


In [20]:
# Raw string: \d is not a valid Python escape sequence in a plain literal.
test(r"\d+q", ["214qqq45621361q"])

214qqq4562... | Result: [(0, 3), (6, 14)]
