In [5]:
from collections import defaultdict, deque
from random import choice, randint
from string import ascii_letters, digits, whitespace

Rozwiązanie wykorzystuje niedeterministyczny automat skończony, którego stany przeszukiwane są algorytmem DFS.

In [25]:
class FA:
    r"""Small regex-like matcher driven by a nondeterministic finite automaton.

    Syntax understood by `preprocess`:
      * literal characters,
      * character sets ``[abc]`` (``\d``, ``\w``, ``\s`` may appear inside),
      * groups ``( ... )``,
      * the postfix modifiers ``?``, ``+`` and ``*``,
      * ``\d`` (digits), ``\w`` (ASCII letters, digits, underscore), ``\s``
        (whitespace); any other escaped character is taken literally.

    Attributes:
        pattern: nested representation of the pattern returned by `preprocess`.
        table:   list with one ``defaultdict(set)`` per state; ``table[s][c]``
                 is the set of states reachable from ``s`` on character ``c``.
                 State 0 may additionally hold the pseudo-key ``'skip'`` with
                 the states reachable without consuming input when the
                 pattern starts with a ``?``/``*`` element.
    """

    def __init__(self, pattern):
        """Preprocess `pattern` (a string) and build its transition table."""
        # Nested representation: characters, sets, and modifier-prefixed lists.
        self.pattern = self.preprocess(pattern)
        self.build_table(self.pattern)

    def build_table(self, pattern):
        """Build ``self.table`` from a preprocessed pattern.

        State ``i`` means "``i`` consuming elements have been read"; every
        character or set adds the transition ``i -> i + 1`` and a new state.
        A modifier is remembered together with the state at which its block
        starts and is applied once the block (a nested list) is exhausted.
        """
        self.table = [defaultdict(set)]
        iterators = deque()  # suspended outer iterators while a nested list is walked
        modifiers = deque()  # pending (modifier, first state of its block) pairs
        current_iter = iter(pattern)

        i = 0
        while True:
            try:
                element = next(current_iter)
            except StopIteration:
                if not iterators:
                    break
                # A nested block just ended: resume the outer level and apply
                # the modifier that was registered right before the block.
                current_iter = iterators.pop()
                modifier, first = modifiers.pop()
                last = i - 1

                if modifier in "+*":
                    # Repetition: for every key leaving `first`, state `last`
                    # also moves to first + 1 and last + 1; a one-element
                    # block additionally loops on itself.
                    # NOTE(review): for blocks longer than one element this
                    # does not look like "repeat the whole block" - confirm.
                    for key in self.table[first].keys():
                        self.table[last][key].add(first + 1)
                        self.table[last][key].add(last + 1)
                        if last == first:
                            self.table[last][key].add(first)

                if modifier in "?*":
                    # Skipping: the state just before the block may jump
                    # straight past it; at the very start of the pattern this
                    # is recorded under the 'skip' pseudo-key of state 0.
                    if first > 0:
                        for key in self.table[first - 1].keys():
                            self.table[first - 1][key].add(last + 1)
                    else:
                        self.table[0]['skip'].add(i)
                continue

            if isinstance(element, list):
                iterators.append(current_iter)
                current_iter = iter(element)
            elif isinstance(element, set):
                for letter in element:
                    self.table[i][letter].add(i + 1)
                i += 1
                self.table.append(defaultdict(set))
            elif element in "?+*":
                modifiers.append((element, i))
            else:  # ordinary character
                self.table[i][element].add(i + 1)
                i += 1
                self.table.append(defaultdict(set))

    def find(self, text):
        """Search `text` for the pattern with a depth-first walk of the automaton.

        Returns a list of ``[start, end]`` lists; both indices are inclusive.
        Empty matches are never reported and an empty pattern yields ``[]``.

        After a match starting at ``s`` the search restarts at ``s + 1``; after
        a failed attempt it restarts right after the last examined index.
        The walk stops as soon as a non-final state is popped at the end of
        `text`.
        NOTE(review): that early stop can abandon branches still on the
        stack - confirm it is intended.
        """
        final_state = len(self.table) - 1
        if final_state == 0:
            # Empty pattern: there is nothing to match (the original code
            # indexed an empty match list here).
            return []

        start = -1  # -1 means "no match attempt in progress"
        matches = []
        stack = deque([(0, 0)])  # (index of next letter, state)

        while True:
            idx, state = stack.pop()

            if state == final_state:
                current = matches[-1]
                if idx - 1 >= current[0]:
                    # Keep the longest end seen for this start; zero-length
                    # hits (possible only through 'skip') are ignored.
                    current[-1] = max(current[-1], idx - 1)
                if not stack:
                    start = -1
                    restart = current[0] + 1
                    if current[-1] == -1:
                        matches.pop()
                    # Restart one past the match start (overlapping search).
                    stack.append((restart, 0))
            else:
                if idx >= len(text):
                    break
                letter = text[idx]
                transitions = self.table[state]
                # .get() keeps lookups from inserting keys into the table.
                states = transitions.get(letter, ())
                if start == -1:
                    skipped = transitions.get('skip', ())
                    if states or skipped:
                        start = idx
                        matches.append([start, -1])
                        for next_state in states:
                            stack.append((idx + 1, next_state))
                        # 'skip' targets are entered without consuming input.
                        for next_state in skipped:
                            stack.append((idx, next_state))
                else:
                    for next_state in states:
                        stack.append((idx + 1, next_state))

            if not stack:
                # Dead end: drop the unfinished match and try the next index.
                start = -1
                if matches and matches[-1][-1] == -1:
                    matches.pop()
                stack.append((idx + 1, 0))

        if matches and matches[-1][-1] == -1:
            matches.pop()
        return matches

    @staticmethod
    def preprocess(pattern):
        r"""Turn a pattern string into the nested form used by `build_table`.

        The result is a list whose items are single characters, sets of
        characters (from ``[...]``, ``\d``, ``\w``, ``\s`` or an escaped
        literal) and lists (blocks governed by a modifier). A modifier is
        placed immediately *before* the list it applies to. A group that is
        not followed by a modifier is flattened into its parent.

        Raises:
            ValueError: if the pattern starts with a modifier or ends with a
                lone backslash.
        """
        divided_pattern = []
        stack = deque()  # pending items while inside '(' or '['
        i = 0
        while i < len(pattern):
            letter = pattern[i]
            if letter in "?*+":
                if len(stack) == 0:
                    if not divided_pattern:
                        raise ValueError(
                            f"modifier {letter!r} at position {i} has nothing to apply to")
                    tmp = divided_pattern[-1]
                    if not isinstance(tmp, list):
                        tmp = [tmp]
                    divided_pattern[-1] = letter
                    divided_pattern.append(tmp)
                else:
                    tmp = stack.pop()
                    if not isinstance(tmp, list):
                        tmp = [tmp]
                    stack.append(letter)
                    stack.append(tmp)
            elif letter in '([':
                stack.append(letter)
            elif letter == ')':
                letter = stack.pop()
                letters = []
                while letter != '(':
                    letters.append(letter)
                    letter = stack.pop()
                letters.reverse()
                if i < len(pattern) - 1 and pattern[i + 1] in "?*+":
                    # A modifier follows: keep the group as one block.
                    if len(stack) == 0:
                        divided_pattern.append(letters)
                    else:
                        stack.append(letters)
                else:
                    # No modifier: the parentheses carry no meaning.
                    if len(stack) == 0:
                        divided_pattern += letters
                    else:
                        for item in letters:
                            stack.append(item)
            elif letter == ']':
                members = set()
                item = stack.pop()
                while item != '[':
                    if isinstance(item, set):
                        # A class such as \d inside the brackets: merge it.
                        members |= item
                    else:
                        members.add(item)
                    item = stack.pop()
                if members:
                    if len(stack) == 0:
                        divided_pattern.append(members)
                    else:
                        stack.append(members)
            elif letter == '\\':
                i += 1
                if i >= len(pattern):
                    raise ValueError("pattern ends with a dangling backslash")
                letter = pattern[i]
                appendto = stack
                if len(stack) == 0:
                    appendto = divided_pattern
                if letter == 'd':
                    appendto.append(set(digits))
                elif letter == 'w':
                    appendto.append(set(ascii_letters + digits + "_"))
                elif letter == 's':
                    appendto.append(set(whitespace))
                else:
                    # Escaped literal; a one-element set keeps characters
                    # like '+' or '(' from being read as syntax later on.
                    appendto.append({letter})
            else:
                if len(stack) == 0:
                    divided_pattern.append(letter)
                else:
                    stack.append(letter)
            i += 1

        return divided_pattern

# Przykłady działania

Rezultatem jest lista dwuelementowych list `[początek, koniec]` – każda z nich zawiera indeksy (włącznie) początku i końca znalezionego wystąpienia wzorca.

In [10]:
def test(pattern, texts):
    """Build an automaton for `pattern` and print what it finds in each text.

    Every text is shown cut to its first 10 characters (followed by "..."
    when it was longer) next to the list returned by `FA.find`.
    """
    matcher = FA(pattern)
    for sample in texts:
        ellipsis = "..." if len(sample) > 10 else ""
        print(f"{sample[:10]}{ellipsis} | Result: {matcher.find(sample)}")

## Nawiasy \[\]

In [11]:
# Character set in the middle of a pattern: "a", then one of a/b/c, then "q".
test("a[abc]q", ["qaaqqqq", "abqa", "aabc", "aq", "aaqabq"])

qaaqqqq | Result: [[1, 3]]
abqa | Result: [[0, 2]]
aabc | Result: []
aq | Result: []
aaqabq | Result: [[0, 2], [3, 5]]


In [12]:
# Pattern made of a single character set.
test("[abcd]", ["abcdeabcde"])

abcdeabcde | Result: [[0, 0], [1, 1], [2, 2], [3, 3], [5, 5], [6, 6], [7, 7], [8, 8]]


## Modyfikatory + ? *

In [13]:
# "*" applied to a single character placed between two literal "a"s.
test("aa*a", ["aaaaaaa"])

aaaaaaa | Result: [[0, 6]]


In [14]:
# Two starred characters in one pattern.
test("ab*cd*d", ["abcdd", "abbbcc" + "acdddddddd"])

abcdd | Result: [[0, 4]]
abbbccacdd... | Result: [[6, 15]]


In [15]:
# "+" applied to a single character.
test("ab+c", ["bbacq", "qq" + "abbbc"])

bbacq | Result: []
qqabbbc | Result: [[2, 6]]


In [16]:
# "?" applied to a single character.
test("ab?c", ["ac" + "abc " + "abbc"])

acabc abbc | Result: [[0, 1], [2, 4]]


In [17]:
# "+" applied to a parenthesised group.
# NOTE(review): the recorded result has no entry for the trailing "abcdbcdc"
# (indices 7-14) - confirm whether repeating a multi-character group works.
test("a(bcd)+c", ["ac" + "abcdc" + "abcdbcdc"])

acabcdcabc... | Result: [[2, 6]]


## Grupy znaków \d \s \w

In [18]:
# Raw string for the pattern: \d, \s and \w are not valid Python string
# escapes, so a plain literal triggers an invalid-escape warning. The texts
# stay regular strings because their \t must remain a real tab.
test(r"\dq\s\sq\w", ["3q \tqr", "0q\t\tqs"])

3q 	qr | Result: [[0, 5]]
0q		qs | Result: [[0, 5]]


## Różne

In [29]:
# Character sets combined with "+" and "*".
test("[ac]+b[ad]*", ["accabaaaaabdaad"])

accabaaaaa... | Result: [[0, 9], [5, 14]]


In [19]:
# Optional character followed by a repeated character set.
test("aa?[cde]+b", ["aaedeb", "ee" + "aaedeb" + "aaedeb", 
                    "ee" + "aaedeb" + "aaededddb" + "aaa"])

aaedeb | Result: [[0, 5]]
eeaaedebaa... | Result: [[2, 7], [8, 13]]
eeaaedebaa... | Result: [[2, 7], [8, 16]]


In [20]:
# Optional character set nested inside a starred group.
# NOTE(review): for the last text the recorded result has no entry for the
# "abeb" part (indices 4-7) - confirm whether that is intended.
test("a(b[cde]?b)*", ["abb" + "abbb" + "cde",  "bb" + "abdb" + "ee", "abcb",
                      "abcb" + "abeb" + "abeeb", ])

abbabbbcde | Result: [[0, 2], [3, 5]]
bbabdbee | Result: [[2, 5]]
abcb | Result: [[0, 3]]
abcbabebab... | Result: [[0, 3], [8, 8]]


In [21]:
# Raw string for the pattern: "\d" is not a valid Python string escape, so a
# plain literal triggers an invalid-escape warning.
test(r"\d+", ["214" + "qq" + "45621361"])

214qq45621... | Result: [[0, 2], [5, 12]]
