In [1]:
from tqdm import tqdm
import numpy as np

# Przygotowanie danych

In [2]:
inputs = ["bbb$", "aabbabd", "ababcd", "abcbccd"]
inputs = [string if string[-1]=="$" else string+"$" for string in inputs]
inputs

['bbb$', 'aabbabd$', 'ababcd$', 'abcbccd$']

In [3]:
with open("1997_714.txt", "r") as file:
    filestring = file.read()
    if filestring[-1] != "$":
        filestring += "$"
    inputs.append(filestring)

# Trie

In [4]:
class trie_node():
    def __init__(self, label, parent=None, link=None):
        if parent:
            depth = parent.depth + 1
        else:
            depth = 0
        self.children = {}
        self.label = label
        self.depth = depth
        self.parent = parent
        if parent:
            parent.add_kid(self)
        self.link = link
        
    def add_kid(self, kid):
        self.children[kid.label] = kid

class suffix_trie():
    def __init__(self, text):
        self.root = trie_node("")
        parent = self.root
        for letter in text:
            parent = trie_node(letter, parent=parent) 
        self.leafs = [parent]
            
    def find(self, suffix, leaf=None):
        head = self.root
        for letter in suffix:
            try:
                head = head.children[letter]
            except KeyError:
                break
        return head
            
    def graft(self, head, suffix):
        parent = head
        for letter in suffix:
             parent = trie_node(letter, parent=parent) 
        self.leafs.append(parent)
        
    def leaf_suffix(self, leaf):
        letters = []
        while leaf.label != "":
            letters.append(leaf.label)
            leaf = leaf.parent
        return "".join(letters[::-1])
    
    def child(self, letter):
        return self.children[letter]
    
    def write(self):
        for leaf in self.leafs:
            print(self.leaf_suffix(leaf).__repr__())

In [5]:
def build_trie(text):
    trie=suffix_trie(text) # creating initial trie
    leaf=trie.leafs[0]
    for i in range(1,len(text)):
        suffix=text[i:]
        head=trie.find(suffix,leaf)
        suffix_end=suffix[head.depth:]
        leaf=trie.graft(head, suffix_end)
    return trie

In [6]:
for string in inputs[:-1]:
    print(f"suffixes for input: '{string}'")
    build_trie(string).write()
    print()
    
#tree = build_tree(inputs[-1][-4000:])

suffixes for input: 'bbb$'
'bbb$'
'bb$'
'b$'
'$'

suffixes for input: 'aabbabd$'
'aabbabd$'
'abbabd$'
'bbabd$'
'babd$'
'abd$'
'bd$'
'd$'
'$'

suffixes for input: 'ababcd$'
'ababcd$'
'babcd$'
'abcd$'
'bcd$'
'cd$'
'd$'
'$'

suffixes for input: 'abcbccd$'
'abcbccd$'
'bcbccd$'
'cbccd$'
'bccd$'
'ccd$'
'cd$'
'd$'
'$'



Trie jest konstruowane poprawnie (na krótkich przykłądach jest to widoczne). Dla długiego tekstu ilość wymaganej pamięci jest zbyt duża, aby można było zbudować trie (tekst długości kilku tysięcy znaków zajmuje jako trie kilka gb).

# Algorytm McCreighta

In [7]:
class suffix_node():
    def __init__(self, start, parent=None, text=None):
        if parent:
            self.depth = parent.depth + len(parent.text) - start # number of letters in suffix up to this point (includes first letter of this node
            self.text = parent.text
            self.size = len(parent.text) - start
            self.parent=parent
        else:
            # root
            assert text is not None, "initialize with parent or text"
            self.text = text
            depth = 0
            self.depth = 0
            self.size = 0
            self.parent = self
        self.children = {}
        self.start = start # label start
        assert  parent is None or self.label()[-1]=="$"
        self.link = None
        if parent:
            if parent.text[self.start] not in parent.children.keys():
                parent.add_kid(self)
                assert self.label()[-1]=="$"
            else:
                parent.break_path(self)
                assert self.label()[-1]!="$"
                
    def show_info(self):
        print(f"start:{self.start}|depth:{self.depth}|size:{self.size}")
        
    def add_kid(self, new_kid):
        assert self.text[new_kid.start] not in self.children.keys()
        self.children[self.text[new_kid.start]] = new_kid
        return new_kid
    
    def break_path(self, new_kid):
        old_kid = self.children[self.text[new_kid.start]]
        self.children[self.text[new_kid.start]] = new_kid
        old_kid.parent = new_kid
        equal_part = 0
        for i in range(min(new_kid.size, old_kid.size)):
            if self.text[new_kid.start + i] == self.text[old_kid.start + i]:
                equal_part += 1
            else:
                break
        new_kid.start = old_kid.start
        old_kid.start += equal_part
        old_kid.size -= equal_part
        new_kid.size = equal_part
        new_kid.depth = self.depth + equal_part
        new_kid.children[self.text[old_kid.start]] = old_kid
        assert old_kid.size > 0
            
    def suffix(self):
        suff_start = self.start + self.size - self.depth
        suff_end = self.start + self.size
        return self.text[suff_start: suff_end]
    
    def label(self):
        return self.text[self.start:self.start+self.size]
    
    def slow_find(self,label, start, matched=0):
        try:
            child = self.children[label[0]]
        except KeyError:
            return self, matched
        for i in range(1, child.size):
            if(child.text[child.start + i]!=label[i]):
                #print(f"slow_start = {start+i}")
                return suffix_node(start, parent=self), matched+i # break path
        return child.slow_find(label[child.size:], start+child.size, matched=matched+child.size)
        
    def fast_find(self,label, start):
        child = self.children[label[0]]
        if len(label)>child.size:
            return child.fast_find(label[child.size:], start+child.size)
        elif len(label)==child.size:
            return child
        else:
            return suffix_node(start, parent=self)  # break path

class suffix_tree():
    def __init__(self, text):
        self.root = suffix_node(start=0, text=text)
        parent = self.root
        self.leafs = [suffix_node(start=0, parent=parent)]
            
    def graft(self, node, suffix, sibling=None):
        if node in self.leafs:
            self.leafs.remove(node)
        start = len(node.text) - len(suffix)
        leaf = suffix_node(start, parent=node)
        if node==self.root:
            leaf.suffix_start=start
        self.leafs.append(leaf)
        return leaf
    
    def write(self):
        print("LEAFS")
        for leaf in self.leafs:
            print(leaf.suffix().__repr__())
        print("\nALL NODES:")
        self.write_children(self.root) 
        
    def contains(self, string):
        if(string[-1]!="$"):
            string += "$"
        node = self.root
        while string:
            try:
                node = node.children[string[0]]
            except KeyError:
                return False
            if node.label() == string[:node.size]:
                string = string[node.size:]
            else:
                return False
        return True
        
    @staticmethod
    def write_children(node):
        print()
        for k, v in node.children.items():
            print(f"st{v.start}|siz{v.size}|d{v.depth}|{v.label().__repr__()}")
        for k, v in node.children.items():
            suffix_tree.write_children(v)

In [8]:
def mc_creight(text, slow=False, use_tqdm=True):
    tree = suffix_tree(text)
    last_head = head = tree.root #tree.leafs[0]
    node=tree.root
    leaf=tree.leafs[0]
    text_length=len(text)
    standard = 0
    iterator = range(1,text_length)
    if use_tqdm:
        iterator = tqdm(iterator, position=0) 
    for i in iterator:
        suffix=text[i:]
       # print(f"SUFFIX >{suffix}<")
        if len(suffix)<=leaf.size or slow:
            # nie ma czego linkować
            head, matched = tree.root.slow_find(suffix, i)
            leaf = tree.graft(head, suffix[matched:])
            assert leaf.label()[-1]=="$"
            last_head = head
        elif(len(suffix)<=leaf.size+head.size or not head.parent.link):
            # można linkować, ale trzeba wiedzieć do czego
            head, matched = tree.root.slow_find(suffix, i)
            last_leaf_size = leaf.size
            leaf = tree.graft(head, suffix[matched:])
            assert leaf.label()[-1]=="$"
            if last_leaf_size==leaf.size:
                last_head.link = head
            else:
                pass # break path
          #  last_head.link = head # do czego linkowac?????
            last_head = head
        else:
            # standardowa procedura
            standard+=1
            leaf_label=leaf.label()
            assert leaf.label()[-1]=="$"
            head_label=head.label()
            node = head.parent.link
            try:
                node = node.fast_find(head_label, i + node.depth)
            except KeyError:
                print(suffix[:20].__repr__())
                head.show_info()
                node.show_info()
                return tree
            if(len(node.children)==0):
                head = node
            else:
                head, matched = node.slow_find(leaf_label, i + len(suffix) - len(leaf_label))

            leaf=tree.graft(head, leaf_label[matched:])
            last_head.link = node
            last_head = head
        tree.standard = standard
    return tree

text = inputs[-1]
tree = mc_creight(text)

100%|██████████| 246472/246472 [07:53<00:00, 520.28it/s]


## sprawdzenie poprawności drzewa suffiksów

In [9]:
for i in range(len(inputs[-1])): # poprawność ustawy
    assert tree.contains(inputs[-1][i:]), "suffix tree is not complete"

In [10]:
for string in inputs[:-1]: # poprawność reszty danych wejściowych
    print(string[:10].__repr__())
    tree = mc_creight(string, use_tqdm=False)
    print("testing...")
    for i in range(len(string)):
        assert tree.contains(string[i:]), "suffix tree is not complete"

'bbb$'
testing...
'aabbabd$'
testing...
'ababcd$'
testing...
'abcbccd$'
testing...


# test czasów wykonania

## testy na krótkich danych

#### trie

In [11]:
%%timeit
for string in inputs[:-1]:
    build_trie(string)

117 µs ± 2.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### algorytm McCreighta bez linków i fast find

In [12]:
%%timeit
for string in inputs[:-1]:
    mc_creight(string, slow=True, use_tqdm=False)

213 µs ± 39.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


#### algorytm McCreighta bez ograniczeń

In [13]:
%%timeit
for string in inputs[:-1]:
    mc_creight(string, use_tqdm=False)

276 µs ± 15.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Na krótkich przykładach najszybsze jest trie, chociaż róznica nie jest duża. Algorytm McCreighta na tych danych wykonuje dodatkową pracę, która nie ma okazji stać się użyteczną przez długość tekstów.

## test na ustawie

#### trie

In [14]:
%%timeit
build_trie(inputs[-1][-2000:])

2.47 s ± 486 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### algorytm McCreighta bez linków i fast find

In [19]:
%%timeit
mc_creight(inputs[-1][-2000:], slow=True, use_tqdm=False)

52.9 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### algorytm McCreighta bez ograniczeń

In [20]:
%%timeit
mc_creight(inputs[-1][-2000:], use_tqdm=False)

40.8 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Przy dłuższych tekstach różnica w wymaganych przez algorytmy zasobach jest bardzo znacząca. Test na części tekstu ustawy pokazuje, że algorytm McCreighta jest około 50x szybszy. Różnica między algorytmem McCreighta z i bez linków jest widoczna, ale nie tak bardzo znacząca.  
Ważniejszą różnicą niż czas wykonania wydaje się być ilość wymaganej przez algorytmy pamięci. Zbudowanie trie z całego tekstu ustawy na moim sprzęcie okazało się niemożliwe - zabrakło pamięci (dostępne było około 8GB). Z kolei algorytm McCreighta poradził sobie bez większych problemów (zgodnie z oczekiwaniami).