## Table of contents

[Trie](#Trie)

[Simple McCreight algorithm](#sMcCreight)

[McCreight algorithm](#McCreight)

[Tests](#Tests)

### Imports

In [13]:
import random
import math
import string
from time import time
from queue import LifoQueue as Stack

<a id='Trie'></a>
## Trie

### Structures

In [14]:
class Trie:
    """Letter-per-edge trie; all nodes hang off a single root ``tNode``."""

    def __init__(self):
        # The root has no parent, an empty incoming letter and depth 0.
        self._root = tNode(None, "", 0)

    @property
    def root(self):
        """Root node of the trie (read-only)."""
        return self._root

    def find(self, word):
        """Return True when ``word`` can be spelled by walking down from the root.

        Any node reached counts as a hit, so every prefix of a stored path
        (the empty string included) is reported as found.
        """
        position = self.root
        for letter in word:
            if letter not in position.children:
                return False
            position = position.children[letter]

        return True


class tNode:
    """One trie node.

    Stores the child map (letter -> tNode), the parent node together with the
    letter on the edge coming from it, the depth passed in by the creator and
    an auxiliary link to another node (None until assigned).
    """

    def __init__(self, parent, par_letter, depth):
        self._parent = parent
        self._parent_letter = par_letter
        self._depth = depth

        self._link = None
        self._children = {}

    # --- read-only attributes ---------------------------------------------

    @property
    def children(self):
        """Mapping from letter to child node."""
        return self._children

    @property
    def parent(self):
        """Node this node was attached to (None for a root)."""
        return self._parent

    @property
    def parent_letter(self):
        """Letter on the edge from the parent to this node."""
        return self._parent_letter

    @property
    def depth(self):
        """Depth given at construction; children get this value plus one."""
        return self._depth

    # --- assignable link ----------------------------------------------------

    @property
    def link(self):
        """Linked node, or None when no link has been set."""
        return self._link

    @link.setter
    def link(self, target):
        self._link = target

    # --- operations ---------------------------------------------------------

    def add_child(self, letter):
        """Create a fresh child under ``letter``, replacing any existing one."""
        child = tNode(self, letter, self.depth + 1)
        self._children[letter] = child

    def print(self):
        """Print this node's child map, then recurse into every child."""
        print(self._children)
        for descendant in self._children.values():
            descendant.print()

### Functions

In [15]:
def up_link_down(sibling):
    """Climb to the nearest linked ancestor, follow its link, then walk back down.

    Starting at ``sibling`` the function moves up through ``parent`` pointers
    until it reaches a node whose ``link`` is set, remembering the letters of
    the edges it climbed. From the link target it then re-descends along those
    letters (most recently climbed first) for as long as matching children
    exist, and sets the ``link`` of every node it passes on the original path
    to the corresponding node on the linked path.

    Returns:
        ``(node, sibling)`` - the last node reached on the linked path and the
        matching node on the original path; ``(None, None)`` when no node on
        the way up (including the start) has a link.
    """
    # A plain list is enough for a local LIFO stack; queue.LifoQueue adds
    # locking that is pointless in this single-threaded helper.
    letters = []
    while sibling and not sibling.link:
        letters.append(sibling.parent_letter)
        sibling = sibling.parent

    if not sibling:
        return None, None

    node = sibling.link
    while letters:
        curr_letter = letters.pop()
        if curr_letter not in node.children:
            # the linked path ends here; the caller grafts the rest
            break

        node = node.children[curr_letter]
        sibling = sibling.children[curr_letter]
        sibling.link = node

    return node, sibling


def t_graft(node, text, sibling=None):
    """Hang ``text`` below ``node`` as a chain of new nodes, one per letter.

    When ``sibling`` is given it is advanced through its own children along the
    same letters, and each node visited that way gets its ``link`` pointed at
    the node just created for that letter.

    Returns the last node of the chain (``node`` itself for empty ``text``).
    """
    tail, twin = node, sibling
    for letter in text:
        tail.add_child(letter)
        tail = tail.children[letter]

        if not twin:
            continue
        twin = twin.children[letter]
        twin.link = tail

    return tail


def make_suffix_trie(word):
    """Build a trie containing every suffix of ``word`` and return it.

    The whole word is inserted first as a single chain. Each following suffix
    ``word[i:]`` is then added by calling ``up_link_down`` on the leaf created
    for the previous suffix to find the node to extend (``head``), and by
    grafting only the letters that are still missing below it.

    An empty ``word`` yields a trie consisting of the root only.
    """
    trie = Trie()
    if not word:
        # nothing to insert; the unguarded code would fail on word[0]
        return trie

    leaf = t_graft(trie.root, word)
    # the one-letter path links to the root
    trie.root.children[word[0]].link = trie.root
    for i in range(1, len(word)):
        head, sibling = up_link_down(leaf)
        if not head:
            # No linked node was found above the previous leaf: link the root
            # child for the previous suffix's first letter to the root, retry.
            sibling = trie.root.children[word[i-1]]
            sibling.link = trie.root
            head, sibling = up_link_down(leaf)

        # letters of word[i:] up to head.depth are already in the trie
        leaf = t_graft(head, word[i+head.depth:], sibling)

    return trie

<a id='sMcCreight'></a>
## Simple McCreight algorithm

In [16]:
class sTree:
    """Structure containing simplified suffix tree

    Edge labels are stored as inclusive ``(start, end)`` pairs of indices into
    ``word`` instead of copies of the text.
    """
    def __init__(self, word):
        self._root = sNode()
        self._word = word

    def add_suffix(self, suffix):
        """Insert ``suffix`` (expected to be a suffix of ``word``) into the tree.

        The suffix is matched letter by letter from the root (slow find):
        * if the walk stops at a node with no edge for the next letter, a new
          leaf edge holding the rest of the suffix is attached to that node;
        * if the first mismatch falls inside an edge, that edge is split and a
          new leaf is attached to the node created at the split point;
        * if the suffix runs out first (it is a prefix of a path already in the
          tree, which happens when the last letter of ``word`` is not unique),
          nothing is added: ``find`` already reports it as present.
        """
        text = self.word
        shift = len(text) - len(suffix)     # suffix[k] is text[shift + k]
        last_index = len(text) - 1

        curr_letter = 0
        curr_node = self.root
        while curr_letter < len(suffix):
            first = suffix[curr_letter]
            if first not in curr_node.children:
                # head ended at the node: making new edge from it to the leaf
                leaf_edge = sEdge((shift + curr_letter, last_index), sNode())
                curr_node.add_child(first, leaf_edge)
                return

            curr_edge = curr_node.children[first]
            start, end = curr_edge.interval
            for word_letter in range(start, end + 1):
                if curr_letter == len(suffix):
                    # suffix exhausted inside the edge: already represented
                    return

                if text[word_letter] != suffix[curr_letter]:
                    # head ended at implicit node, making real node
                    self._split_edge(curr_node, curr_edge, word_letter, shift + curr_letter)
                    return

                curr_letter += 1

            # whole edge matched, continue from the node below it
            curr_node = curr_edge.node

    def _split_edge(self, parent, edge, split_at, leaf_start):
        """Make the implicit node just before ``word[split_at]`` on ``edge`` real.

        The new node receives the lower part of ``edge`` and a new leaf edge
        labelled ``word[leaf_start:]``; ``parent`` keeps only the upper part.
        """
        text = self.word
        start, end = edge.interval

        branch = sNode()
        # edge from new node to the node at the end of the split edge
        branch.add_child(text[split_at], sEdge((split_at, end), edge.node))
        # edge from new node to the leaf made for the current suffix
        branch.add_child(text[leaf_start], sEdge((leaf_start, len(text) - 1), sNode()))

        # shortened edge: now it goes from the parent to the new node
        parent.add_child(text[start], sEdge((start, split_at - 1), branch))

    def find(self, word):
        """Return True if ``word`` labels a path starting at the root.

        The path may end in the middle of an edge; the empty string is always
        found.
        """
        curr_letter = 0
        curr_node = self.root
        while curr_letter < len(word):
            # no edge for the next letter: the common part ended in this node
            if word[curr_letter] not in curr_node.children:
                return False

            # edge going out from current node
            curr_edge = curr_node.children[word[curr_letter]]
            start, end = curr_edge.interval

            # checking current edge
            for word_letter in range(start, end + 1):
                if curr_letter == len(word):
                    return True

                if self.word[word_letter] != word[curr_letter]:
                    return False

                curr_letter += 1

            curr_node = curr_edge.node

        return True

    @property
    def root(self):
        """Root node of the tree."""
        return self._root

    @property
    def word(self):
        """Text the edge intervals refer to."""
        return self._word


class sNode:
    """Node of the simplified tree: a map from first letter to outgoing edge."""
    def __init__(self):
        self._children = {}

    def add_child(self, letter, edge):
        """Store ``edge`` under ``letter``, replacing any edge already there."""
        self._children[letter] = edge

    @property
    def children(self):
        """Mapping from first letter to outgoing ``sEdge``."""
        return self._children


class sEdge:
    """Edge of the simplified tree: label interval plus the node it leads to."""
    def __init__(self, interval, node):
        self._interval = interval
        self._node = node

    @property
    def interval(self):
        """Inclusive ``(start, end)`` indices of the label in the tree's word."""
        return self._interval

    @property
    def node(self):
        """Node at the lower end of the edge."""
        return self._node


def simple_mccreight(word):
    """Build an ``sTree`` for ``word`` by adding its suffixes, longest first."""
    tree = sTree(word)
    for start in range(len(word)):
        suffix = word[start:]
        tree.add_suffix(suffix)

    return tree

<a id='McCreight'></a>
## McCreight algorithm

### Structures

In [17]:
class Tree:
    """Structure containing suffix tree

    Edges carry inclusive ``(start, end)`` index pairs into ``word`` as labels.
    """
    def __init__(self, word):
        self._word = word
        self._root = Node(None, "")

    @property
    def root(self):
        """Root node of the tree."""
        return self._root

    @property
    def word(self):
        """Text the edge intervals refer to."""
        return self._word

    def split_edge(self, parent_node, top_interval):
        """Insert a node on the edge of ``parent_node`` starting at ``top_interval[0]``.

        The upper part of the old edge gets the label ``top_interval`` and ends
        in the new node; the rest of the old label hangs below the new node and
        leads to the old child, whose parent data are updated accordingly.
        """
        text = self.word
        upper_key = text[top_interval[0]]

        # edge that is being split
        old_edge = parent_node.children[upper_key]

        # new node, reached from the parent through the upper part
        middle = Node(parent_node, upper_key)
        parent_node.children[upper_key] = Edge(top_interval, middle)

        # lower part: from the new node to the former child of the parent
        lower_start = top_interval[1] + 1
        lower_edge = Edge((lower_start, old_edge.interval[1]), old_edge.node)
        middle.add_child(text[lower_start], lower_edge)
        old_edge.node.parent = middle
        old_edge.node.parent_first_letter = text[lower_start]

    def find(self, word):
        """Return True if ``word`` labels a path starting at the root.

        The path may end in the middle of an edge; the empty string is always
        found.
        """
        text = self.word
        matched = 0
        node = self.root
        while matched < len(word):
            # no outgoing edge for the next letter: the walk cannot continue
            if word[matched] not in node.children:
                return False

            edge = node.children[word[matched]]
            first, last = edge.interval[0], edge.interval[1]

            # compare the query with the label of this edge
            for position in range(first, last + 1):
                if matched == len(word):
                    return True
                if text[position] != word[matched]:
                    return False
                matched += 1

            node = edge.node

        return True


class Node:
    """Suffix-tree node.

    Keeps the parent node, the dictionary key under which the parent stores
    the edge leading here, the outgoing edges and a link to another node
    (None until assigned).
    """

    def __init__(self, parent, parent_first_letter):
        self._children = {}
        self._link = None

        self._parent = parent
        self._parent_first_letter = parent_first_letter

    @property
    def children(self):
        """Mapping from first letter to outgoing edge."""
        return self._children

    @property
    def parent(self):
        """Node above this one (None for a root)."""
        return self._parent

    @parent.setter
    def parent(self, new_parent):
        self._parent = new_parent

    @property
    def parent_first_letter(self):
        """Key of the incoming edge in the parent's child map."""
        return self._parent_first_letter

    @parent_first_letter.setter
    def parent_first_letter(self, letter):
        self._parent_first_letter = letter

    @property
    def link(self):
        """Linked node, or None when no link has been set."""
        return self._link

    @link.setter
    def link(self, target):
        self._link = target

    def add_child(self, letter, edge):
        """Store ``edge`` under ``letter``, replacing any edge already there."""
        self._children[letter] = edge

    def print(self):
        """Print the child map, then every edge interval followed by its subtree."""
        print(self._children)
        for edge in self._children.values():
            print(edge.interval)
            edge.node.print()


class Edge:
    """Immutable pairing of a label interval with the node the edge leads to."""

    def __init__(self, interval, node):
        self._node = node
        self._interval = interval

    @property
    def node(self):
        """Node at the lower end of the edge."""
        return self._node

    @property
    def interval(self):
        """Label of the edge as given at construction."""
        return self._interval

### Functions

In [18]:
def split_edge(parent_node, top_interval, word):
    """Insert a node on the edge of ``parent_node`` starting at ``top_interval[0]``.

    The upper part of the old edge gets the label ``top_interval`` and ends in
    the new node; the remainder of the old label hangs below the new node and
    leads to the old child, whose parent data are updated to match.
    """
    upper_key = word[top_interval[0]]

    # edge that is being split
    old_edge = parent_node.children[upper_key]

    # new node, reached from the parent through the upper part
    middle = Node(parent_node, upper_key)
    parent_node.children[upper_key] = Edge(top_interval, middle)

    # lower part: from the new node to the former child of the parent
    lower_start = top_interval[1] + 1
    lower_edge = Edge((lower_start, old_edge.interval[1]), old_edge.node)
    middle.add_child(word[lower_start], lower_edge)
    old_edge.node.parent = middle
    old_edge.node.parent_first_letter = word[lower_start]


def graft(node, interval, first_letter):
    """Attach a new node below ``node`` via an edge labelled ``interval``.

    The edge is stored under ``first_letter``; the new node is returned.
    """
    attached = Node(node, first_letter)
    node.add_child(first_letter, Edge(interval, attached))

    return attached


def label_size(label):
    """Number of positions covered by the inclusive pair ``(start, end)``."""
    end = label[1]
    start = label[0]
    return end - start + 1


def fast_find(node, label, word):
    """Descend from ``node`` along ``label`` comparing only edge lengths.

    Only the first letter of each edge is looked up; whole edges are skipped
    while the remaining label is longer than the edge. If the label ends
    exactly at a node, that node is returned. Otherwise the last edge is split
    at the point where the label ends and the node created there is returned.
    """
    current = node
    remaining = label

    edge = current.children[word[remaining[0]]]
    while label_size(remaining) > label_size(edge.interval):
        # consume the whole edge and continue below it
        remaining = (remaining[0] + label_size(edge.interval), remaining[1])
        current = edge.node
        edge = current.children[word[remaining[0]]]

    if label_size(remaining) == label_size(edge.interval):
        return edge.node

    # label ends inside the edge: cut the edge after label_size(remaining) letters
    upper_label = (edge.interval[0], edge.interval[0] + label_size(remaining) - 1)
    split_edge(current, upper_label, word)

    return current.children[word[remaining[0]]].node


def slow_find(node, label, word):
    """Match ``label`` against the tree letter by letter, starting at ``node``.

    ``label`` is an inclusive ``(start, end)`` pair of indices into ``word``.
    The walk follows edges for as long as the letters addressed by the label
    agree with the letters on the edges.

    Returns a pair ``(node, remaining_label)``:
    * when the walk stops at a node that has no edge for the next letter, that
      node and the unmatched rest of the label;
    * when the first mismatch falls inside an edge, the edge is split there
      with ``split_edge`` and the node created by the split is returned with
      the unmatched rest of the label.

    NOTE(review): the label position is never checked against the end of
    ``word``; the loop relies on a mismatch occurring first (e.g. a unique
    final letter), otherwise indexing ``word`` raises IndexError.
    """
    curr_node = node
    curr_label_letter = label[0]

    # nothing can be matched from this node at all
    if not word[curr_label_letter] in curr_node.children:
        return curr_node, label

    child_edge = curr_node.children[word[curr_label_letter]]
    curr_edge_letter = child_edge.interval[0]

    while word[curr_label_letter] == word[curr_edge_letter]:
        if curr_edge_letter == child_edge.interval[1]:
            # the whole edge matched: move to the node below it
            curr_node = child_edge.node
            curr_label_letter += 1
            if not word[curr_label_letter] in curr_node.children:
                return curr_node, (curr_label_letter, label[1])

            child_edge = curr_node.children[word[curr_label_letter]]
            curr_edge_letter = child_edge.interval[0]
        else:
            # still inside the edge: advance both positions
            curr_label_letter += 1
            curr_edge_letter += 1

    # mismatch inside the edge: everything before curr_edge_letter matched
    new_node_top_interval = (child_edge.interval[0], curr_edge_letter - 1)
    split_edge(curr_node, new_node_top_interval, word)

    left_label = (curr_label_letter, label[1])
    # the edge stored under the same first letter now ends in the new node
    return curr_node.children[word[child_edge.interval[0]]].node, left_label


def mccreight(word):
    """Build a ``Tree`` (suffix tree) for ``word`` using node links.

    The whole word is attached to the root first. For every following suffix
    the function starts from the node the previous leaf was attached to
    (``head``) instead of the root: it jumps through the link of the head's
    parent (or to the root), re-descends with ``fast_find`` along the label of
    the edge leading to the head, finishes the search with ``slow_find`` and
    grafts a leaf for the part of the suffix that is not in the tree yet.

    ``word`` must be non-empty (``word[0]`` is read).
    NOTE(review): ``slow_find`` does no bounds checking, so the input
    presumably has to end with a letter that occurs nowhere else (such as the
    "$" used in the examples) - confirm.
    """
    tree = Tree(word)
    head = tree.root
    node = tree.root
    # first suffix: the whole word as a single edge below the root
    leaf = graft(node, (0, len(word)-1), word[0])

    for i in range(1, len(word)):
        # by default the whole suffix word[i:] still has to be matched
        left_label = (i, len(word)-1)
        if head == tree.root:
            node = tree.root
        else:
            # label of the edge between the previous head and its parent
            to_head_label = head.parent.children[head.parent_first_letter].interval

            if head.parent == tree.root:
                # starting from the root: drop the first letter of the label
                to_head_label = (to_head_label[0] + 1, to_head_label[1])

                node = tree.root
            else:
                node = head.parent.link

            # an empty label (end < start) means there is nothing to skip
            if to_head_label[1] >= to_head_label[0]:
                node = fast_find(node, to_head_label, word)
            # the part still to be matched starts where the previous leaf's edge starts
            left_label = (leaf.parent.children[leaf.parent_first_letter].interval[0], left_label[1])

        last_head = head
        head, left_label = slow_find(node, left_label, word)
        # the node the search was resumed from becomes the link of the previous head
        last_head.link = node
        leaf = graft(head, left_label, word[left_label[0]])

    return tree

<a id='Tests'></a>
## Tests

### Useful functions

In [19]:
def test(word, struct):
    """Build the chosen structure for ``word`` and check it with random queries.

    Args:
        word: the text to index, or a path ending in ".txt"; in the latter case
            the file is read and "$" is appended to its contents.
        struct: "trie", "simple McCreight" or "McCreight".

    Returns:
        True when 1000 random substrings of the text are all found, False (after
        printing a message) as soon as one of them is missing.

    Raises:
        NameError: if ``struct`` is not one of the supported names.
    """
    # printing name of algorithm
    if struct == "trie":
        print("Using trie")
        algorithm = make_suffix_trie
    elif struct == "simple McCreight":
        print("Using simplified McCreight algorithm")
        algorithm = simple_mccreight
    elif struct == "McCreight":
        print("Using McCreight algorithm")
        algorithm = mccreight
    else:
        raise NameError("Invalid algorithm")

    # a name ending with ".txt" is a file: the text to index is read from it
    text_name = word
    is_file = word.endswith(".txt")
    if is_file:
        with open(word) as file:
            word = file.read()
        word += "$"  # marker at the end of the text

    # running algorithm
    start_time = time()
    tree = algorithm(word)
    time_of_running = time() - start_time

    # test: 1000 random words which must be in the tree
    for _ in range(1000):
        start = random.randint(0, len(word)-1)
        end = random.randint(start, len(word))
        test_word = word[start:end]
        if not tree.find(test_word):
            # report the file by name instead of dumping its whole contents
            if is_file:
                print(struct + " for text \"" + text_name + "\" is incorrect")
            else:
                print(struct + " for word " + word + " is incorrect")
            print("missing substring:", repr(test_word))
            return False

    # test: 1000 completely random words, it is possible that some of them are in the tree
    # words which are in the tree will be printed
    max_length = math.ceil(min(math.sqrt(len(word)) + 2, 12))
    found_words = set()
    for _ in range(1000):
        length = random.randint(1, max_length)
        test_word = "".join(random.choice(string.ascii_letters) for _letter in range(length))
        if tree.find(test_word):
            found_words.add(test_word)

    # in such place we know that algorithm is correct for this set of tests
    print("random words found in tree:", found_words)
    if is_file:
        print(struct + " for text \"" + text_name + "\" is correct")
    else:
        print(struct + " for word \"" + word + "\" is correct")

    print("time of running:", time_of_running, "\n\n")
    return True


def compare_algorithms(word, with_trie=True):
    """Run ``test`` on ``word`` for each structure, the trie only if ``with_trie``."""
    structures = ["simple McCreight", "McCreight"]
    if with_trie:
        structures.insert(0, "trie")

    for struct in structures:
        test(word, struct)

### Examples

Funkcja testująca sprawdza 1000 losowych słów, które powinny znaleźć się w drzewach. Jeśli którekolwiek z nich nie zostanie znalezione, wypisywany jest komunikat: "tree for word {słowo} is incorrect". W przeciwnym wypadku: "tree for word {słowo} is correct".

Następnie losowanych jest 1000 całkowicie losowych słów. Większość z nich najprawdopodobniej nie powinna znaleźć się w drzewie. Jeśli jednak takie słowo zostanie znalezione, jest wypisywane po komunikacie: "random words found in tree: ". Można wtedy sprawdzić, czy znalezione zostało jakieś niewłaściwe słowo, co świadczyłoby o niepoprawności algorytmu (zazwyczaj wypisywanych jest bardzo mało słów).

In [20]:
# One letter repeated three times, followed by the "$" end marker.
word = "bbb$"
compare_algorithms(word)

Using trie
random words found in tree: {'b'}
trie for word "bbb$" is correct
time of running: 0.0002837181091308594 


Using simplified McCreight algorithm
random words found in tree: {'b'}
simple McCreight for word "bbb$" is correct
time of running: 5.412101745605469e-05 


Using McCreight algorithm
random words found in tree: {'b'}
McCreight for word "bbb$" is correct
time of running: 4.887580871582031e-05 




In [21]:
# Repeated letters with a final letter ("d") that occurs only once.
word = "aabbabd"
compare_algorithms(word)

Using trie
random words found in tree: {'d', 'b', 'a'}
trie for word "aabbabd" is correct
time of running: 0.0004901885986328125 


Using simplified McCreight algorithm
random words found in tree: {'d', 'b', 'a'}
simple McCreight for word "aabbabd" is correct
time of running: 5.14984130859375e-05 


Using McCreight algorithm
random words found in tree: {'d', 'b', 'ba', 'a'}
McCreight for word "aabbabd" is correct
time of running: 6.246566772460938e-05 




In [22]:
# The block "ab" occurs twice; the final letter ("d") occurs only once.
word = "ababcd"
compare_algorithms(word)

Using trie
random words found in tree: {'c', 'd', 'b', 'a'}
trie for word "ababcd" is correct
time of running: 0.00022411346435546875 


Using simplified McCreight algorithm
random words found in tree: {'c', 'd', 'b', 'a'}
simple McCreight for word "ababcd" is correct
time of running: 5.2928924560546875e-05 


Using McCreight algorithm
random words found in tree: {'c', 'd', 'b', 'a'}
McCreight for word "ababcd" is correct
time of running: 4.673004150390625e-05 




In [23]:
# "bc" occurs twice and "c" three times; the final letter ("d") occurs only once.
word = "abcbccd"
compare_algorithms(word)

Using trie
random words found in tree: {'c', 'd', 'b', 'a'}
trie for word "abcbccd" is correct
time of running: 0.0002677440643310547 


Using simplified McCreight algorithm
random words found in tree: {'c', 'd', 'b', 'a'}
simple McCreight for word "abcbccd" is correct
time of running: 6.794929504394531e-05 


Using McCreight algorithm
random words found in tree: {'b', 'd', 'cc', 'a', 'c'}
McCreight for word "abcbccd" is correct
time of running: 5.7697296142578125e-05 




In [24]:
# A shortened version of the act is used here because the trie algorithm
# could not process the full file in a reasonable time.
# The ".txt" suffix makes test() read the text from the file.
file = "short_act_1997_714.txt"
compare_algorithms(file)

Using trie
random words found in tree: {'os', 'no', 'S', 'f', 'te', 'R', 'U', 'P', 'h', 'A', 'W', 'p', 'T', 'bu', 'r', 'j', 'do', 'N', 'z', 'g', 'i', 'b', 'cj', 'n', 'O', 'ze', 'a', 'c', 'ob', 'y', 'e', 's', 'u', 'in', 'je'}
trie for text "short_act_1997_714.txt" is correct
time of running: 13.014437198638916 


Using simplified McCreight algorithm
random words found in tree: {'t', 'S', 'w', 'l', 'jo', 'f', 'R', 'P', 'h', 'm', 'A', 'id', 'p', 'T', 'r', 'j', 'N', 'D', 'i', 'z', 'b', 'ca', 'n', 'O', 'ni', 'a', 'y', 'o', 'e', 's', 'u', 'ow', 'iz'}
simple McCreight for text "short_act_1997_714.txt" is correct
time of running: 0.033220767974853516 


Using McCreight algorithm
random words found in tree: {'t', 'S', 'f', 'R', 'U', 'P', 'h', 'm', 'pa', 'W', 'T', 'r', 'j', 'N', 'D', 'i', 'b', 'az', 'O', 'a', 'y', 'o', 'e', 's', 'u', 'Ar', 'tko'}
McCreight for text "short_act_1997_714.txt" is correct
time of running: 0.02283024787902832 




In [25]:
# Full act file; with_trie=False skips the suffix-trie run.
file = "act_1997_714.txt"
compare_algorithms(file, with_trie=False)

Using simplified McCreight algorithm
random words found in tree: {'t', 'tr', 'w', 'V', 'J', 'M', 'l', 'G', 'f', 'jo', 'R', 'U', 'P', 'F', 'yjn', 'h', 'fl', 'A', 'WA', 'm', 'dze', 'k', 'E', 'W', 'La', 'p', 'd', 'B', 'x', 'Y', 'yg', 'N', 'i', 'z', 'L', 'Z', 'g', 'b', 'az', 'VI', 'las', 'O', 'n', 'sh', 'a', 'c', 'iu', 'y', 'o', 's', 'u', 'C', 'lin', 'mb', 'hy', 'du', 'X', 'ae', 'K', 'ne', 'I'}
simple McCreight for text "act_1997_714.txt" is correct
time of running: 12.708831310272217 


Using McCreight algorithm
random words found in tree: {'t', 'S', 'if', 'w', 'V', 'M', 'l', 'G', 'f', 'R', 'nu', 'P', 'gi', 'm', 'A', 'mos', 'k', 'tn', 'E', 'W', 'p', 'd', 'em', 'H', 'B', 'Y', 'r', 'ys', 'yt', 'N', 'D', 'z', 'L', 'g', 'i', 'b', 'n', 'KI', 'O', 'a', 'c', 'mu', 'y', 'o', 'e', 'TU', 'u', 'C', 'Pu', 'BJ', 'X', 'tp', 'by', 'K', 'bi', 'I'}
McCreight for text "act_1997_714.txt" is correct
time of running: 7.048904180526733 


