In [1]:
import numpy as np
from typing import List, Dict, Tuple
import tempfile
import subprocess

from IPython.display import Image, display

### Initialization of datasets

In [2]:
# Hand-made inputs; every string ends with the terminator "$".
datasets: List[str] = ["bbb$", "aabbabd$", "ababcd$", "abcbccd$"]

with open("1997_714_head.txt", "r") as file:
    text: str = file.read()
    text += "$"
    # Only the first fifth of the document is used as a dataset. That slice is
    # always shorter than `text`, so it can never contain the "$" appended
    # above; the terminator is re-attached here so this dataset ends with "$"
    # like the hand-made ones.
    datasets.append(text[:len(text)//5] + "$")

### Trie class

In [3]:
class Node:
    """Single vertex of the trie.

    Outgoing edges live in ``children``, a mapping from one character to the
    vertex that edge leads to.
    """

    # Class-level attribute shared by all instances; nothing in the visible
    # code reads or updates it.
    counter = 1

    def __init__(self):
        self.children: Dict[str, "Node"] = dict()

class Trie:
    """Uncompressed suffix trie: every suffix of the text is stored as a path
    of single-character edges starting at ``root``."""

    def __init__(self, text: str) -> None:
        """Create an empty trie and insert each suffix ``text[k:]`` into it."""
        self.root: Node = Node()

        for begin in range(len(text)):
            self.append(text[begin:])

    def append(self, suffix: str) -> None:
        """Insert ``suffix`` below the root, creating only the missing vertices."""
        node: Node = self.root
        for symbol in suffix:
            successor = node.children.get(symbol)
            if successor is None:
                # No edge for this character yet: create it and step onto it.
                successor = Node()
                node.children[symbol] = successor
            node = successor

    def match(self, word: str) -> bool:
        """Return True when ``word`` can be read along edges starting at the
        root, i.e. when it is a prefix of some inserted suffix."""
        node = self.root
        for symbol in word:
            node = node.children.get(symbol)
            if node is None:
                return False
        return True
            

In [5]:
# Sanity check of Trie.match on the hand-made datasets.
# A Test pairs a query word with the answer Trie.match must give for it.
Test = Tuple[str, bool]

# tests[k] is the suite for datasets[k]:
# ["bbb$", "aabbabd$", "ababcd$", "abcbccd$"]
tests: List[List[Test]] = [
    [
        ('bb', True), ('bbb', True), ('bbbb', False), ('a', False),
    ],
    [
        ('aabb', True), ('bbab', True), ('d', True), ('aabbb', False), ('ad', False),
    ],
    [
        ('ab', True), ('ababcd', True), ('bcd', True), ('aa', False), ('da', False),
    ],
    [
        ('ab', True), ('cbcc', True), ('ccd', True), ('abcd', False), ('ccdd', False),
    ],
]

word: str
result: bool
# The last dataset (the text read from the file) has no suite, hence [:-1].
for ds, test_suite in zip(datasets[:-1], tests):
    print(ds)
    trie = Trie(ds)
    for word, result in test_suite:
        assert trie.match(word) == result
    print("OK")

bbb$
OK
aabbabd$
OK
ababcd$
OK
abcbccd$
OK


### Suffix tree

In [3]:
class SuffixNode:
    """A suffix-tree vertex together with the label of the edge leading to it.

    The label is stored in one of two ways:

    * as the half-open slice ``text[start:end]`` when both ``start`` and
      ``end`` are set (i.e. differ from -1), or
    * as the literal string ``text_content`` otherwise; a node built with the
      defaults therefore has an empty label.

    Children are kept in two containers: ``letter_children`` maps a single
    character to the child whose label is that character, while
    ``pointer_children`` lists the remaining children.
    """

    def __init__(self, text: str, start: int = -1, end: int = -1, text_content: str = ""):
        # by Python convention, start is inclusive, but end is exclusive
        self.start: int = start
        self.end: int = end
        self.text_content = text_content
        self.text: str = text
        self.letter_children: Dict[str, SuffixNode] = {}
        self.pointer_children: List[SuffixNode] = []

    @property
    def _is_slice(self) -> bool:
        """True when the label is given by ``start``/``end`` rather than ``text_content``."""
        return self.start != -1 and self.end != -1

    @property
    def length(self) -> int:
        """Number of characters in the label.

        Uses the same rule as ``content`` to pick the label source, so a node
        defined by ``text_content`` reports ``len(text_content)`` instead of
        the meaningless ``end - start`` of the two -1 sentinels.
        """
        if self._is_slice:
            return self.end - self.start
        return len(self.text_content)

    @property
    def content(self) -> str:
        """The label as a string."""
        if self._is_slice:
            return self.text[self.start:self.end]
        else:
            return self.text_content

    def string_in_node(self, string: str) -> bool:
        """Tell whether ``string`` and the label agree on their common length.

        If ``string`` is longer than the label, the whole label must be a
        prefix of ``string``; otherwise ``string`` must be a prefix of the
        label.
        """
        # possible optimization: pass pointers to the beginning and end of the
        # string instead of a sliced copy
        label: str = self.content
        if len(string) > len(label):
            return string.startswith(label)

        return label.startswith(string)

    def add_child(self, start: int = -1, end: int = -1, text_content: str = "") -> "SuffixNode":
        """Create a child of this node and return it.

        Args:
            start: inclusive start of the child's label in ``text``; used only
                when ``text_content`` is empty.
            end: exclusive end of the child's label in ``text``; used only
                when ``text_content`` is empty.
            text_content: literal label. A one-character label is registered
                in ``letter_children`` under that character; a longer one goes
                to ``pointer_children``, because ``letter_children`` is looked
                up one character at a time.

        Raises:
            ValueError: if the resulting label would be empty. An empty label
                never consumes input, so a traversal that steps onto it makes
                no progress.
        """
        if text_content != "":
            child = SuffixNode(self.text, text_content=text_content)
        else:
            if start < 0 or end <= start:
                raise ValueError(
                    "add_child needs a non-empty text_content or a range with 0 <= start < end"
                )
            child = SuffixNode(self.text, start=start, end=end)

        if len(text_content) == 1:
            self.letter_children[text_content] = child
        else:
            self.pointer_children.append(child)
        return child
        
        
class SuffixTree:
    """Compressed suffix tree of ``text``.

    The tree is built naively: every suffix is inserted from the root, and an
    edge is split wherever a new suffix diverges in the middle of its label.
    Edge labels of one character are stored in a node's ``letter_children``
    (keyed by that character); longer labels are stored in
    ``pointer_children`` as ``(start, end)`` slices of the text.
    """

    def __init__(self, text: str):
        """Build the tree by inserting ``text[k:]`` for every ``k``."""
        self.text: str = text
        # The root has an empty label; SuffixNode still needs the indexed text.
        self.root: SuffixNode = SuffixNode(text)

        for suffix_index in range(len(text)):
            self.append(text[suffix_index:], suffix_index)

    @staticmethod
    def _label_bounds(node: "SuffixNode") -> Tuple[str, int, int]:
        """Return ``(source, start, end)`` with ``source[start:end]`` being the node's label."""
        if node.start != -1 and node.end != -1:
            return node.text, node.start, min(node.end, len(node.text))
        return node.text_content, 0, len(node.text_content)

    def _find_child(self, node: "SuffixNode", letter: str):
        """Return the child of ``node`` whose label starts with ``letter``, or None."""
        direct = node.letter_children.get(letter)
        if direct is not None:
            return direct
        for candidate in node.pointer_children:
            source, start, end = self._label_bounds(candidate)
            # Compare by index: slicing out the whole label would copy it.
            if start < end and source[start] == letter:
                return candidate
        return None

    def _make_node(self, source: str, start: int, end: int) -> "SuffixNode":
        """Create a childless node labelled ``source[start:end]``."""
        if end - start == 1:
            return SuffixNode(self.text, text_content=source[start])
        if source is self.text:
            return SuffixNode(self.text, start=start, end=end)
        # Label taken from a literal rather than from the indexed text.
        return SuffixNode(self.text, text_content=source[start:end])

    def _attach(self, parent: "SuffixNode", child: "SuffixNode") -> None:
        """Register ``child`` under ``parent`` in the container matching its label length."""
        source, start, end = self._label_bounds(child)
        if end - start == 1:
            parent.letter_children[source[start]] = child
        else:
            parent.pointer_children.append(child)

    def append(self, suffix: str, suffix_index: int) -> None:
        """Insert ``suffix``, which must occur in the text at ``suffix_index``.

        Raises:
            ValueError: if ``suffix`` is not found in the text at
                ``suffix_index``; new edges are stored as positions in the
                text, so a mismatch would corrupt the tree.
        """
        if suffix_index < 0 or not self.text.startswith(suffix, suffix_index):
            raise ValueError("suffix does not occur in the text at suffix_index")

        suffix_end: int = suffix_index + len(suffix)
        node: SuffixNode = self.root
        matched: int = 0  # leading characters of `suffix` already on the path

        while matched < len(suffix):
            child = self._find_child(node, suffix[matched])
            if child is None:
                # Nothing continues with this character: hang the rest as a leaf.
                self._attach(node, self._make_node(self.text, suffix_index + matched, suffix_end))
                return

            # Length of the common prefix of the edge label and the rest of the suffix.
            source, start, end = self._label_bounds(child)
            shared = 0
            while (
                start + shared < end
                and matched + shared < len(suffix)
                and source[start + shared] == suffix[matched + shared]
            ):
                shared += 1
            matched += shared

            if start + shared == end:
                # Whole label consumed: continue below the child.
                node = child
                continue
            if matched == len(suffix):
                # The suffix ends inside this edge, so it is already represented.
                return

            # Divergence inside the label: split the edge into head + tail and
            # hang the remainder of the suffix next to the tail.
            head = self._make_node(source, start, start + shared)
            tail = self._make_node(source, start + shared, end)
            tail.letter_children = child.letter_children
            tail.pointer_children = child.pointer_children

            if node.letter_children.get(source[start]) is child:
                del node.letter_children[source[start]]
            else:
                node.pointer_children.remove(child)

            self._attach(node, head)
            self._attach(head, tail)
            self._attach(head, self._make_node(self.text, suffix_index + matched, suffix_end))
            return

    def match(self, word: str) -> bool:
        """Return True when ``word`` can be read along a path starting at the root."""
        current: SuffixNode = self.root

        i = 0
        while i < len(word):
            child = self._find_child(current, word[i])
            if child is None:
                return False

            source, start, end = self._label_bounds(child)
            span = min(end - start, len(word) - i)
            # span == 0 would mean an empty label, which could never advance i.
            if span == 0 or source[start:start + span] != word[i:i + span]:
                return False

            current = child
            i += span

        return True