# substring-generator

> Utilities for generating substrings from a body of text.

In [None]:
#| default_exp common.substring_generator

In [None]:
# | hide
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from fastcore.test import *

In [None]:
#| export
from collections import OrderedDict
from typing import Sequence

In [None]:
#| export
class SubstringGenerator:
    """Sliding-window view over a string.

    Iterating yields every contiguous substring of `text` that is exactly
    `substring_length` characters long, moving left to right one position
    at a time. Repeated substrings are yielded each time they occur, and
    every call to `iter()` starts a fresh pass from the beginning.

    Raises:
        ValueError: If `substring_length` is larger than `len(text)`, or
            if `substring_length` is smaller than 1.
    """

    def __init__(self, text: str, substring_length: int):
        # The "window wider than the text" check is evaluated first, then
        # the lower bound on the window width.
        if substring_length > len(text):
            raise ValueError("Text length must be greater than or equal to substring length.")
        if substring_length < 1:
            raise ValueError("Substring length must be greater than or equal to 1.")

        self.substring_length = substring_length
        self.text = text

    def __len__(self):
        """Return how many substrings a full iteration produces."""
        last_start = len(self.text) - self.substring_length
        return last_start + 1

    def __iter__(self):
        """Yield each window, starting at index 0 and advancing by one."""
        window_count = len(self.text) - self.substring_length + 1
        for start in range(window_count):
            stop = start + self.substring_length
            yield self.text[start:stop]

In [None]:
# Tests for SubstringGenerator: constructor validation, len() and iteration.

# A substring length greater than the text length (4 > 3) must be rejected.
with ExceptionExpected(ValueError):
    SubstringGenerator("abc", 4)

# Substring lengths below 1 must be rejected: zero...
with ExceptionExpected(ValueError):
    SubstringGenerator("abc", 0)

# ...and negative values.
with ExceptionExpected(ValueError):
    SubstringGenerator("abc", -1)

# Valid substring length: a 4-character text with length-2 windows gives
# 4 - 2 + 1 = 3 substrings, yielded left to right.
sg = SubstringGenerator("abcd", 2)
test_eq(len(sg), 3)
test_eq(list(sg), ["ab", "bc", "cd"])

In [None]:
#| export
def all_unique_substrings(text: str, substring_length: int) -> Sequence[str]:
    """Collect the distinct substrings of `text` that have a given length.

    Each distinct substring appears exactly once in the returned list, at
    the position given by its first occurrence in `text`. Arguments are
    validated by `SubstringGenerator`, so its `ValueError` propagates when
    `substring_length` is larger than `len(text)` or smaller than 1.
    """
    windows = SubstringGenerator(text, substring_length)
    # fromkeys inserts each key the first time it is seen and leaves its
    # position untouched on repeats, so key order is first-occurrence order.
    first_seen = OrderedDict.fromkeys(windows)
    return list(first_seen)

In [None]:
# Tests for all_unique_substrings: de-duplication and ordering.

# No duplicates: every length-2 window of "abc" is already distinct.
test_eq(all_unique_substrings("abc", 2), ["ab", "bc"])

# Duplicates are removed and appear in order of first occurrence:
# the windows of "abcab" are "ab", "bc", "ca", "ab"; the second "ab" is dropped.
test_eq(all_unique_substrings("abcab", 2), ["ab", "bc", "ca"])

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()