# text-analysis

> Useful code for analyzing text.

In [None]:
#| default_exp common.text_analysis

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from fastcore.test import *

In [None]:
#| export
from typing import Dict, Iterable, Sequence, Tuple 

In [None]:
#| export 
import torch

In [None]:
#| export
def top_nonzero_tokens(freqs: torch.Tensor, itos: Dict[int, str]) -> Iterable[Tuple[str, float]]:
    """Return `(token, frequency)` pairs for the largest entries of `freqs`.

    The number of pairs equals the number of non-zero entries in `freqs`, and
    the pairs are ordered from the largest frequency to the smallest.

    Args:
        freqs: Per-token frequencies, indexed by token id.
            NOTE(review): assumes a 1-D tensor of non-negative values; with
            negative entries the largest-k selection could include zeros.
        itos: Mapping from token id to the token string.

    Returns:
        A list of `(token, frequency)` tuples; empty when every entry is zero.
    """
    nonzero_count = torch.count_nonzero(freqs).item()
    assert isinstance(nonzero_count, int) # keep mypy happy
    # topk hands back the selected values alongside their indices, so the
    # values can be read from the result instead of re-indexing `freqs`.
    ranked = torch.topk(freqs, k=nonzero_count)
    return [
        (itos[token_id], value)
        for token_id, value in zip(ranked.indices.tolist(), ranked.values.tolist())
    ]

In [None]:
# Tests for top_nonzero_tokens
itos = dict(enumerate('abcd'))

# Each case pairs a frequency vector with the (token, count) list expected
# back: largest count first, zero counts left out.
cases = [
    # All zeros
    ([0, 0, 0, 0], []),
    # All non-zeros
    ([1, 2, 3, 4], [('d', 4), ('c', 3), ('b', 2), ('a', 1)]),
    # Some zeros
    ([0, 2, 0, 4], [('d', 4), ('b', 2)]),
]
for counts, expected in cases:
    freqs = torch.tensor(counts)
    test_eq(top_nonzero_tokens(freqs, itos), expected)

In [None]:
#| export
class SubstringFrequencyAnalysis:
    """Class that performs frequency analysis on a body of text for a set of substrings.

    For every position in `text` where one of `substrs` occurs and is followed
    by at least one more character, the following character is counted.

    Attributes set by the constructor:
        substrs, text, vocab_size, itos, stoi: the constructor arguments.
        s_len: length of the first substring; used as the window size.
        freq_map: maps each substring to a long tensor of length `vocab_size`
            holding how often each token id follows that substring.
        cumulative_freqs: float32 tensor of length `vocab_size` with the
            counts summed over all substrings and divided by their total.
            Stays all zeros when nothing was counted.
        top_tokens: maps each substring to the result of `top_nonzero_tokens`
            for its counts.
        top_tokens_cumulative: result of `top_nonzero_tokens` for
            `cumulative_freqs`.
    """

    def __init__(
        self,
        substrs: Sequence[str],
        text: str,
        vocab_size: int,
        itos: Dict[int, str],
        stoi: Dict[str, int],
    ) -> None:
        """Count the characters that follow each substring in `text`.

        Args:
            substrs: Substrings to look for. The window size is taken from the
                first one; a substring of a different length is kept in
                `freq_map` but can never equal a window, so its counts stay
                zero.
            text: Text to scan.
            vocab_size: Length of every frequency tensor.
            itos: Mapping from token id to character.
            stoi: Mapping from character to token id.

        Raises:
            AssertionError: If `substrs` is empty.
            KeyError: If a character following a match is missing from `stoi`.
        """
        self.substrs = substrs
        self.text = text
        self.vocab_size = vocab_size
        self.itos = itos
        self.stoi = stoi

        # Need at least one string to determine the length
        # and for this to be useful.
        assert len(substrs) > 0
        self.s_len = len(substrs[0])

        # Build frequency map of next characters
        self.freq_map = {
            s: torch.zeros(self.vocab_size, dtype=torch.long) for s in self.substrs
        }

        # The range stops one window short of the end so that every window
        # examined still has a character after it.
        for i in range(len(self.text) - self.s_len):
            s = self.text[i : i + self.s_len]
            if s in self.freq_map:
                next_char = self.text[i + self.s_len]
                self.freq_map[s][self.stoi[next_char]] += 1

        # Compute the normalized cumulative frequencies
        self.cumulative_freqs = torch.zeros(self.vocab_size, dtype=torch.float32)
        for freqs in self.freq_map.values():
            self.cumulative_freqs += freqs.float()
        # Normalize only when something was counted: dividing the all-zero
        # tensor by a zero total would fill it with NaN, and every token
        # would then show up in the cumulative top tokens.
        total = self.cumulative_freqs.sum()
        if total > 0:
            self.cumulative_freqs /= total

        # Figure out the top tokens for each substring
        self.top_tokens = {
            s: top_nonzero_tokens(freqs, self.itos)
            for s, freqs in self.freq_map.items()
        }
        self.top_tokens_cumulative = top_nonzero_tokens(
            self.cumulative_freqs, self.itos
        )

    def print_summary(self) -> None:
        """Print the substrings, their top tokens, and the cumulative top tokens."""
        print(f"Substrings: {', '.join([repr(substr) for substr in self.substrs])}")

        print("Top Tokens for each substring:")
        for s, tokens in self.top_tokens.items():
            # The label column is padded to 2 * s_len + 2 characters.
            print(
                f"{repr(s):>{2*self.s_len+2}}: {', '.join([f'{repr(token):>4} ({freq:>4})' for token, freq in tokens])}"
            )

        print("Cumulative Top Tokens:")
        print(
            ', '.join(
                [
                    f'{repr(token):>4} ({freq:.2f})'
                    for token, freq in self.top_tokens_cumulative
                ]
            )
        )

In [None]:
# Tests for SubstringFrequencyAnalysis
itos = dict(enumerate('abcd'))
stoi = {ch: idx for idx, ch in itos.items()}
text = 'aaabbbcccddd'
substrs = ['aaa', 'bbb', 'ccc', 'ddd']
sfa = SubstringFrequencyAnalysis(
    substrs=substrs, text=text, vocab_size=len(itos), itos=itos, stoi=stoi
)

# Each run of three letters is followed once by the next letter; 'ddd' ends
# the text, so nothing is counted after it.
expected_counts = {
    'aaa': [0, 1, 0, 0],
    'bbb': [0, 0, 1, 0],
    'ccc': [0, 0, 0, 1],
    'ddd': [0, 0, 0, 0],
}
for substr, counts in expected_counts.items():
    test_eq(sfa.freq_map[substr].tolist(), counts)

test_close(sfa.cumulative_freqs.tolist(), [0, 1/3, 1/3, 1/3])

# Only the first three substrings have a follower to report.
for substr, follower in zip(substrs, 'bcd'):
    test_eq(sfa.top_tokens[substr], [(follower, 1)])

In [None]:
#| hide
# Run the nbdev export when this cell executes.
import nbdev

nbdev.nbdev_export()