# text

> Text document display class.

In [None]:
#| default_exp text

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from fastcore.basics import patch
import numpy as np
from IPython.display import display, HTML

In [None]:
#| hide
from conc.corpus import Corpus

In [None]:
#| hide
# Relative paths to saved test corpora; the toy and brown corpora are loaded further down for the hidden checks.
path_to_toy_corpus = '../test-corpora/saved/toy.corpus'
path_to_brown_corpus = '../test-corpora/saved/brown.corpus'
path_to_reuters_corpus = '../test-corpora/saved/reuters.corpus'

In [None]:
#| export
class Text:
    """ A text document held as an array of tokens plus an array of trailing-space flags """
    def __init__(self,
                 tokens:np.ndarray, # list of token strs
                 has_spaces: np.ndarray # whether token strs followed by space
                 ):
        # Both arrays are stored exactly as passed in (no copy, no validation).
        self.tokens, self.has_spaces = tokens, has_spaces


In [None]:
#| export
@patch
def _nl2br(self:Text,
           text:str # document text
           ):
    """ Normalise line endings to \\n and put an HTML <br> before each one """
    # Collapse Windows (\r\n) endings first, then any remaining bare \r.
    for line_ending in ('\r\n', '\r'):
        text = text.replace(line_ending, '\n')
    # Splitting on \n and re-joining with '<br>\n' is the same as replacing each \n.
    return '<br>\n'.join(text.split('\n'))

In [None]:
#| export
@patch
def _div(self:Text,
         text:str, # document text
         class_str:str = '' # div class
         ):
    """ Enclose text in a div element, adding a class attribute when one is given """
    # An empty class string means no attribute at all, not class="".
    class_attr = '' if class_str == '' else f' class="{class_str}"'
    return f'<div{class_attr}>{text}</div>'

In [None]:
#| export
@patch
def as_string(self:Text,
        ):
    """ Rebuild the document string by joining each token with its trailing space, if any """

    # One separator per flag: a single space where the flag is set, otherwise empty.
    separators = np.where(self.has_spaces, ' ', '')

    # Even slots hold the tokens and odd slots hold the separators.
    pieces = np.empty(self.tokens.size + self.has_spaces.size, dtype=self.tokens.dtype)
    pieces[::2] = self.tokens
    pieces[1::2] = separators

    return ''.join(pieces)

In [None]:
#| export
@patch
def as_tokens(self:Text,
        ):
    """ Return the tokens of the text as a list """

    # Unpack the token array into a new list, leaving the stored array untouched.
    return [*self.tokens]

In [None]:
#| export
@patch
def __str__(self:Text):
    """ String form of the text, identical to the result of as_string() """
    rendered = self.as_string()
    return rendered

In [None]:
#| export
@patch
def tokens_count(self:Text):
    """ Return the number of tokens in the text """
    n_tokens = len(self.tokens)
    return n_tokens

In [None]:
#| export
@patch
def display(self:Text,
                ):
    """ Render the text as HTML in the notebook output, preserving its whitespace """
    # Imported locally under an alias: this method is itself named `display`, and the
    # decorated definition rebinds the module-level name `display` to whatever the
    # decorator returns, so the IPython helper cannot be relied on under that name.
    import html
    from IPython.display import HTML, display as ipython_display

    # pre-wrap keeps runs of spaces and line breaks from the document as they are.
    style = '<style>.conc-text {white-space: pre-wrap;}</style>\n'
    # Escape markup characters so document text such as `<b>` or `&` is shown
    # literally instead of being interpreted as HTML.
    body = html.escape(self.as_string(), quote=False)
    ipython_display(HTML(style + self._div(body, class_str = 'conc-text')))


In [None]:
#| hide
# Load the saved corpora used by the hidden checks (only `toy` is used in the cells that follow here).
toy = Corpus().load(path_to_toy_corpus)
brown = Corpus().load(path_to_brown_corpus)

In [None]:
#| hide
# Build a Text from document 0 of the toy corpus, then check its string form and token count.
# NOTE(review): assumes _get_text(0) returns a (tokens, has_spaces) pair matching Text.__init__ — confirm.
text = Text(*toy._get_text(0))
assert str(text) == 'The cat sat on the mat.'
assert text.as_string() == 'The cat sat on the mat.'
assert text.tokens_count() == 7

In [None]:
#| hide
# Smoke check: render the toy document to confirm display() runs without error.
text.display()

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the module named by `default_exp`).
import nbdev; nbdev.nbdev_export()