# text

> Text document display class.
- toc: false
- page-layout: full

In [None]:
#| default_exp text

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from fastcore.basics import patch
import numpy as np
from IPython.display import display, HTML
import polars as pl

In [None]:
#| hide
from conc.corpus import Corpus
import os

In [None]:
#| export
from conc.result import Result

In [None]:
#| hide
# Resolve the home directory with expanduser rather than reading HOME directly:
# os.environ.get("HOME") is None when the variable is unset, which would
# silently produce paths starting with the literal text 'None'.
home_dir = os.path.expanduser('~')
source_path = f'{home_dir}/data/'
save_path = f'{home_dir}/data/conc-test-corpora/'

# Locations of the corpora used by the hidden test cells in this notebook.
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'
path_to_gardenparty_corpus = f'{save_path}garden-party.corpus'

In [None]:
#| export
class Text:
    """ Class to represent text documents """
    def __init__(self,
                 tokens: np.ndarray, # array of token strs
                 has_spaces: np.ndarray, # whether each token str is followed by a space
                 metadata: dict|None = None # metadata for doc, if None an empty dict is used
                 ):
        self.tokens = tokens
        self.has_spaces = has_spaces
        # Build a fresh dict per instance. A `{}` default in the signature is a
        # single object shared by every Text created without metadata, so a
        # change made through one instance would show up in all the others.
        # NOTE(review): display_metadata calls `.transpose(...)` on this value,
        # which a plain dict does not provide - confirm the type callers pass.
        self.metadata = {} if metadata is None else metadata

In [None]:
#| export
@patch
def _nl2br(self:Text,
           text:str # document text
           ):
    """ Convert line endings to newlines and insert an HTML line break before each newline """
    normalised = text
    # Windows endings first, so a '\r\n' pair collapses to a single newline
    # before lone carriage returns are handled.
    for line_ending in ('\r\n', '\r'):
        normalised = normalised.replace(line_ending, '\n')
    return normalised.replace('\n', '<br>\n')

In [None]:
#| export
@patch
def _div(self:Text,
         text:str, # document text
         class_str:str = '' # div class
         ):
    """ Return text enclosed in a div element, with a class attribute when class_str is not empty """
    class_attribute = f' class="{class_str}"' if class_str != '' else ''
    return f'<div{class_attribute}>{text}</div>'

In [None]:
#| export
@patch
def as_string(self:Text,
              max_tokens: int|None = None # maximum length of text to display in tokens, if None, display all
        ):
    """ Return the text as a string """

    interleaved = np.empty((self.tokens.size + self.has_spaces.size,), dtype=self.tokens.dtype)
    interleaved[0::2] = self.tokens
    interleaved[1::2] = np.where(self.has_spaces, ' ', '')

    if max_tokens is not None and self.tokens.size > max_tokens:
        interleaved = interleaved[:max_tokens * 2]
        interleaved[-1] = ''

    return ''.join(list(interleaved))

In [None]:
#| export
@patch
def as_tokens(self:Text,
        ):
    """ Return the text as a tokens """

    return list(self.tokens)

In [None]:
#| export
@patch
def __str__(self:Text):
    return self.as_string()

In [None]:
#| export
@patch
def tokens_count(self:Text):
    return len(self.tokens)

In [None]:
#| export
@patch
def display_metadata(self:Text,
                ):
    """ Output the metadata for a text """

    Result('metadata', self.metadata.transpose(include_header = True, header_name = 'attribute', column_names = ['value']), 'Metadata', '', {}, []).display()


In [None]:
#| export
@patch
def display(self:Text,
			show_metadata: bool = True, # whether to display Metadata for the text
			max_tokens: int|None = None # maximum length of text to display in tokens, if None, display all
				):
	""" Output a text """
	style = '<style>.conc-text {white-space: pre-wrap;}</style>\n'
	if show_metadata:
		self.display_metadata()

	text_string = self.as_string(max_tokens = max_tokens)

	if max_tokens is not None and self.tokens.size > max_tokens:
		text_string += f'… [{max_tokens} of {self.tokens.size} tokens]'

	display(HTML(style + self._div(text_string, class_str = 'conc-text')))


In [None]:
#| hide
# Load the test corpora from the paths set up earlier in this notebook.
# Later cells call _get_text / text on the values returned by load.
toy = Corpus().load(path_to_toy_corpus)
brown = Corpus().load(path_to_brown_corpus)
gardenparty = Corpus().load(path_to_gardenparty_corpus)

In [None]:
#| hide
# Build a Text by unpacking what the toy corpus returns for document 1
# straight into the constructor.
text = Text(*toy._get_text(1))
# str() and as_string() must agree, and both must reproduce the sentence.
assert str(text) == 'The cat sat on the mat.'
assert text.as_string() == 'The cat sat on the mat.'
# The sentence is expected to be stored as 7 tokens.
assert text.tokens_count() == 7

In [None]:
#| hide
text.display()

Metadata,Metadata
Attribute,Value
document_id,1
source,1.txt


In [None]:
#| hide
# testing on more complex text with spaces
# Read the source file inside a context manager so the handle is closed
# after the comparison instead of being left open.
with open(f'{source_path}brown/ca01.txt', encoding = 'utf8') as source_file:
    assert brown.text(1).as_string() == source_file.read()

In [None]:
gardenparty.text(12).display(max_tokens = 200)

Metadata,Metadata
Attribute,Value
document_id,12
file,the-singing-lesson.txt


In [None]:
gardenparty.text(12).display(show_metadata = False, max_tokens = 200)

In [None]:
gardenparty.text(12).as_string(max_tokens = 50)

'With despair—cold, sharp despair—buried deep in her heart like a wicked\r\nknife, Miss Meadows, in cap and gown and carrying a little baton, trod\r\nthe cold corridors that led to the music hall. Girls of all ages, rosy'

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()