# text

> Text document display class.
- toc: false
- page-layout: full

In [None]:
#| default_exp text

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from fastcore.basics import patch
import numpy as np
from IPython.display import display, HTML
import polars as pl
import textwrap
import re

In [None]:
#| hide
from conc.corpus import Corpus
import os

In [None]:
#| export
from conc.result import Result

In [None]:
#| hide
# Resolve the home directory with expanduser rather than reading HOME directly:
# os.environ.get("HOME") is None when the variable is unset, which would produce
# the literal path 'None/data/'. On POSIX expanduser still uses HOME when it is set.
home_dir = os.path.expanduser('~')
source_path = f'{home_dir}/data/'
save_path = f'{home_dir}/data/conc-test-corpora/'

# locations of the test corpora under save_path
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'
path_to_gardenparty_corpus = f'{save_path}garden-party.corpus'

## Using the Text class

The Text class is not intended to be used directly. Functionality is accessible via the `Corpus.text` method, which provides the necessary inputs to instantiate the class. There are examples below illustrating how Text objects can be created and used for a Corpus.

## Text class API reference

In [None]:
#| export
class Text:
    """ Class to represent text documents """
    def __init__(self,
                 tokens:np.ndarray, # list of token strs
                 has_spaces: np.ndarray, # whether token strs followed by space
                 metadata: dict|None = None, # metadata for doc, if None an empty dict is stored - NOTE(review): get_metadata calls .transpose(...) on this value, so a DataFrame-like object seems to be expected - confirm
                 doc_df: pl.DataFrame = None # if provided can be used for enhanced display (e.g. keyword highlighting) - NOTE(review): the position helpers call .collect() on it, which suggests a lazy frame - confirm
                 ):
        self.tokens = tokens
        self.has_spaces = has_spaces
        # Create the empty dict per instance. A `{}` default in the signature is a single
        # object shared by every Text built without metadata, so a mutation on one
        # instance would show up on all the others.
        self.metadata = {} if metadata is None else metadata
        self.doc_df = doc_df

In [None]:
#| export
@patch
def _nl2br(self:Text,
           text:str # document text
           ):
    """ Normalise line endings to newlines and put an HTML line break before each one """
    # Windows (\r\n) pairs are converted first so their \r is not counted as a separate line ending
    unix_text = text.replace('\r\n', '\n').replace('\r', '\n')
    return '<br>\n'.join(unix_text.split('\n'))

In [None]:
#| export
@patch
def _div(self:Text,
         text:str, # document text
         class_str:str = '' # div class
         ):
    """ Wrap text in a div element, adding a class attribute only when class_str is not empty """
    class_attribute = '' if class_str == '' else f' class="{class_str}"'
    return f'<div{class_attribute}>{text}</div>'

In [None]:
#| exporti
@patch
def corpus_position_to_doc_position(self:Text,
                                      pos:int # position in corpus
                                      ) -> int:
    """ Convert a corpus position to the row index of that token within this document """

    # the row index is assigned over every row of doc_df, before the not_space filter is applied
    indexed_rows = self.doc_df.with_row_index('doc_position')
    is_requested_token = (pl.col('position') == pos) & (pl.col('not_space') == 1)
    matching_row = indexed_rows.filter(is_requested_token).select(pl.col('doc_position'))
    # .item() needs the query to produce exactly one value
    return matching_row.collect().item()



In [None]:
#| exporti
@patch
def doc_position_to_corpus_position(self:Text,
                                      pos:int # position in document
                                      ) -> int:
    """ Convert a position within this document to the matching corpus position """

    # NOTE(review): the row index here is assigned after the not_space filter, whereas
    # corpus_position_to_doc_position assigns it before filtering - confirm both are intended
    non_space_rows = self.doc_df.filter(pl.col('not_space') == 1)
    indexed_rows = non_space_rows.with_row_index('doc_position')
    matching_row = indexed_rows.filter(pl.col('doc_position') == pos).select(pl.col('position'))
    # .item() needs the query to produce exactly one value
    return matching_row.collect().item()

In [None]:
#| export
@patch
def as_string(self:Text,
              max_tokens: int|None = None, # maximum length of text to display in tokens, if None, display all
              highlighted_token_range: tuple|None = None # range of tokens to highlight, note: these token ids are positions within the corpus, not the text itself
        ):
    """ Return the text as a string, optionally truncated to max_tokens and with a token range wrapped in a highlight span """

    tokens = self.tokens
    # highlighting needs doc_df to translate corpus positions; without it the range is ignored
    if self.doc_df is not None and highlighted_token_range is not None:
        doc_pos_start = self.corpus_position_to_doc_position(highlighted_token_range[0])
        doc_pos_end = self.corpus_position_to_doc_position(highlighted_token_range[1])
        # Work on an object-dtype copy: writing the markup into a fixed-width unicode
        # array would silently cut the marked-up token down to the array's item width.
        tokens = self.tokens.astype(object)
        tokens[doc_pos_start] = f'<span class="highlight">{tokens[doc_pos_start]}'
        # applied second so a one-token range gets both the opening and closing tag
        tokens[doc_pos_end] = f'{tokens[doc_pos_end]}</span>'
        # NOTE(review): if max_tokens cuts the text before doc_pos_end the span is left unclosed

    # even slots hold the tokens, odd slots hold the whitespace (or nothing) that follows each token
    interleaved = np.empty((self.tokens.size + self.has_spaces.size,), dtype=object)
    interleaved[0::2] = tokens
    interleaved[1::2] = np.where(self.has_spaces, ' ', '')

    if max_tokens is not None and self.tokens.size > max_tokens:
        interleaved = interleaved[:max_tokens * 2]
        # drop the space after the last kept token; the slice is empty when max_tokens is 0
        if interleaved.size > 0:
            interleaved[-1] = ''

    return ''.join(interleaved)

In [None]:
#| export
@patch
def as_tokens(self:Text,
        ):
    """ Return the tokens of the text as a list """

    return [*self.tokens]

In [None]:
#| export
@patch
def __str__(self:Text):
    """ String form of the text: the output of as_string with its default arguments """
    full_text = self.as_string()
    return full_text

In [None]:
#| export
@patch
def tokens_count(self:Text):
    """ Return the number of tokens in the text """
    token_total = len(self.tokens)
    return token_total

In [None]:
#| export
@patch
def display_metadata(self:Text,
                ):
    """ Display the metadata for a text """

    # get_metadata builds the same 'metadata' Result that was previously constructed inline here
    metadata_result = self.get_metadata()
    metadata_result.display()



In [None]:
#| export
@patch
def get_metadata(self:Text,
                ):
    """ Return the metadata for a text as a Result """

    # NOTE(review): relies on self.metadata offering a polars-style transpose,
    # although __init__ annotates it as dict - confirm against the caller
    attribute_table = self.metadata.transpose(
        include_header = True,
        header_name = 'attribute',
        column_names = ['value'],
    )
    return Result('metadata', attribute_table, 'Metadata', '', {}, [])

In [None]:
#| export
@patch
def display(self:Text,
            show_metadata: bool = True, # whether to display Metadata for the text
            max_tokens: int|None = None, # maximum length of text to display in tokens, if None, display all
            output_html: bool = True, # whether to display text with HTML formatting
            textwrap_width: int|None = None, # maximum length of text to display in characters, if None, no wrapping
            textwrap_args: dict|None = None, # additional args to pass to textwrap.fill
            reflow_paragraphs: bool = False, # whether to reflow paragraphs individually before text wrapping is applied
            paragraph_delimiter_regex: str = r'(\s*\n\s*){1,}\n', # regex to split paragraphs for reflow_paragraphs (default looks for whitespace ending with a newline that contains at least one other newline)
            ):
    """ Output a text, either as styled HTML or as plain printed text """

    # TODO - add font size, font family and style overrides

    # stylesheet emitted ahead of the HTML output; whitespace is written with explicit escapes
    css = (
        '\n'
        '\t<style>\n'
        '\t.conc-text-wrapper { background: #fff; color: #000; border: 1px solid #000;border-radius: 0.5em;width: max-content;padding: 0.3em; min-width: 400px;} \n'
        '\t.conc-text {margin:0.3em; white-space: pre-wrap; font-family: Georgia, Cambria, "Times New Roman", Times, serif;font-size: 1.3em; width: max-content;}\n'
        '  </style>\n'
        '\t'
    )

    metadata_html = self.get_metadata().to_html() if show_metadata else ''

    body = self.as_string(max_tokens = max_tokens)

    # tell the reader when the text has been cut short
    if max_tokens is not None and self.tokens.size > max_tokens:
        body += f'…\n[{max_tokens} of {self.tokens.size} tokens]'

    # re.split also returns the text captured by groups in the delimiter regex, so with
    # the default pattern the separators appear as chunks of their own
    chunks = re.split(paragraph_delimiter_regex, body) if reflow_paragraphs else [body]

    if textwrap_width is not None:
        fill_options = textwrap_args or {}
        chunks = [textwrap.fill(chunk, width = textwrap_width, **fill_options) for chunk in chunks]
    elif reflow_paragraphs:
        # no wrapping requested: collapse each chunk's whitespace runs to single spaces
        chunks = [re.sub(r'\s+', ' ', chunk.strip()) for chunk in chunks]

    body = '\n'.join(chunks)

    if not output_html:
        print(body)
        return

    # NOTE(review): the text is inserted into the HTML without escaping - confirm this is intended
    text_div = self._div(body, class_str = 'conc-text')
    wrapper_div = self._div(metadata_html + text_div, class_str = 'conc-text-wrapper')
    display(HTML(css + wrapper_div))



In [None]:
#| hide
# load the test corpora used by the checks and examples below
toy, brown, gardenparty = (
    Corpus().load(corpus_path)
    for corpus_path in (path_to_toy_corpus, path_to_brown_corpus, path_to_gardenparty_corpus)
)

In [None]:
#| hide
# build a Text directly from the toy corpus and check its string form and token count
text = Text(*toy._get_text(1))
for rendered in (str(text), text.as_string()):
    assert rendered == 'The cat sat on the mat.'
assert text.tokens_count() == 7

In [None]:
#| hide
# render the toy text with the default options (metadata shown, HTML output)
text.display()

Metadata,Metadata
Attribute,Value
document_id,1
source,1.txt
category,feline
species,cat


In [None]:
#| hide
# NOTE:
# The commands below use a Corpus object. Text is the class that Corpus uses to represent text documents.
# Changes made to the Text class above are not reflected below until the module is re-exported, the kernel is restarted and the cells are rerun.

In [None]:
#| hide
# testing on more complex text with spaces
# read the source through a context manager so the file handle is closed after the comparison text is loaded
with open(f'{source_path}brown/ca01.txt', encoding = 'utf8') as source_file:
    expected_text = source_file.read()
assert brown.text(1).as_string() == expected_text

In [None]:
#| hide
# test retrieval of specific token by position with new doc_df
doc_id = 4
pos = 22487
# fetch the text once and use doc_id for both lookups, so the printed token cannot come from a different document
doc_text = gardenparty.text(doc_id)
doc_pos = doc_text.doc_df.with_row_index('doc_position').filter(pl.col('position') == pos).select(pl.col('doc_position')).collect().item()
print(doc_pos, doc_text.tokens[doc_pos])

663 handkerchief


#### Examples

See the note above about accessing this functionality through the [Corpus](https://geoffford.nz/conc/api/corpus.html) class.

In [None]:
# show the document's metadata and its first 200 tokens
gardenparty.text(12).display(max_tokens = 200)

Metadata,Metadata
Attribute,Value
document_id,12
file,the-singing-lesson.txt


In [None]:
# hide the metadata, reflow each paragraph and wrap the first 200 tokens at 100 characters
gardenparty.text(12).display(show_metadata = False, max_tokens = 200, textwrap_width = 100, reflow_paragraphs = True)

In [None]:
# return the first 50 tokens as a plain string
gardenparty.text(12).as_string(max_tokens = 50)

'With despair—cold, sharp despair—buried deep in her heart like a wicked\r\nknife, Miss Meadows, in cap and gown and carrying a little baton, trod\r\nthe cold corridors that led to the music hall. Girls of all ages, rosy'

In [None]:
#| hide
# first 10 rows of the document's doc_df, including the corpus positions used for highlighting
gardenparty.text(12).doc_df.head(10).collect()

position,orth_index,lower_index,token2doc_index,has_spaces,not_space
59174,4216,4264,12,True,1
59175,947,947,12,False,1
59176,4667,4667,12,False,1
59177,1045,1045,12,False,1
59178,874,874,12,True,1
59179,4643,4643,12,True,1
59180,947,947,12,False,1
59181,4667,4667,12,False,1
59182,4889,4889,12,True,1
59183,4332,4332,12,True,1


In [None]:
#| hide
# wrap the tokens from corpus position 59175 to 59177 in a highlight span
gardenparty.text(12).as_string(highlighted_token_range = (59175, 59177))



In [None]:
#| hide
import nbdev; nbdev.nbdev_export()