# corpus

> Create a conc corpus.
- toc: false
- page-layout: full

In [None]:
#| default_exp corpus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import re
import polars as pl
import numpy as np
from great_tables import GT
import os
import glob
import spacy
from spacy.attrs import ORTH, LOWER, SPACY # May extend to POS, TAG, SENT_START, LEMMA
import string
from fastcore.basics import patch
import time
from slugify import slugify
import msgspec # tested against orjson - with validation was faster, without around the same
import unicodedata
import sys

In [None]:
#| hide
import shutil

In [None]:
#| export
from conc import __version__
from conc.core import logger, CorpusMetadata, PAGE_SIZE, EOF_TOKEN_STR, ERR_TOKEN_STR, REPOSITORY_URL, DOCUMENTATION_URL, CITATION_STR, PYPI_URL
from conc.result import Result
from conc.text import Text

In [None]:
#| hide
from conc.core import set_logger_state, spacy_attribute_name

In [None]:
#| exporti
# Polars display settings, applied as a side effect when this module is imported.
# The return value of each call is assigned to polars_conf; nothing in this section reads it afterwards.
polars_conf = pl.Config.set_tbl_hide_column_data_types(True) # do not print column data types in tables
polars_conf = pl.Config.set_tbl_hide_dataframe_shape(True) # do not print the dataframe shape
polars_conf = pl.Config.set_tbl_rows(50) # number of rows shown when printing a table
polars_conf = pl.Config.set_tbl_width_chars(300) # table width in characters
polars_conf = pl.Config.set_fmt_str_lengths(300) # number of characters shown for string values

In [None]:
#| exporti
# one or more consecutive whitespace characters
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
# exactly one character that is neither whitespace nor a word character (\w already covers digits and underscore);
# only the leading ^ negates a character class, any further ^ inside it is a literal caret and would exclude "^" from matching
_RE_PUNCT = re.compile(r"^[^\s\w]$")

In [None]:
#| export
NOT_DOC_TOKEN = -1 # value stored in token2doc_index for positions that do not belong to a document (index headers and end-of-document markers)
INDEX_HEADER_LENGTH = 100 # number of end-of-file tokens padded onto the start and the end of the token index during build

In [None]:
#| export
# Characters Conc treats as punctuation: the ASCII characters in string.punctuation plus every code point
# whose Unicode general category starts with P (punctuation) or Sc (currency symbol).
# The code point range is scanned once, and the result is sorted so the string is identical on every run
# (joining a set directly gives an arbitrary order).
PUNCTUATION_STRINGS = ''.join(sorted(
	set(string.punctuation)
	| {ch for ch in map(chr, range(sys.maxunicode + 1)) if unicodedata.category(ch).startswith(('P', 'Sc'))}
))

In [None]:
#| hide
# Locations used by the notebook tests: downloaded source data and the corpora built from it.
# The home directory is resolved with os.path.expanduser because the HOME environment variable is not set on
# every platform, and formatting a missing value would produce the literal path "None/data/".
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

In [None]:
#| hide
from conc.core import create_toy_corpus_sources, get_garden_party, get_nltk_corpus_sources

In [None]:
#| hide
# files that use test corpora need build_test_corpora here - not needed here
# make sure the working directories exist before any sources are fetched
for required_dir in (source_path, save_path):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

# each source is fetched only when its file is not already present in source_path
for source_file, fetch_source in (
    ('toy.csv.gz', create_toy_corpus_sources),
    ('garden-party-corpus.zip', get_garden_party),
    ('brown.csv.gz', get_nltk_corpus_sources),
):
    if not os.path.exists(f'{source_path}{source_file}'):
        fetch_source(source_path)

## Corpus class

In [None]:
#| export
class Corpus:
	"""Representation of a text corpus, with methods to build, load and save a corpus from a variety of formats and to work with the corpus data."""
	
	def __init__(self, 
				name: str = '', # name of corpus
				description: str = '' # description of corpus
				):
		# information about corpus
		self.name = name
		self.description = description
		self.slug = None

		# conc version that built the corpus
		self.conc_version = None
		
		# paths
		self.corpus_path = None
		self.source_path = None

		# settings
		self.SPACY_MODEL = None
		self.SPACY_MODEL_VERSION = None
		self.SPACY_EOF_TOKEN = None # set as nlp.vocab[EOF_TOKEN_STR].orth in build or through load - EOF_TOKEN_STR starts with space so eof_token can't match anything from corpus
		self.EOF_TOKEN = None

		# special token ids
		self.punct_tokens = None
		self.space_tokens = None

		# metadata for corpus
		self.document_count = None
		self.token_count = None
		self.unique_tokens = None

		self.word_token_count = None
		self.unique_word_tokens = None

		# these two are read by save_corpus_metadata, so they are defined here like the other counts rather than first appearing during the build
		self.punct_token_count = None
		self.space_token_count = None

		self.date_created = None

		# token data
		self.tokens = None

		# lookup to get token string or frequency
		self.vocab = None

		# punct and space positions in token data
		self.puncts = None
		self.spaces = None

		# metadata for each document
		self.metadata = None

		# temporary data used when processing text, not saved to disk
		self.ngram_index = {}
		self.results_cache = {}


## Build and save a corpus

In [None]:
#| exporti
@patch
def _init_spacy_model(self: Corpus,
                model: str = 'en_core_web_sm', # spacy model to use for tokenization
				version: str|None = None # version of spacy model expected, if mismatch will raise a warning
				):
	""" Load the spaCy model into self._nlp with the listed pipeline components disabled. Logs and re-raises OSError if the model cannot be loaded, and logs a warning if version is given and differs from the loaded model's version. """

	try:
		self._nlp = spacy.load(model)
	except OSError:
		logger.error(f'Error loading model {model}. You need to run python -m spacy download YOURMODEL to download the model. See https://spacy.io/models for available models.')
		raise

	# models do not all ship the same components and disabling a name that is not in the pipeline raises, so only the components that are present are disabled
	unused_pipes = ('parser', 'ner', 'lemmatizer', 'tagger', 'senter', 'tok2vec', 'attribute_ruler')
	self._nlp.select_pipes(disable=[pipe for pipe in unused_pipes if pipe in self._nlp.component_names])
	self._nlp.max_length = 10_000_000 # set max length to a large number to avoid issues with long documents
	
	if version is not None and self._nlp.meta['version'] != version:
		logger.warning(f'Spacy model version mismatch: expecting {version}, got {self._nlp.meta["version"]}. This may cause issues with tokenization.')

In [None]:
#| exporti
@patch
def _process_punct_positions(self: Corpus):
	""" Find the vocab ids whose token string consists only of punctuation characters (punct_tokens) and the positions in lower_index where they occur (punct_positions). """

	punct_ids = [token_id for token_id, token_str in self.vocab.items() if token_str.strip(PUNCTUATION_STRINGS) == '']
	self.punct_tokens = np.array(punct_ids)

	# build a boolean mask with isin (faster to retrieve than where), then keep only the matching positions as that is smaller to store
	is_punct = np.isin(self.lower_index, self.punct_tokens)
	self.punct_positions = np.nonzero(is_punct)[0]

In [None]:
#| exporti
@patch
def _process_space_positions(self: Corpus):
	""" Find the vocab ids whose token string is empty once whitespace is stripped (space_tokens) and the positions in lower_index where they occur (space_positions). """

	space_ids = [token_id for token_id, token_str in self.vocab.items() if token_str.strip() == '']
	self.space_tokens = np.array(space_ids)

	# build a boolean mask with isin (faster to retrieve than where), then keep only the matching positions as that is smaller to store
	is_space = np.isin(self.lower_index, self.space_tokens)
	self.space_positions = np.nonzero(is_space)[0]


Conc defines a punctuation token as a token consisting only of punctuation characters. Punctuation characters are defined by combining the Python `string.punctuation` characters with unicode characters categorised as punctuation (i.e. [unicode characters with general category starting with P](https://en.wikipedia.org/wiki/Unicode_character_property#General_Category)) or currency symbols (i.e. [general category Sc](https://en.wikipedia.org/wiki/Unicode_character_property#General_Category)). This means, for example, that the various forms of dashes and quotation marks are identified as punctuation. It also means that any emoticon made up of a sequence of punctuation characters, like :), is treated as a punctuation token. Reporting on punctuation is still possible in Conc reports using the relevant parameters. There are still many unicode symbol characters that are not defined as punctuation by Conc. This may change in future versions of Conc, which may also add the ability to define punctuation strings or exceptions. Any changes will be documented.   

In [None]:
# show how many punctuation characters are defined, then the characters themselves on the next line
print(len(PUNCTUATION_STRINGS), PUNCTUATION_STRINGS, sep='\n')

890
𝪋᛭；❰⁜꫰［『៘𑪚〛𑱁†﹖⸣߿܋｠𑑏⳿゠߾₪𑗒𑁉᥅⸍⹜`𑗇︴༊𑃀෴⸁＠⦖՚꧉֊،𑙠𖺚৳;𑑚৽︽⦕৲꣎•-𖺗᪤𐎟𑩃꧊𑙪⸴༔׳༇𑅵⸷༏⸡⦎§಄⸌𑪟「﹏⁑𑗃؍؉𐽕᳓︿⁖﹜⌉₷𑈹⟮𑜼﹒⹙、|‐‵〔⦊᪪⸆₍︱﹕𑿟࿑܂﹡𑱄𑑋⦏𑈻⹀־:᪠⦅⵰᭝꡵]᠈！᰽𑙣܅‖&⟆❲᪣⁃﹂჻«៙‒𖫵₎𐩖፥‽⸥⸏᳆𑂻𑿿₣﹝៚፡⳺𒑰꧌⸫꧆᠇࠻⸸꙾॰⁕⹄𐬿᚛࿔﹃𑿠៖࿚‚❪࿐𑙩𐬺‰𒿲⸈᪢>︵⁍❨฿’⸾꧄_።⹕⧼܇·–꧟⸿⸀𐮙€/〟꩞᪩྅⸳𑜽༉<𑪡𑙦𖭄᭞𖩮𐫵𑻸｛[￥𞥞₸］}࿓⁉᠀⦃፣﹀࠼﹞𑗍߷⳻︒𑗕₹₾׃𑁋⸵𖬸𐬹‼〽꫟￠᭜+𑗋⁁⁎⌊᚜＂¢𑅂₳᛫₯〈》՟𑙥𐕯𑂾⸛𐾇﹅､¶𑇍¤׆𑑝⁓⹘︷٪𑿞꠸՞꫱𐾈⹛។꛵︳𑁈࠾।𑈽𖬹𖬺၎〘⧽︹𒑲․𑇟𑱂𑗁“੶𑱰。𞲰(⹉⸊꡷〈⦗᜶᠄᯼⸑⸬﹚︲᯿﹠﴾£‱𑙡﹙⸪⸗⁙⳹❵𑩄︾𐾆𑙃﹑@⸄၍\𑂿⹎𑗂𑪛֏₼𑿝）་⸂₮₴⹅：𑱃⸲›⸜⹗꣼࠺︕߹⁛࠴︸❭＿꯫⸘𑗄⸙᪡𑇆⹏¥᠂⁗𑗎￡᳃』⟭𑩂﹉꡶᠃⸱᠁⹚;﹍𑙧⸉𑃁„࡞⸦⸞⸋꓾࠹։‹⁞๚꣸｡𑅃.꡴﹔𐤿,𑩅‶𐮜⟨࠳𝪊𖿢﴿，＄᭟𑓆𐫲𐄂᪫𐩒⸹᭽𑻷︰༼𑗅᰿𐤟⸕⹝⁊࠸‗⟫𖺙܈᳄꘏〚₱#꧅‥︐𐽘⹃‣𝪈⧚⸨᰻﹪𑙢⦓＼𑁇﹇꣏𒿱𞋿·₺₨꩜⹌﹆༅𐾉₽‘⁐︘︑₥﹁𝪉߸᭛⦒༆𑂼𑁊፠𑗑⁌︺⦍⸇𐮛⦘⹋𐩗𑩁𑑌｟᪬٫𒑴𑥅₵︔﹟❳𐄁𑑛⟪﹄𐡗𐫴𑊩⁾⸰⦈₭𑗆⹔⧙𑇝⧛❬⁏𐽗𑪜𖬻૱₩‾⸚꧍⁂᨟৻{𑗏𐩿𖩯௹༌꧋፤₧₢?𑩀𑈸⁝〝༈﹌‑𑈼⳾܌𑙂…𒑳〉꧇܍⦋𐫱〙𑇈༺⟬⦆࠲⸼𑅀𑩆❩︖࠱𐽖𑁌⦐︶𐫰⁚﹐⁽꩝𐩔⸮𐫶⟧〞‸⦔〕₶❴𑗐᠊⸒⸟꛶⁀𑙨𐫳〖؟꙳〗⁘‿𒑱𑱅౷‧⸎⸔𛲟꣺᭾‴𐩓」᳇𑑍𑥆﹛₠⁇﹩％｢»𑗉᠅₦꧈⹍❮꧃𝪇︙᪭᐀𐄀𑙁༒″࠰᱿⟯〉⹒꧞⸐⹈܃׀٭﹗࿙၌〜᰼”﷼𑙤༑⦑꛲′⸧𑥄࠵︼⁋【⁔｣᯾؝⸝𑈺᠆𑑎⦄⁈᰾^｝？𐩐₻꫞𑅁࠶៛៕⃀￦⸩⹆꘎᙮؞⁆᨞﹋・᜵܄፦＃⹊⹂⸺܀⳼⸖⌋𑱱꣹᠉‛⸠꛷%𑗔︓⦌／¿՝❫᪨⦉－𖺘₡᥄¡﹨₤﹣𖬷࿒꓿𐬻٬⸶᳀૰𑅴꤮𐺭꩟₰࠽𞥟꧁︗𐩘〃꘍~།꛴︻=༐❱𑪠᯽𑪢𑇅⸅―⸓၏⧘⹇𑗗᪦𐽙״𐏐༻᛬؛๏፧＇．၊᱾᳁※𑪞﹘᳂⹖𑇛፨⸤𑇇𑨿꥟؋𑇞༎𐬾՛（܁!՜𑗊₿＊⸻܉﹈﹎࠷𐬽⌈'༽𑙫๛؊᪥𑗌《𑜾𐩑𐬼*₲⸽₫𐩕॥𐮚𑗓･⟦⟅𑧢۔𑗈—〰⸢﹊"‷⸃⸭$᭚⦇᭠𑗖꛳܆꤯⁅﹫𑁍𑠻＆༄။‡】܊꧂‟⹁⟩⹓𑙬❯𑚹᳅)


Spacy includes space tokens in the vocab for non-destructive tokenisation. Positions of space tokens are stored so they can be filtered out for analysis and reporting. 

Tokens consisting of only punctuation are defined as punctuation tokens. These can be removed or included in analysis and reporting.

In [None]:
#| exporti
@patch
def _init_build_process(self:Corpus,
						save_path: str, # path to save corpus data 
						):
	""" Record the conc version, derive the slug and corpus directory from the corpus name, and make sure that directory exists. """

	self.conc_version = __version__
	# 'corpus' is dropped from the slug because the directory name already ends in .corpus
	self.slug = slugify(self.name, stopwords=['corpus'])

	corpus_dir = os.path.join(save_path, f'{self.slug}.corpus')
	self.corpus_path = corpus_dir
	os.makedirs(corpus_dir, exist_ok=True)

In [None]:
#| exporti
@patch
def _update_build_process(self: Corpus, 
                           orth_index: list[np.ndarray], # orthographic token ids
                           lower_index: list[np.ndarray], # lower case token ids
                           token2doc_index: list[np.ndarray], # token to document mapping
                           has_spaces: list[np.ndarray], # arrays of whether token has space
                           store_pos: int # current store pos
                           ) -> int: # next store pos
    """ Write in-progress build data to Parquet disk store. Each list of arrays is concatenated into one column and the batch is written to its own build file in the corpus directory. """

    batch_df = pl.DataFrame(
        [np.concatenate(orth_index), np.concatenate(lower_index), np.concatenate(token2doc_index), np.concatenate(has_spaces)],
        schema = [('orth_index', pl.UInt64), ('lower_index', pl.UInt64), ('token2doc_index', pl.Int32), ('has_spaces', pl.Boolean)],
    )
    # The build files are read back together through a build_*.parquet glob. The sequence number is zero-padded so that
    # alphabetical file order equals write order; unpadded, build_10 sorts before build_2 and batches could be read out of order.
    batch_df.write_parquet(f'{self.corpus_path}/build_{store_pos:06d}.parquet')
    return store_pos + 1

NOTE: currently, streaming with either `sink_parquet` or `collect(engine='streaming')` can break the order of the dataframe (not just the order of whole rows, but also the order within specific columns, leading to misaligned data). Streaming is therefore not used for the build. This will be reassessed in the future as the new Polars streaming functionality matures.

In [None]:
#| exporti
@patch
def _complete_build_process(self: Corpus, 
							build_process_cleanup: bool = True # Remove the build files after build is complete, retained for development and testing purposes
							):
	""" Complete the disk-based build to create representation of the corpus.

	Reads the build_*.parquet files in the corpus directory and writes tokens.parquet, spaces.parquet, puncts.parquet
	and vocab.parquet there. Also sets EOF_TOKEN, punct_tokens, space_tokens, the document, token and unique token counts,
	and date_created on the corpus.
	"""

	logger.memory_usage('init', init=True)
	# NOTE(review): the result depends on the build files being read in the order they were written - confirm the glob expansion order still matches once there are ten or more build files
	tokens_df = pl.scan_parquet(f'{self.corpus_path}/build_*.parquet')

	# get unique vocab ids (combining orth and lower) and create new index
	vocab_df = pl.concat([tokens_df.select(pl.col('orth_index').unique().alias('index')), tokens_df.select(pl.col('lower_index').unique().alias('index'))])
	#vocab_df  = combined_df.select(pl.col('index').unique().sort().alias('source_id')).with_row_index('token_id', offset=1) #.collect(engine='streaming')
	# token_id starts at 1 and follows the sorted order of the source ids
	vocab_df = vocab_df.select(pl.col('index').unique().sort().alias('source_id')).with_row_index('token_id', offset=1) 
	logger.memory_usage('collected vocab')

	# combined_df = (combined_df.with_columns(pl.col('index').replace(vocab_df.select(pl.col('source_id'))['source_id'], vocab_df.select(pl.col('token_id'))['token_id']).cast(pl.UInt32)))
	# combined_df = combined_df.with_columns(pl.col('index').cast(pl.UInt32))

	# replace the source ids in orth_index with the new token ids, keeping the row order of the token data
	tokens_df = (
		tokens_df
		.join(vocab_df, left_on="orth_index", right_on="source_id", how="left", maintain_order="left")
		.drop("orth_index")
		.rename({"token_id": "orth_index"})
		.with_columns(pl.col("orth_index").cast(pl.UInt32).alias("orth_index"))
	)

	# same replacement for lower_index
	tokens_df = (
		tokens_df
		.join(vocab_df, left_on="lower_index", right_on="source_id", how="left", maintain_order="left")
		.drop("lower_index")
		.rename({"token_id": "lower_index"})
		.with_columns(pl.col("lower_index").cast(pl.UInt32).alias("lower_index"))
	) # should have aligned data for token2doc_index and has_spaces

	# this is currently the most intensive operation in terms of memory usage - this needs attention - can't use sink_parquet currently or collect(engine='streaming') as it doesn't maintain order (including changing order between two columns)
	tokens_df.select([pl.col('orth_index'), pl.col('lower_index'), pl.col('token2doc_index'), pl.col('has_spaces')]).collect().write_parquet(f'{self.corpus_path}/tokens.parquet') # if using streaming or sink will need to verify ordering - check for maintain_order parameters
	logger.memory_usage('wrote pending tokens to disk')
	tokens_df = pl.scan_parquet(f'{self.corpus_path}/tokens.parquet') # re-read as lazy frame

	# could batch this to reduce memory usage - but leaving for now
	vocab_query = vocab_df.select(pl.col('source_id')).collect().to_numpy().flatten() # get vocab ids as numpy array for faster processing
	vocab = {k:self._nlp.vocab[k].text for k in vocab_query} # get vocab strings from spacy vocab
	token_strs = list(vocab.values())
	vocab_df = vocab_df.with_columns(pl.Series(token_strs).alias('token'))
	del vocab_query
	logger.memory_usage('added vocab strings')

	self.EOF_TOKEN = vocab_df.filter(pl.col('source_id') == self.SPACY_EOF_TOKEN).select(pl.col('token_id')).collect().item() # casting to int for storage
	
	# token_strs is in token_id order and token_id starts at 1, so list position + 1 is the token id
	self.punct_tokens = [(k + 1) for k, v in enumerate(token_strs) if v.strip(PUNCTUATION_STRINGS) == '']
	logger.memory_usage(f'got punct tokens')
	self.space_tokens = [(k + 1) for k, v in enumerate(token_strs) if v.strip() == '']
	logger.memory_usage(f'got space tokens')
	del token_strs

	# new spaces handling
	# position is the row number in the full token data; subtracting adjust_by (the number of space tokens before this one) gives the position relative to the token data with space tokens removed
	spaces_df = tokens_df.with_row_index('position').filter(pl.col('lower_index').is_in(self.space_tokens))
	spaces_df = spaces_df.with_row_index('adjust_by').with_columns((pl.col('position') - pl.col('adjust_by')).alias('corrected'))
	spaces_df = spaces_df.with_columns(pl.col('corrected').alias('position')).drop('adjust_by').drop('corrected')
	spaces_df.collect().write_parquet(f'{self.corpus_path}/spaces.parquet') 
	logger.memory_usage('saved space positions')

	# remove spaces from tokens_df
	# the filtered data is collected in full before tokens.parquet is overwritten
	tokens_df = tokens_df.filter(~pl.col('lower_index').is_in(self.space_tokens))
	tokens_df.collect().write_parquet(f'{self.corpus_path}/tokens.parquet') # if using streaming or sink will need to verify ordering - check for maintain_order parameters
	logger.memory_usage('wrote final tokens to disk')
	tokens_df = pl.scan_parquet(f'{self.corpus_path}/tokens.parquet') # re-read as lazy frame

	# Create LazyFrames for punct_positions
	tokens_df.select(pl.col('lower_index')).with_row_index('position').filter(pl.col('lower_index').is_in(self.punct_tokens)).select('position').sink_parquet(f'{self.corpus_path}/puncts.parquet', maintain_order = True) #.collect(engine='streaming').to_numpy().flatten()
	logger.memory_usage('saved punct positions')

	# get counts from tokens_df
	# the end-of-file token is excluded from both frequency counts
	frequency_lower = tokens_df.filter(pl.col('lower_index') != self.EOF_TOKEN).select(pl.col('lower_index')).group_by('lower_index').agg(pl.count('lower_index').alias('frequency_lower')) #.collect(engine='streaming')
	frequency_orth = tokens_df.filter(pl.col('orth_index') != self.EOF_TOKEN).select(pl.col('orth_index')).group_by('orth_index').agg(pl.count('orth_index').alias('frequency_orth')) #.collect(engine='streaming')
	vocab_df = vocab_df.join(frequency_lower, left_on = 'token_id', right_on = 'lower_index', how='left', maintain_order="left").join(frequency_orth, left_on = 'token_id', right_on = 'orth_index', how='left', maintain_order="left")
	logger.memory_usage('added frequency to vocab')

	# one row per distinct lower_index value, so the row count is the number of unique tokens
	self.unique_tokens = frequency_lower.select(pl.len()).collect(engine='streaming').item() # was len(frequency_lower) before used polars streaming
	logger.memory_usage(f'got unique tokens {self.unique_tokens}')

	del frequency_lower
	del frequency_orth

	# add column for is_punct and is_space based on punct_tokens and space_tokens and token_id
	vocab_df = vocab_df.with_columns((pl.col("token_id").is_in(self.punct_tokens)).alias("is_punct"))
	vocab_df = vocab_df.with_columns((pl.col("token_id").is_in(self.space_tokens)).alias("is_space"))
	vocab_df = vocab_df.sort(by = pl.col('token').str.to_lowercase(), descending = False).with_row_index('tokens_sort_order', offset=1) # leave with no zero for handling of error tokens
	# rank orders the vocab by descending frequency_orth, with tokens that have no frequency last
	vocab_df = vocab_df.drop('source_id').sort(by = pl.col('frequency_orth'), descending = True, nulls_last = True).with_row_index(name='rank', offset=1)
	logger.memory_usage('added is_punct is_space to vocab')

	vocab_df.collect().write_parquet(f'{self.corpus_path}/vocab.parquet') #, maintain_order = True 
	del vocab_df
	logger.memory_usage('wrote vocab to disk')

	#self.document_count = tokens_df.select(pl.col('token2doc_index').filter(pl.col('token2doc_index') != NOT_DOC_TOKEN).unique().count()).collect(engine='streaming').item()
	# the highest document number in token2doc_index is used as the document count
	self.document_count = tokens_df.select(pl.col('token2doc_index')).max().collect().item()
	logger.memory_usage(f'got doc count {self.document_count}')
	# reading now excludes spaces
	input_length = tokens_df.select(pl.len()).collect(engine='streaming').item() # tested vs count - len seems to have slight memory overhead, but more correct (i.e. count only counts non-null)
	logger.memory_usage(f'got input length {input_length} (with eof headers)')

	# adjusting token count for text breaks and headers at start and end of index
	self.token_count = input_length - self.document_count - INDEX_HEADER_LENGTH - INDEX_HEADER_LENGTH 
	logger.memory_usage(f'got token count {self.token_count}')

	self.punct_token_count = pl.scan_parquet(f'{self.corpus_path}/puncts.parquet').select(pl.len()).collect(engine='streaming').item() # may be more efficient to do this prior to disk write
	logger.memory_usage(f'got punct token count ({self.punct_token_count})')
	self.space_token_count = pl.scan_parquet(f'{self.corpus_path}/spaces.parquet').select(pl.len()).collect(engine='streaming').item() # may be more efficient to do this prior to disk write
	logger.memory_usage(f'got space token count ({self.space_token_count})')
	# token_count is taken after space tokens were removed, so only punctuation is subtracted here
	self.word_token_count = self.token_count - self.punct_token_count
	self.unique_word_tokens = self.unique_tokens - len(self.punct_tokens)
	
	self.date_created = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

	if build_process_cleanup:
		for f in glob.glob(f'{self.corpus_path}/build_*.parquet'):
			os.remove(f)
		logger.memory_usage('removed build files')
	
	logger.memory_usage('done')



In [None]:
#| exporti
@patch
def _create_indices(self: Corpus, 
				   orth_index: list[np.ndarray], # list of np arrays of orth token ids 
				   lower_index: list[np.ndarray], # list of np arrays of lower token ids
				   token2doc_index: list[np.ndarray] # list of np arrays of doc ids
				   ):
	""" (Deprecated) Formerly used Numpy to create an in-memory representation of the corpus when the disk-based build process was not used. Always raises DeprecationWarning; the arguments are ignored. """
	
	# the former implementation sat after this unconditional raise and could never run, so it has been removed
	raise DeprecationWarning('This method is deprecated, the current build process uses _complete_build_process instead.')

In [None]:
#| exporti
@patch
def _init_corpus_dataframes(self: Corpus):
	""" Point the vocab, tokens, puncts, spaces and metadata attributes at lazy scans of the matching Parquet files in the corpus directory. Called after build or load. """
	
	# each attribute is backed by the file of the same name
	for table_name in ('vocab', 'tokens', 'puncts', 'spaces', 'metadata'):
		setattr(self, table_name, pl.scan_parquet(f'{self.corpus_path}/{table_name}.parquet'))

In [None]:
#| export
# Markdown template for the README.md written into each corpus directory by save_corpus_metadata.
# The {placeholders} are filled with str.format. Trailing double spaces inside the template are Markdown
# line breaks and must be kept.
README_TEMPLATE = """# {name}

## About

This directory contains a corpus created using the [Conc]({REPOSITORY_URL}) Python library. 

## Corpus Information

{description}

Date created: {date_created}  
Document count: {document_count}  
Token count: {token_count}  
Word token count: {word_token_count}  
Unique tokens: {unique_tokens}  
Unique word tokens: {unique_word_tokens}  
Conc Version Number: {conc_version}  
spaCy model: {SPACY_MODEL}, version {SPACY_MODEL_VERSION}  

## Using this corpus
 
Conc can be installed [via pip]({PYPI_URL}). The [Conc documentation site]({DOCUMENTATION_URL}) 
has tutorials and detailed information to get you started with Conc or to work with the corpus 
data directly.  

## Cite Conc

{CITATION_STR}

"""

In [None]:
#| export
@patch
def save_corpus_metadata(self: Corpus, 
		 ):
	""" Write corpus.json (the corpus attributes encoded as JSON) and README.md (a summary rendered from README_TEMPLATE) to the corpus directory. """
	
	started = time.time()

	# attributes copied from the corpus into the CorpusMetadata record
	metadata_fields = (
		'name', 'description', 'slug', 'conc_version',
		'document_count', 'token_count', 'word_token_count', 'punct_token_count', 'space_token_count',
		'unique_tokens', 'unique_word_tokens', 'date_created',
		'EOF_TOKEN', 'SPACY_EOF_TOKEN', 'SPACY_MODEL', 'SPACY_MODEL_VERSION',
		'punct_tokens', 'space_tokens',
	)
	corpus_metadata = CorpusMetadata(**{field: getattr(self, field) for field in metadata_fields})
	encoded_metadata = msgspec.json.encode(corpus_metadata)

	with open(f'{self.corpus_path}/corpus.json', 'wb') as json_file:
		json_file.write(encoded_metadata)

	# README placeholders that come straight from corpus attributes of the same name
	readme_fields = (
		'name', 'description', 'date_created', 'document_count', 'token_count', 'word_token_count',
		'unique_tokens', 'unique_word_tokens', 'conc_version', 'SPACY_MODEL', 'SPACY_MODEL_VERSION',
	)
	with open(f'{self.corpus_path}/README.md', 'w', encoding='utf-8') as readme_file:
		readme_values = {field: getattr(self, field) for field in readme_fields}
		readme_file.write(README_TEMPLATE.format(
			REPOSITORY_URL=REPOSITORY_URL,
			PYPI_URL=PYPI_URL,
			DOCUMENTATION_URL=DOCUMENTATION_URL,
			CITATION_STR=CITATION_STR,
			**readme_values
		))
		
	elapsed = time.time() - started
	logger.info(f'Saved corpus metadata time: {elapsed:.3f} seconds')

In [None]:
#| export
@patch
def build(self: Corpus, 
		  save_path:str, # directory where corpus will be created, a subdirectory will be automatically created with the corpus content
		  iterator: iter, # iterator of texts
		  model: str='en_core_web_sm', # spacy model to use for tokenisation
		  spacy_batch_size:int=500, # batch size for spacy tokenizer
		  build_process_batch_size:int=5000, # save in-progress build to disk every n docs
		  build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
		  ):
	"""Build a corpus from an iterator of texts.

	Texts are tokenized with spaCy and written to disk in batches. The token index starts and ends with a header of
	INDEX_HEADER_LENGTH end-of-file tokens, and every document is followed by one end-of-file token. Header and
	end-of-file positions are marked with NOT_DOC_TOKEN in token2doc_index; documents are numbered from 1 in iterator order.
	"""

	self._init_spacy_model(model)
	
	self.SPACY_MODEL = model
	self.SPACY_MODEL_VERSION = self._nlp.meta['version']
	self.SPACY_EOF_TOKEN = self._nlp.vocab[EOF_TOKEN_STR].orth
	
	if self.corpus_path is None: # leaving for testing ... this should already be set if build has been initiated in standard way via build_from_csv, build_from_files or whatever other methods are implemented to handle build/imports in future
		self._init_build_process(save_path)
	
	logger.memory_usage('init', init=True)

	start_time = time.time()

	# np.bool_ is used rather than np.bool because the np.bool alias is not available in every NumPy release
	eof_arr = np.array([self.SPACY_EOF_TOKEN], dtype=np.uint64)
	not_doc_arr = np.array([NOT_DOC_TOKEN], dtype=np.int32) # same dtype as the rest of token2doc_index
	has_spaces_eof_arr = np.array([False], dtype=np.bool_)

	# headers added to start and end of index to prevent out of bound issues on searches
	index_header_arr = np.array([self.SPACY_EOF_TOKEN] * INDEX_HEADER_LENGTH, dtype=np.uint64)
	not_doc_header_arr = np.full(INDEX_HEADER_LENGTH, NOT_DOC_TOKEN, dtype=np.int32)
	has_spaces_header_arr = np.zeros(INDEX_HEADER_LENGTH, dtype=np.bool_)

	orth_index = [index_header_arr]
	lower_index = [index_header_arr]
	token2doc_index = [not_doc_header_arr]
	has_spaces = [has_spaces_header_arr]

	store_pos = 0

	doc_order = 1
	for doc in self._nlp.pipe(iterator, batch_size = spacy_batch_size): # was previously using self._nlp.tokenizer.pipe(iterator, batch_size=batch_size): but this is faster, test other options at some point
		orth_index.append(doc.to_array(ORTH))
		orth_index.append(eof_arr)

		lower_index_tmp = doc.to_array(LOWER)
		lower_index.append(lower_index_tmp)
		lower_index.append(eof_arr)

		token2doc_index.append(np.full(len(lower_index_tmp), doc_order, dtype=np.int32))
		token2doc_index.append(not_doc_arr)

		has_spaces.append(doc.to_array(SPACY))
		has_spaces.append(has_spaces_eof_arr)
		doc_order += 1

		# update store every build_process_batch_size docs
		if doc_order % build_process_batch_size == 0:
			store_pos = self._update_build_process(orth_index, lower_index, token2doc_index, has_spaces, store_pos)
			lower_index, orth_index, token2doc_index, has_spaces = [], [], [], []
			logger.memory_usage(f'processed {doc_order} documents')
			
	del iterator
	orth_index.append(index_header_arr)
	lower_index.append(index_header_arr)
	token2doc_index.append(not_doc_header_arr)
	has_spaces.append(has_spaces_header_arr)

	logger.memory_usage(f'Completing build process')
	if save_path is not None:
		# write whatever is left since the last batch, then finish the build from the files on disk
		store_pos = self._update_build_process(orth_index, lower_index, token2doc_index, has_spaces, store_pos)
		lower_index, orth_index, token2doc_index, has_spaces = [], [], [], []
		self._complete_build_process(build_process_cleanup = build_process_cleanup)
	else:
		# deprecated - leaving for now
		self._create_indices(orth_index, lower_index, token2doc_index)

		self.token_count = self.lower_index.shape[0] - self.document_count - len(index_header_arr) - len(index_header_arr) 
		self.unique_tokens = len(self.frequency_lookup)

		self.word_token_count = self.token_count - len(self.punct_positions) - len(self.space_positions)
		self.unique_word_tokens = len(self.frequency_lookup) - len(self.punct_tokens) - len(self.space_tokens)

	del orth_index
	del lower_index
	del token2doc_index
	del has_spaces
	
	logger.memory_usage(f'Completed build process')

	# save corpus metadata
	self.save_corpus_metadata()

	self._init_corpus_dataframes()

	logger.info(f'Build time: {(time.time() - start_time):.3f} seconds')


In [None]:
#| exporti
@patch
def _prepare_files(self: Corpus, 
					source_path: str, # path to folder with text files, path can be a directory, zip or tar/tar.gz file
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf8' # encoding of text files
					):
	"""Prepare text files and metadata for building a corpus. Returns an iterator to get file text for processing.

	This is a generator, so nothing below runs (validation, writing metadata.parquet, reading files) until the first text is requested.
	Without a metadata file the metadata is just the base name of each matching file. With a metadata file, texts are
	yielded in the order of its rows so that token data and metadata stay aligned.
	Raises FileNotFoundError if source_path is not a directory, zip or tar file, if nothing matches file_mask, if the
	metadata file does not exist, or if an archive does not contain a file listed in the metadata.
	"""

	import fnmatch
	import tarfile
	import zipfile

	# allowing import from a directory, zip or tar files; anything else (including a path that does not exist) is rejected
	if os.path.isdir(source_path):
		source_type = 'folder'
		files = glob.glob(os.path.join(source_path, file_mask))
	elif os.path.isfile(source_path) and source_path.endswith('.zip'):
		source_type = 'zip'
		with zipfile.ZipFile(source_path, 'r') as z:
			files = [f for f in z.namelist() if fnmatch.fnmatch(f, file_mask)]
	elif os.path.isfile(source_path) and source_path.endswith(('.tar', '.tar.gz')):
		source_type = 'tar'
		with tarfile.open(source_path, 'r') as t:
			files = [f for f in t.getnames() if fnmatch.fnmatch(f, file_mask)]
	else:
		raise FileNotFoundError(f"Path '{source_path}' is not a directory, zip or tar file")
	
	if not files:
		raise FileNotFoundError(f"No files matching {file_mask} found in '{source_path}'")

	metadata = pl.LazyFrame({metadata_file_column: [os.path.basename(p) for p in files]})

	if metadata_file:
		if not os.path.isfile(metadata_file):
			raise FileNotFoundError(f"Metadata file '{metadata_file}' not found")

		# work on a copy so neither the caller's list nor the shared default argument is modified
		columns = list(metadata_columns)
		if metadata_file_column not in columns:
			columns.insert(0, metadata_file_column)

		metadata = pl.scan_csv(metadata_file).select(columns)
		# reordering files on metadata so token data and metadata aligned
		listed_files = metadata.select(pl.col(metadata_file_column)).collect(engine='streaming').to_numpy().flatten().tolist() # get file names from metadata

		if source_type == 'folder':
			files = [os.path.join(source_path, f) for f in listed_files]
		else:
			# archive members have to be read by their member name, so map each listed file name onto the matching member
			members_by_name = {os.path.basename(f): f for f in files}
			missing = [f for f in listed_files if os.path.basename(f) not in members_by_name]
			if missing:
				raise FileNotFoundError(f"Files listed in metadata file '{metadata_file}' not found in '{source_path}': {missing[:5]}")
			files = [members_by_name[os.path.basename(f)] for f in listed_files]
	
	metadata.sink_parquet(f'{self.corpus_path}/metadata.parquet')

	self.source_path = source_path

	if source_type == 'folder':
		for p in files:
			# read inside a with block so the handle is closed before the text is handed to the consumer
			with open(p, "rb") as fh:
				raw = fh.read()
			yield raw.decode(encoding)
	elif source_type == 'zip':
		with zipfile.ZipFile(source_path, 'r') as z:
			for f in files:
				yield z.read(f).decode(encoding)
	elif source_type == 'tar':
		with tarfile.open(source_path, 'r') as t:
			for f in files:
				with t.extractfile(f) as member:
					raw = member.read()
				yield raw.decode(encoding)
	


In [None]:
#| export
@patch
def build_from_files(self: Corpus,
					source_path: str, # path to folder with text files 
					save_path: str, # path to save corpus
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf-8', # encoding of text files
					model:str='en_core_web_sm', # spacy model to use for tokenisation
					spacy_batch_size:int=1000, # batch size for spacy tokenizer
					build_process_batch_size:int=5000, # save in-progress build to disk every n docs
					build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
					):
	"""Build a corpus from text files: set up the corpus directory, prepare the files and metadata, then run the build. Returns the corpus itself."""
	
	started = time.time()

	# the corpus directory has to exist before the files are prepared, as the metadata is written into it
	self._init_build_process(save_path)
	texts = self._prepare_files(source_path, file_mask, metadata_file, metadata_file_column, metadata_columns, encoding)
	self.build(
		save_path = save_path,
		iterator = texts,
		model = model,
		spacy_batch_size = spacy_batch_size,
		build_process_batch_size = build_process_batch_size,
		build_process_cleanup = build_process_cleanup,
	)

	elapsed = time.time() - started
	logger.info(f'Build from files time: {elapsed:.3f} seconds')

	return self


In [None]:
#| export
@patch
def _prepare_csv(self: Corpus, 
					source_path:str, # path to csv file
					text_column:str='text', # column in csv with text
					metadata_columns:list[str]=[], # list of column names to import from csv
					encoding:str='utf8', # encoding of csv passed to Polars read_csv, see their documentation
					build_process_batch_size:int=5000 # save in-progress build to disk every n rows
					) -> iter: # iterator to return rows for processing
	"""Prepare to import from CSV, including metadata. Returns an iterator to process the text column.

	This is a generator, so the file check, the metadata write and the CSV read only happen once the first text is requested.
	Raises FileNotFoundError if source_path is not a file; a Polars ColumnNotFoundError for a missing column is passed through unchanged.
	"""

	if not os.path.isfile(source_path):
		raise FileNotFoundError(f'Path ({source_path}) is not a file')
	
	# the text column is selected first, so it is always the first value of each row
	selected_columns = [text_column] + metadata_columns
	df = pl.scan_csv(source_path, encoding = encoding).select(selected_columns)

	self.source_path = source_path
	
	metadata_df = df.select(metadata_columns)
	metadata_df.sink_parquet(f'{self.corpus_path}/metadata.parquet')

	collected = df.collect(engine='streaming')
	for slice_df in collected.iter_slices(n_rows=build_process_batch_size):
		for text, *_ in slice_df.iter_rows():
			yield text

In [None]:
#| export
@patch
def build_from_csv(self: Corpus, 
				   source_path:str, # path to csv file
				   save_path: str, # path to save corpus
				   text_column:str='text', # column in csv with text
				   metadata_columns:list[str]=[], # list of column names to import from csv
				   encoding:str='utf8', # encoding of csv passed to Polars read_csv, see their documentation
				   model:str='en_core_web_sm', # spacy model to use for tokenisation
				   spacy_batch_size:int=1000, # batch size for Spacy tokenizer
				   build_process_batch_size:int=5000, # save in-progress build to disk every n docs
				   build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
				   ):
	"""Build a corpus from a csv file and return the corpus itself.

	Initialises the build location under `save_path`, obtains an iterator over the
	text column from `_prepare_csv`, then hands that iterator to `build`.
	The elapsed time is logged at info level.
	"""

	build_started = time.time()

	self._init_build_process(save_path)

	text_iterator = self._prepare_csv(
		source_path = source_path,
		text_column = text_column,
		metadata_columns = metadata_columns,
		encoding = encoding,
		build_process_batch_size = build_process_batch_size,
	)

	self.build(
		save_path = save_path,
		iterator = text_iterator,
		model = model,
		spacy_batch_size = spacy_batch_size,
		build_process_batch_size = build_process_batch_size,
		build_process_cleanup = build_process_cleanup,
	)

	elapsed = time.time() - build_started
	logger.info(f'Build from csv time: {elapsed:.3f} seconds')

	return self

In [None]:
#| hide
# Toy corpus expectations: 6 documents, 38 tokens (32 word tokens), 15 unique tokens (14 unique word tokens)
test = Corpus('test').build_from_files(
	source_path = f'{source_path}toy',
	save_path = save_path,
	file_mask = '*.txt',
	metadata_file = f'{source_path}toy.csv',
	metadata_file_column = 'source',
	metadata_columns = ['category'],
	model = 'en_core_web_sm',
	spacy_batch_size = 1000,
	build_process_batch_size = 5000,
	build_process_cleanup = True,
)

# corpus-level counts
expected_counts = {'document_count': 6, 'token_count': 38, 'word_token_count': 32, 'unique_tokens': 15, 'unique_word_tokens': 14}
for attribute, expected in expected_counts.items():
	assert getattr(test, attribute) == expected, f'{attribute}: expected {expected}'

# metadata is exposed as a LazyFrame; collect it to check the imported columns
assert type(test.metadata) is pl.LazyFrame
test.metadata = test.metadata.collect()
assert test.metadata.columns == ['source', 'category']

# every corpus file should exist on disk
for corpus_file in ['corpus.json', 'README.md', 'vocab.parquet', 'tokens.parquet', 'puncts.parquet', 'spaces.parquet', 'metadata.parquet']:
	assert os.path.isfile(f'{test.corpus_path}/{corpus_file}'), f'missing {corpus_file}'

# vocab rows for a word token and a punctuation token
expected_vocab = {
	'the': {'token_id': 10, 'frequency_lower': 8, 'frequency_orth': 2, 'is_punct': False, 'is_space': False},
	'.': {'token_id': 15, 'frequency_lower': 6, 'frequency_orth': 6, 'is_punct': True, 'is_space': False},
}
for token, expected in expected_vocab.items():
	test_result = test.vocab.filter(pl.col('token') == token).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'vocab {token!r} {column}: expected {value}'

# token rows by position; positions 0, 7 and 99 are checked as EOF padding, document tokens start at 100
eof_row = {'orth_index': test.EOF_TOKEN, 'lower_index': test.EOF_TOKEN, 'token2doc_index': NOT_DOC_TOKEN, 'has_spaces': False}
expected_token_rows = [
	(0, eof_row),
	(99, {'orth_index': test.EOF_TOKEN}),
	(100, {'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(104, {'orth_index': 10, 'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(105, {'token2doc_index': 1, 'has_spaces': False}),
	(106, {'orth_index': 15, 'lower_index': 15, 'token2doc_index': 1, 'has_spaces': False}),
	(7, eof_row),
]
for position, expected in expected_token_rows:
	test_result = test.tokens.with_row_index('position').filter(pl.col('position') == position).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'position {position} {column}: expected {value}'

# remove the corpus written by this test
if os.path.isdir(test.corpus_path):
	shutil.rmtree(test.corpus_path)

del test

In [None]:
#| hide
# document_count = 6
# token_count = 38
# word_token_count = 32
# unique_tokens = 15
# unique_word_tokens = 14

# Toy corpus expectations: 6 documents, 38 tokens (32 word tokens), 15 unique tokens (14 unique word tokens)
test = Corpus('test').build_from_csv(
	source_path = f'{source_path}toy.csv',
	save_path = save_path,
	text_column = 'text',
	metadata_columns = ['source', 'category'],
	model = 'en_core_web_sm',
	spacy_batch_size = 1000,
	build_process_batch_size = 5000,
	build_process_cleanup = True,
)

# corpus-level counts
expected_counts = {'document_count': 6, 'token_count': 38, 'word_token_count': 32, 'unique_tokens': 15, 'unique_word_tokens': 14}
for attribute, expected in expected_counts.items():
	assert getattr(test, attribute) == expected, f'{attribute}: expected {expected}'

# metadata is exposed as a LazyFrame; collect it to check the imported columns
assert type(test.metadata) is pl.LazyFrame
test.metadata = test.metadata.collect()
assert test.metadata.columns == ['source', 'category']

# every corpus file should exist on disk
for corpus_file in ['corpus.json', 'README.md', 'vocab.parquet', 'tokens.parquet', 'puncts.parquet', 'spaces.parquet', 'metadata.parquet']:
	assert os.path.isfile(f'{test.corpus_path}/{corpus_file}'), f'missing {corpus_file}'

# vocab rows for a word token and a punctuation token
expected_vocab = {
	'the': {'token_id': 10, 'frequency_lower': 8, 'frequency_orth': 2, 'is_punct': False, 'is_space': False},
	'.': {'token_id': 15, 'frequency_lower': 6, 'frequency_orth': 6, 'is_punct': True, 'is_space': False},
}
for token, expected in expected_vocab.items():
	test_result = test.vocab.filter(pl.col('token') == token).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'vocab {token!r} {column}: expected {value}'

# token rows by position; positions 0, 7 and 99 are checked as EOF padding, document tokens start at 100
eof_row = {'orth_index': test.EOF_TOKEN, 'lower_index': test.EOF_TOKEN, 'token2doc_index': NOT_DOC_TOKEN, 'has_spaces': False}
expected_token_rows = [
	(0, eof_row),
	(99, {'orth_index': test.EOF_TOKEN}),
	(100, {'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(104, {'orth_index': 10, 'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(105, {'token2doc_index': 1, 'has_spaces': False}),
	(106, {'orth_index': 15, 'lower_index': 15, 'token2doc_index': 1, 'has_spaces': False}),
	(7, eof_row),
]
for position, expected in expected_token_rows:
	test_result = test.tokens.with_row_index('position').filter(pl.col('position') == position).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'position {position} {column}: expected {value}'

# remove the corpus written by this test
if os.path.isdir(test.corpus_path):
	shutil.rmtree(test.corpus_path)

del test

## Load a corpus

In [None]:
#| export
@patch
def load(self: Corpus, 
		 corpus_path: str # path to load corpus
		 ):
	""" Load corpus from disk and load the corresponding spaCy model.

	Raises FileNotFoundError when `corpus_path` is not a directory or when any of
	the expected corpus files is missing from it. Returns the corpus itself.
	"""

	logger.memory_usage('init', init=True)

	load_started = time.time()

	if not os.path.isdir(corpus_path):
		raise FileNotFoundError(f"Path '{corpus_path}' is not a directory")

	# fail on the first required file that is absent
	expected_files = ['corpus.json', 'vocab.parquet', 'tokens.parquet', 'puncts.parquet', 'spaces.parquet']
	for expected_file in expected_files:
		if not os.path.isfile(os.path.join(corpus_path, expected_file)):
			raise FileNotFoundError(f"Path '{corpus_path}' does not contain all expected files: {expected_files}")

	self.corpus_path = corpus_path

	with open(f'{self.corpus_path}/corpus.json', 'rb') as corpus_json:
		corpus_metadata = msgspec.json.decode(corpus_json.read(), type=CorpusMetadata)

	# copy every stored metadata field onto the corpus object
	for field_name in corpus_metadata.__slots__:
		setattr(self, field_name, getattr(corpus_metadata, field_name))

	self._init_spacy_model(self.SPACY_MODEL, version = self.SPACY_MODEL_VERSION)
	self._init_corpus_dataframes()

	elapsed = time.time() - load_started
	logger.info(f'Load time: {elapsed:.3f} seconds')

	return self

In [None]:
#| hide
# Toy corpus expectations: 6 documents, 38 tokens (32 word tokens), 15 unique tokens (14 unique word tokens)
# Build first so the corpus is guaranteed to exist on disk, then load it into a fresh Corpus object
test = Corpus('test').build_from_csv(
	source_path = f'{source_path}toy.csv',
	save_path = save_path,
	text_column = 'text',
	metadata_columns = ['source', 'category'],
	model = 'en_core_web_sm',
	spacy_batch_size = 1000,
	build_process_batch_size = 5000,
	build_process_cleanup = True,
)
test = Corpus('test').load(f'{save_path}/test.corpus')

# corpus-level counts
expected_counts = {'document_count': 6, 'token_count': 38, 'word_token_count': 32, 'unique_tokens': 15, 'unique_word_tokens': 14}
for attribute, expected in expected_counts.items():
	assert getattr(test, attribute) == expected, f'{attribute}: expected {expected}'

# metadata is exposed as a LazyFrame; collect it to check the imported columns
assert type(test.metadata) is pl.LazyFrame
test.metadata = test.metadata.collect()
assert test.metadata.columns == ['source', 'category']

# every corpus file should exist on disk
for corpus_file in ['corpus.json', 'README.md', 'vocab.parquet', 'tokens.parquet', 'puncts.parquet', 'spaces.parquet', 'metadata.parquet']:
	assert os.path.isfile(f'{test.corpus_path}/{corpus_file}'), f'missing {corpus_file}'

# vocab rows for a word token and a punctuation token
expected_vocab = {
	'the': {'token_id': 10, 'frequency_lower': 8, 'frequency_orth': 2, 'is_punct': False, 'is_space': False},
	'.': {'token_id': 15, 'frequency_lower': 6, 'frequency_orth': 6, 'is_punct': True, 'is_space': False},
}
for token, expected in expected_vocab.items():
	test_result = test.vocab.filter(pl.col('token') == token).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'vocab {token!r} {column}: expected {value}'

# token rows by position; positions 0, 7 and 99 are checked as EOF padding, document tokens start at 100
eof_row = {'orth_index': test.EOF_TOKEN, 'lower_index': test.EOF_TOKEN, 'token2doc_index': NOT_DOC_TOKEN, 'has_spaces': False}
expected_token_rows = [
	(0, eof_row),
	(99, {'orth_index': test.EOF_TOKEN}),
	(100, {'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(104, {'orth_index': 10, 'lower_index': 10, 'token2doc_index': 1, 'has_spaces': True}),
	(105, {'token2doc_index': 1, 'has_spaces': False}),
	(106, {'orth_index': 15, 'lower_index': 15, 'token2doc_index': 1, 'has_spaces': False}),
	(7, eof_row),
]
for position, expected in expected_token_rows:
	test_result = test.tokens.with_row_index('position').filter(pl.col('position') == position).collect(engine='streaming')
	for column, value in expected.items():
		assert test_result.select(pl.col(column)).item() == value, f'position {position} {column}: expected {value}'

# remove the corpus written by this test
if os.path.isdir(test.corpus_path):
	shutil.rmtree(test.corpus_path)

del test

In [None]:
#| hide
# registry of source corpora used by the hidden test cells, keyed by slug
corpora = {}
corpora['brown'] = {'name': 'Brown Corpus', 'slug': 'brown', 'description': 'A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/.', 'extension': '.csv.gz'}

In [None]:
#| hide
# Build the Brown corpus from CSV with verbose logging to exercise the build on a larger source.
# NOTE(review): the cleanup path assumes the corpus directory is named 'test-build-with-brown.corpus';
# confirm this matches the directory name derived from the corpus name, otherwise the build is left on disk.
set_logger_state('verbose')
# clear any leftover build from a previous run
if os.path.isdir(f'{save_path}/test-build-with-brown.corpus'):
	shutil.rmtree(f'{save_path}/test-build-with-brown.corpus')

brown = Corpus(name = 'Test Build with ' + corpora['brown']['name'], description = corpora['brown']['description']).build_from_csv(f'{source_path}/brown.csv.gz', save_path = save_path, text_column='text', metadata_columns=['source'])
del brown

# remove the corpus written by this test and restore quiet logging
if os.path.isdir(f'{save_path}/test-build-with-brown.corpus'):
	shutil.rmtree(f'{save_path}/test-build-with-brown.corpus')
set_logger_state('quiet')

2025-06-27 22:23:18 - INFO - memory_usage - init, memory usage: 4959.6640625 MB
2025-06-27 22:23:20 - INFO - memory_usage - Completing build process, memory usage: 4955.40625 MB, difference: -4.2578125 MB
2025-06-27 22:23:20 - INFO - memory_usage - init, memory usage: 4955.66015625 MB
2025-06-27 22:23:20 - INFO - memory_usage - collected vocab, memory usage: 4955.66015625 MB, difference: 0.0 MB
2025-06-27 22:23:20 - INFO - memory_usage - wrote pending tokens to disk, memory usage: 4946.734375 MB, difference: -8.92578125 MB
2025-06-27 22:23:20 - INFO - memory_usage - added vocab strings, memory usage: 4941.9140625 MB, difference: -4.8203125 MB
2025-06-27 22:23:20 - INFO - memory_usage - got punct tokens, memory usage: 4937.09375 MB, difference: -4.8203125 MB
2025-06-27 22:23:20 - INFO - memory_usage - got space tokens, memory usage: 4937.09375 MB, difference: 0.0 MB
2025-06-27 22:23:20 - INFO - memory_usage - saved space positions, memory usage: 4932.55078125 MB, difference: -4.54296875

## Information about the corpus

In [None]:
#| export
@patch
def info(self: Corpus, 
		 include_disk_usage:bool = False, # include information of size on disk in output
		 formatted:bool = True # return formatted output
		 ) -> pl.DataFrame: # dataframe with an `Attribute` and a `Value` column describing the corpus
	""" Return information about the corpus.

	Values are always returned as strings (integers with thousands separators).
	`formatted` only controls the attribute labels: when True, underscores are
	replaced with spaces and labels are title-cased.
	With `include_disk_usage`, the size of each corpus file in MB is appended;
	`os.path.getsize` raises if one of those files is missing.
	"""

	attributes = ['name', 'description', 'date_created', 'conc_version', 'corpus_path', 'document_count', 'token_count', 'word_token_count', 'unique_tokens', 'unique_word_tokens']

	result = []
	for attr in attributes:
		value = getattr(self, attr)
		# bool is checked before int because bool is a subclass of int
		if isinstance(value, bool):
			result.append('True' if value else 'False')
		elif isinstance(value, int):
			result.append(f'{value:,}')
		else:
			result.append(str(value))

	if formatted:
		labels = [attr.replace('_', ' ').title() for attr in attributes]
	else:
		labels = list(attributes)

	if include_disk_usage:
		files = {'corpus.json': 'Corpus Metadata', 'metadata.parquet': 'Document Metadata', 'tokens.parquet': 'Tokens', 'vocab.parquet': 'Vocab', 'puncts.parquet': 'Punctuation positions', 'spaces.parquet': 'Space positions'}
		for file, file_descriptor in files.items():
			size = os.path.getsize(f'{self.corpus_path}/{file}')
			# the unit is appended after title-casing so it stays '(MB)' rather than becoming '(Mb)'
			label = file_descriptor.title() if formatted else file_descriptor
			labels.append(label + ' (MB)')
			result.append(f'{size/1024/1024:.3f}')

	# maybe add in status of these: 'results_cache', 'ngram_index', 'frequency_table'

	return pl.DataFrame({'Attribute': labels, 'Value': result})



In [None]:
#| export
@patch
def report(self: Corpus, 
			include_memory_usage:bool = False # include size on disk of the corpus files in output (forwarded to `info` as `include_disk_usage`)
			) -> Result: # returns Result object with corpus summary information
	""" Get information about the corpus as a result object. """
	corpus_info = self.info(include_disk_usage = include_memory_usage)
	return Result('summary', corpus_info, 'Corpus Summary', '', {}, [])

In [None]:
#| export
@patch
def summary(self: Corpus, 
			include_memory_usage:bool = False # include size on disk of the corpus files in output (forwarded to `info` as `include_disk_usage`)
			):
	""" Print information about the corpus in a formatted table. """
	corpus_info = self.info(include_disk_usage = include_memory_usage)
	Result('summary', corpus_info, 'Corpus Summary', '', {}, []).display()

In [None]:
#| exporti
@patch
def __str__(self: Corpus):
	""" String form of the dataframe returned by `info` with its default arguments. """
	corpus_info = self.info()
	return str(corpus_info)



You can get summary information on your corpus, including the number of documents, the token count and the number of unique tokens as a dataframe using the `info` method. You can also just print the corpus itself.

In [None]:
#| hide
# load the Brown and toy corpora from disk for use in the examples and tests below
brown = Corpus().load(f'{save_path}brown.corpus')
toy = Corpus().load(f'{save_path}toy.corpus')

In [None]:
print(brown) # equivalent to print(brown.info())

┌────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ Attribute          ┆ Value                                                                                                                                                                                                                                              │
╞════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Name               ┆ Brown Corpus                                                                                                                                                                 

The `info` method can also report the disk usage of the corpus files by setting the `include_disk_usage` parameter to `True`. 

In [None]:
print(brown.info(include_disk_usage=True))

┌────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ Attribute                  ┆ Value                                                                                                                                                                                                                                              │
╞════════════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Name                       ┆ Brown Corpus                                                                                                                                 

You can get the same information in a table format by using the `summary` method. 

In [None]:
brown.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/."
Date Created,2025-06-25 16:58:33
Conc Version,0.1.4
Corpus Path,/home/geoff/data/conc-test-corpora/brown.corpus
Document Count,500
Token Count,1138566
Word Token Count,980144
Unique Tokens,42930
Unique Word Tokens,42907


## Working with tokens

Internally, Conc uses Polars and Numpy vector operations where possible to speed up processing.

In [None]:
#| exporti
@patch
def _init_token_arrays(self: Corpus):
	""" Build the cached token lookup structures on first use.

	Populates three entries in `results_cache`: `tokens_array` (token strings indexed
	by token id), `tokens_lookup` (token string to token id) and `tokens_sort_order`
	(sort rank indexed by token id). Does nothing when `tokens_array` is already cached.
	"""
	cache = self.results_cache
	if 'tokens_array' in cache:
		return

	started = time.time()
	token_strings = self.vocab.sort(by = pl.col('token_id')).select(pl.col('token')).collect(engine='streaming').to_numpy().flatten()
	# a dummy entry at index 0 aligns positions in the array with token ids
	cache['tokens_array'] = np.insert(token_strings, 0, ERR_TOKEN_STR)
	logger.info(f'Created tokens_array in {(time.time() - started):.3f} seconds')

	started = time.time()
	# reverse mapping from token string to its index in tokens_array
	cache['tokens_lookup'] = {token: index for index, token in enumerate(cache['tokens_array'])}
	logger.info(f'Created tokens_lookup in {(time.time() - started):.3f} seconds')

	started = time.time()
	sort_ranks = self.vocab.sort(by = pl.col('token_id')).select(pl.col('tokens_sort_order')).collect(engine='streaming').to_numpy().flatten()
	# same dummy entry at index 0 so ranks line up with token ids
	cache['tokens_sort_order'] = np.insert(sort_ranks, 0, 0)
	logger.info(f'Created tokens_sort_order in {(time.time() - started):.3f} seconds')

	# NOTE: the sort order is read from the vocab column rather than computed here;
	# per the earlier implementation notes, computing it took > 1 second for large corpora.

In [None]:
#| hide
# empty the results cache so the token arrays are rebuilt, and time the rebuild with verbose logging on
set_logger_state('verbose')
brown.results_cache = {}
%time brown._init_token_arrays()
set_logger_state('quiet')

2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_array in 0.018 seconds
2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_lookup in 0.010 seconds
2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_sort_order in 0.006 seconds


CPU times: user 55.8 ms, sys: 20.1 ms, total: 76 ms
Wall time: 36.1 ms


In [None]:
#| export
@patch
def token_ids_to_tokens(self: Corpus, 
						token_ids: np.ndarray|list # token ids to return token strings for 
						) -> np.ndarray: # return token strings for token ids
	""" Get token strings for a list of token ids.

	Accepts any array-like of integer ids. An empty input returns an empty array.
	Raises ValueError if any id is negative.
	""" 

	self._init_token_arrays()

	token_ids = np.asarray(token_ids)
	if token_ids.size == 0:
		# an empty list converts to a float64 array, which numpy rejects as an index
		token_ids = token_ids.astype(np.intp)
	if np.any(token_ids < 0):
		raise ValueError("Token ids must be non-negative integers.")

	return self.results_cache['tokens_array'][token_ids]

In [None]:
#| export
@patch
def tokens_to_token_ids(self: Corpus, 
				tokens: list[str]|np.ndarray[str] # list of tokens to get ids for
				) -> np.ndarray[int]: # array of token ids, 0 for unknown tokens
	""" Convert a list or np.array of token strings to token ids.

	Tokens that are not in the lookup map to 0. The result always has numpy's
	default integer dtype, including for empty input.
	"""

	self._init_token_arrays()

	if isinstance(tokens, list):
		tokens = np.array(tokens, dtype=str)

	lookup = self.results_cache['tokens_lookup']
	# explicit dtype: without it an empty input would produce a float64 array
	return np.array([lookup.get(token, 0) for token in tokens], dtype=np.int_)

In [None]:
#| export
@patch
def token_to_id(self: Corpus, 
				token: str # token to get id for
				) -> int: # return token id (0 if token not found in the corpus)
	""" Look up the id of a single token string via `tokens_to_token_ids`. """

	return int(self.tokens_to_token_ids([token])[0])

A list or NumPy array of token strings can be converted to a NumPy array of token ids using `tokens_to_token_ids` ...

In [None]:
tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
token_ids = brown.tokens_to_token_ids(tokens)
print(token_ids)

[15682 37698 47121 13458   526 16875 22848 25923 23289]


To reverse this use `token_ids_to_tokens` ...

In [None]:
tokens = brown.token_ids_to_tokens(token_ids) # token_ids was set above
print(tokens)

['The' 'quick' 'brown' 'fox' 'jumps' 'over' 'the' 'lazy' 'dog']


In [None]:
#|hide
assert np.array_equal(tokens, np.array(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']))

The `tokens_to_token_ids` method will return a 0 for any tokens not in the corpus vocabulary.

In [None]:
tokens = ['some', 'random', 'gazupinfava', 'words']
brown.tokens_to_token_ids(tokens)

array([21572, 28602,     0, 31327])

In [None]:
#|hide
assert brown.tokens_to_token_ids(['gazupinfava']) == np.array([0])

If zero is passed to `token_ids_to_tokens` it will return an error token as shown below. A negative value will raise a ValueError.

In [None]:
brown.token_ids_to_tokens([0])

array(['ERROR: not a token'], dtype=object)

In [None]:
#|hide
assert brown.token_ids_to_tokens(np.array([0])) == np.array([ERR_TOKEN_STR])

In [None]:
#| hide
# a negative token id must be rejected with a ValueError
try:
    brown.token_ids_to_tokens([1, 2, 0, -1])
except ValueError as e:
    print(f"Error: {e}")
else:
    assert False, "Expected ValueError for invalid token id -1"

Error: Token ids must be non-negative integers.


The `token_to_id` method wraps `tokens_to_token_ids`. You can pass a single token string and get the token id back. As with `tokens_to_token_ids`, if the token is not in the vocabulary it will return 0.

In [None]:
print(brown.token_to_id('brown')) # returns token id
print(brown.token_to_id('Supercalifragilisticexpialidocious')) # returns 0 if token not in corpus

47121
0


In [None]:
#| hide
assert type(brown.token_to_id('brown')) == int
assert brown.token_to_id('Supercalifragilisticexpialidocious') == 0

In [None]:
#| export
@patch
def token_ids_to_sort_order(self: Corpus, 
							token_ids: np.ndarray|list # token ids to return sort order for 
							) -> np.ndarray: # rank of token ids
	""" Get the sort order of token strings corresponding to token ids.

	Accepts any array-like of integer ids. An empty input returns an empty array.
	Raises ValueError if any id is negative.
	"""

	self._init_token_arrays()	

	token_ids = np.asarray(token_ids)
	if token_ids.size == 0:
		# an empty list converts to a float64 array, which numpy rejects as an index
		token_ids = token_ids.astype(np.intp)
	if np.any(token_ids < 0):
		raise ValueError("Token ids must be non-negative integers.")

	return self.results_cache['tokens_sort_order'][token_ids]

In [None]:
tokens = np.array(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'])
token_ids = brown.tokens_to_token_ids(tokens)
sort_order = brown.token_ids_to_sort_order(token_ids)
sorted_tokens = tokens[np.argsort(sort_order)]

print(tokens)
print(token_ids)
print(sort_order)
print(sorted_tokens)

['The' 'quick' 'brown' 'fox' 'jumps' 'over' 'the' 'lazy' 'dog']
[15682 37698 47121 13458   526 16875 22848 25923 23289]
[50086 40359  7940 20497 27663 35982 50087 29054 15849]
['brown' 'dog' 'fox' 'jumps' 'lazy' 'over' 'quick' 'The' 'the']


In [None]:
#| hide
# ordering tokens by their corpus sort rank should give the order asserted below
tokens = np.array(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'])
token_ids = brown.tokens_to_token_ids(tokens)
sort_order = brown.token_ids_to_sort_order(token_ids)
# argsort of the ranks gives the positions that put the tokens in sorted order
sorted_tokens = tokens[np.argsort(sort_order)]
assert np.array_equal(sorted_tokens, np.array(['brown', 'dog', 'fox', 'jumps', 'lazy', 'over', 'quick', 'The', 'the']))

In [None]:
#| export
@patch
def get_token_count_text(self: Corpus, 
					exclude_punctuation:bool = False # exclude punctuation tokens from the count
					) -> tuple[int, str, str]: # token count with adjustments based on exclusions, token descriptor, total descriptor
	""" Return the corpus token count together with the descriptor strings that label it in output """

	if exclude_punctuation:
		# punctuation excluded: report the word token count only
		return self.word_token_count, 'word tokens', 'Total word tokens'

	# default: count covers word and punctuation tokens
	return self.token_count, 'word and punctuation tokens', 'Total word and punctuation tokens'

In [None]:
#| hide
# the toy corpus has 38 tokens in total, 32 of which are word tokens
assert toy.get_token_count_text(exclude_punctuation=True) == (32, 'word tokens', 'Total word tokens')
assert toy.get_token_count_text(exclude_punctuation=False) == (38, 'word and punctuation tokens', 'Total word and punctuation tokens')

## Tokenization

In [None]:
#| export
@patch
def tokenize(self: Corpus, 
			 string:str, # string to tokenize 
			#  return_tokens = False, # return token strings
			 simple_indexing = False # use simple indexing
             ) -> tuple[list[tuple], int]: # token id sequences (one tuple per tokenized string) and the spaCy attribute id used for matching
	""" Tokenize a string using the Spacy tokenizer and convert the tokens to corpus token ids. """
	# NOTE: when extending this function - ensure get_token_positions is compatible (e.g. currently assumes fixed sequence length of sequences)

	start_time = time.time()
	# the two values below are only used by the wildcard code retained (commented out) further down
	placeholder_string = 'zzxxzzplaceholderzzxxzz' # so doesn't split tokens
	is_wildcard_search = False
	if not simple_indexing:
		# raising a plain string is itself a TypeError, so raise a real exception type
		raise NotImplementedError('only simple_indexing implemented')
		# retained for future rework
		# if '*' in string:
		# 	is_wildcard_search = True
		# 	string = string.replace('*',placeholder_string)
		# if string.islower() == True:
		# 	index_id = LOWER
		# else:
		# 	index_id = ORTH
		# if '|' in string:
		# 	strings_to_tokenize = string.split('|')
		# else:
		# 	strings_to_tokenize = [string.strip()]

	# simple indexing: one query string, matched on lower-cased tokens
	index_id = LOWER
	strings_to_tokenize = [string.strip()]

	# was tokenizer.pipe(strings_to_tokenize) - retaining for reference
	# token_sequences.append(tuple(doc.to_array(index_id))) # not using spacy indexes once corpus created
	token_sequences = [list(doc) for doc in self._nlp.pipe(strings_to_tokenize)]
	# if is_wildcard_search == True:
	# 	tmp_token_sequence = []
	# 	sequence_count = 1
	# 	for token in doc:
	# 		tmp_token_sequence.append([])
	# 		if placeholder_string in token.text:
	# 			chunked_string = token.text.split(placeholder_string)
	# 			if len(chunked_string) > 2 or (len(chunked_string) == 2 and chunked_string[0] != '' and chunked_string[1] != ''):
	# 				# use regex
	# 				approach = 'regex'
	# 				regex = re.compile('.*'.join(chunked_string))
	# 			elif chunked_string[0] == '':
	# 				approach = 'endswith'
	# 			else:
	# 				approach = 'startswith'
	# 			for token_id in loaded_corpora[corpus_name]['frequency_lookup']:
	# 				possible_word = False
	# 				word = loaded_corpora[corpus_name]['vocab'][token_id]
	# 				if approach == 'regex':
	# 					if regex.match(word):
	# 						possible_word = word
	# 				elif getattr(word,approach)(''.join(chunked_string)):
	# 					possible_word = word
	# 				if possible_word != False:
	# 					tmp_token_sequence[token.i].append(loaded_corpora[corpus_name]['vocab'][possible_word])
	# 		else:
	# 			tmp_token_sequence[token.i].append(token.orth)
	# 		sequence_count *= len(tmp_token_sequence[token.i])
	# 	rotated_token_sequence = []
	# 	token_repeat = sequence_count
	# 	for pos in range(len(tmp_token_sequence)):
	# 		rotated_token_sequence.append([])
	# 		if len(tmp_token_sequence[pos]) == 1:
	# 			rotated_token_sequence[pos] += sequence_count * [tmp_token_sequence[pos][0]]
	# 		else:
	# 			token_repeat = token_repeat // len(tmp_token_sequence[pos])
	# 			while len(rotated_token_sequence[pos]) < sequence_count:
	# 				for token in tmp_token_sequence[pos]:
	# 					rotated_token_sequence[pos] += token_repeat * [token]
	# 	token_sequences = list(zip(*rotated_token_sequence))
	# 	#for tokens in tmp_token_sequence:
	# 	#    for token in tokens:
	# covert token_sequences to reindexed tokens using original_to_new

	# convert sequences to lower case
	if index_id == LOWER:
		token_sequences = [[token.lower_ for token in sequence] for sequence in token_sequences]
	# map token strings to the corpus token ids, one tuple per sequence
	token_sequences = [tuple(self.tokens_to_token_ids(sequence)) for sequence in token_sequences]

	logger.info(f'Tokenization time: {(time.time() - start_time):.5f} seconds')
	# if return_tokens == True:
		# return token_sequences, index_id, doc
	# else:
	return token_sequences, index_id

In [None]:
#| hide
# tokenize query strings of different case and length; print the first token id sequence
# and the name of the spaCy attribute returned as the index id
token_strs = ['dog', 'Dog', 'Brown Fox', 'the brown fox']

for token_str in token_strs:
    brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)
    print(brown_token_sequence[0], spacy_attribute_name(brown_index_id))

(np.int64(23289),) LOWER
(np.int64(23289),) LOWER
(np.int64(47121), np.int64(13458)) LOWER
(np.int64(22848), np.int64(47121), np.int64(13458)) LOWER


## Work with specific texts in the corpus 

In [None]:
#| exporti
@patch
def _get_text(self:Corpus,
        doc_id: int, # the id of the document
        return_df: bool = True # also return the lazy frame holding the document's token and space rows
        ):
    """ Retrieve token strings, space flags and metadata for one document in the corpus """

    if not (1 <= doc_id <= self.document_count):
        raise ValueError(f"Document ID {doc_id} is out of range. Document ID should be between 1 and the count of documents ({self.document_count}).")

    in_document = pl.col('token2doc_index') == doc_id

    # token rows are flagged not_space = 1, space rows not_space = 0
    token_rows = self.tokens.with_row_index('position').filter(in_document).with_columns(pl.lit(1).alias('not_space'))
    space_rows = self.spaces.filter(in_document).with_columns(pl.lit(0).alias('not_space'))
    # sorting on position then not_space places a space row ahead of a token row sharing its position
    doc_tokens_df = pl.concat([token_rows, space_rows]).sort('position', 'not_space') #.drop('position').drop('not_space') # here - creating as df for return_df

    collected = doc_tokens_df.select(['orth_index', 'has_spaces']).collect()
    orth_ids = collected.select(pl.col('orth_index')).to_numpy().flatten()
    tokens = self.token_ids_to_tokens(orth_ids)
    has_spaces = collected.select(pl.col('has_spaces')).to_numpy().flatten()

    # metadata rows are numbered from 1 so they line up with document ids
    metadata = self.metadata.with_row_index(offset = 1, name = 'document_id').filter(pl.col('document_id') == doc_id).collect()

    if return_df == True:
        return tokens, has_spaces, metadata, doc_tokens_df
    return tokens, has_spaces, metadata

In [None]:
#| export
@patch
def text(self:Corpus,
        doc_id: int # the id of the document
        ):
    """ Build a Text object for the document with the given id """

    # _get_text validates the id and returns the pieces passed on to Text
    text_parts = self._get_text(doc_id)
    return Text(*text_parts)

In [None]:
#| hide
# converting the first toy document to a string should reproduce its text
assert str(toy.text(1)) == 'The cat sat on the mat.'

## Find positions of tokens

In [None]:
#| export
@patch
def get_tokens_by_index(self: Corpus, 
			   index: str = 'orth_index', # index to get tokens from i.e. 'orth_index' 'lower_index' 'token2doc_index'
			   exclude_punctuation: bool = False, # exclude punctuation tokens from the result
				) -> np.ndarray:
	""" Return the values of an index as a flat array, caching the array for later calls. """

	logger.debug(f'Getting tokens for index: {index}')
	started = time.time()
	if index not in ('orth_index', 'lower_index', 'token2doc_index'):
		raise ValueError("Index must be one of 'orth_index', 'lower_index', 'token2doc_index'")

	cache = self.results_cache
	cache_key = f'{index}-nopuncts' if exclude_punctuation else index

	if cache_key in cache:
		logger.info(f'Tokens for index {index} with exclude_punctuation {exclude_punctuation} already cached, returning cached result in {(time.time() - started):.3f} seconds')
		return cache[cache_key]

	# the full array is needed in both cases - the punctuation-free array is derived from it
	if index not in cache:
		cache[index] = self.tokens.select(pl.col(index)).collect(engine='streaming').to_numpy().flatten()

	if exclude_punctuation == False:
		logger.info(f'Got tokens for index {index} with exclude_punctuation {exclude_punctuation} in {(time.time() - started):.3f} seconds')
		return cache[index]

	# positions of punctuation tokens are loaded once and shared between indexes
	if 'puncts' not in cache:
		cache['puncts'] = self.puncts.select(pl.col('position')).collect(engine='streaming').to_numpy().flatten()

	# drop punctuation positions, also caching the original position of each remaining token
	cache[cache_key] = np.delete(cache[index], cache['puncts'])
	cache[f'{cache_key}-positions'] = np.delete(np.arange(len(cache[index])), cache['puncts'])
	#self.results_cache[f'{index}-nopuncts'] = self.tokens.with_row_index('position').select(pl.col('position'), pl.col(index)).join(self.puncts.select('position'), on='position', how='anti').drop('position').collect(engine='streaming').to_numpy().flatten()
	logger.info(f'Got tokens for index {index} with exclude_punctuation {exclude_punctuation} in {(time.time() - started):.3f} seconds')
	return cache[cache_key]


In [None]:
#| hide
# clear the cache so the token array is rebuilt and the timing is logged
set_logger_state('verbose')
brown.results_cache = {}
tokens = brown.get_tokens_by_index('orth_index')
print(f'Length of tokens array: {len(tokens)}')
print(tokens[100:110])  # print the 10 token ids at positions 100-109
print(brown.token_ids_to_tokens(tokens[100:110]))  # print the token strings for those 10 ids
set_logger_state('quiet')

2025-06-27 22:23:21 - DEBUG - get_tokens_by_index - Getting tokens for index: orth_index
2025-06-27 22:23:21 - INFO - get_tokens_by_index - Got tokens for index orth_index with exclude_punctuation False in 0.021 seconds


Length of tokens array: 1139266
[15682  4361 14610 54713 45742 53250  8699 45680 30305  2739]


2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_array in 0.012 seconds
2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_lookup in 0.004 seconds
2025-06-27 22:23:21 - INFO - _init_token_arrays - Created tokens_sort_order in 0.005 seconds


['The' 'Fulton' 'County' 'Grand' 'Jury' 'said' 'Friday' 'an'
 'investigation' 'of']


In [None]:
#| hide
# THIS WILL BREAK CI AS CONGRESS DATA NOT UP - SO COMMENT OUT BEFORE COMMIT
# set_logger_state('verbose')
# congress = Corpus().load(f'{save_path}/us-congressional-speeches-subset-500k.corpus')
# congress.results_cache = {}
# tokens = congress.get_tokens_by_index('orth_index', exclude_punctuation=False)
# import sys
# tokens_size_mb = sys.getsizeof(tokens) / (1024 * 1024)
# print(f'Tokens array size in memory: {tokens_size_mb:.3f} MB')
# print(f'Length of tokens array: {len(tokens)}')
# print(tokens[100:110])  # print first 10 tokens
# print(congress.token_ids_to_tokens(tokens[100:110]))  # print first 10 token strings
# set_logger_state('quiet')

2025-06-27 22:23:21 - INFO - memory_usage - init, memory usage: 4937.19921875 MB
2025-06-27 22:23:21 - INFO - load - Load time: 0.222 seconds
2025-06-27 22:23:22 - DEBUG - get_tokens_by_index - Getting tokens for index: orth_index
2025-06-27 22:23:23 - INFO - get_tokens_by_index - Got tokens for index orth_index with exclude_punctuation False in 1.462 seconds
2025-06-27 22:23:23 - INFO - _init_token_arrays - Created tokens_array in 0.118 seconds


Tokens array size in memory: 383.006 MB
Length of tokens array: 100402793
[767113 514572 635422 628652 789648 190869 374120 714548 499625 283795]


2025-06-27 22:23:23 - INFO - _init_token_arrays - Created tokens_lookup in 0.243 seconds
2025-06-27 22:23:24 - INFO - _init_token_arrays - Created tokens_sort_order in 0.042 seconds


['Mr.' 'Speaker' '.' 'with' 'regard' 'to' 'the' 'House' 'investigation'
 'on']


In [None]:
#| hide
# THIS WILL BREAK CI AS CONGRESS DATA NOT UP - SO COMMENT OUT BEFORE COMMIT
# import sys
# set_logger_state('verbose')
# congress = Corpus().load(f'{save_path}/us-congressional-speeches-subset-500k.corpus')
# congress.results_cache = {}
# tokens = congress.get_tokens_by_index('orth_index', exclude_punctuation=True)
# tokens_size_mb = sys.getsizeof(tokens) / (1024 * 1024)
# print(f'Tokens array size in memory: {tokens_size_mb:.3f} MB')
# print(f'Length of tokens array: {len(tokens)}')
# print(tokens[100:110])  # print first 10 tokens
# print(congress.token_ids_to_tokens(tokens[100:110]))  # print first 10 token strings
# print(congress.results_cache['orth_index-nopuncts-positions'][100:110])  # print first 10 token strings
# # %timeit np.delete(congress.results_cache['orth_index'], congress.results_cache['puncts'])
# # %timeit np.setdiff1d(np.arange(len(congress.results_cache['orth_index'])), congress.results_cache['puncts'], assume_unique=True)
# # %timeit np.delete(np.arange(len(congress.results_cache['orth_index'])), congress.results_cache['puncts'])
# tokens = congress.get_tokens_by_index('orth_index', exclude_punctuation=True)
# set_logger_state('quiet')

2025-06-27 22:23:24 - INFO - memory_usage - init, memory usage: 3882.6015625 MB
2025-06-27 22:23:24 - INFO - load - Load time: 0.265 seconds
2025-06-27 22:23:24 - DEBUG - get_tokens_by_index - Getting tokens for index: orth_index
2025-06-27 22:23:25 - INFO - get_tokens_by_index - Got tokens for index orth_index with exclude_punctuation True in 1.415 seconds
2025-06-27 22:23:25 - INFO - _init_token_arrays - Created tokens_array in 0.114 seconds
2025-06-27 22:23:26 - INFO - _init_token_arrays - Created tokens_lookup in 0.205 seconds


Tokens array size in memory: 346.535 MB
Length of tokens array: 90842144
[767113 514572 628652 789648 190869 374120 714548 499625 283795 374120]


2025-06-27 22:23:26 - INFO - _init_token_arrays - Created tokens_sort_order in 0.033 seconds
2025-06-27 22:23:26 - DEBUG - get_tokens_by_index - Getting tokens for index: orth_index
2025-06-27 22:23:26 - INFO - get_tokens_by_index - Tokens for index orth_index with exclude_punctuation True already cached, returning cached result in 0.000 seconds


['Mr.' 'Speaker' 'with' 'regard' 'to' 'the' 'House' 'investigation' 'on'
 'the']
[100 101 103 104 105 106 107 108 109 110]


In [None]:
#| export
@patch
def get_ngrams_by_index(self: Corpus, 
				ngram_length:int, # length of ngrams to get
				index:str,  # index to get tokens from, e.g. 'orth_index' 'lower_index'
				exclude_punctuation: bool = False # exclude punctuation tokens from the result
				) -> np.ndarray: # 2D array with one row per token position and ngram_length columns
	""" Get ngrams for a given index and ngram length, caching the result in self.ngram_index. """

	if index not in ['orth_index', 'lower_index']:
		raise ValueError("Index must be either 'orth_index' or 'lower_index'")
	if ngram_length < 1:
		# previously surfaced as an opaque np.vstack error about having no arrays
		raise ValueError("ngram_length must be at least 1")

	cache_key = (index, ngram_length, exclude_punctuation)
	if cache_key not in self.ngram_index:
		# fetch the token array once rather than once per shift
		tokens = self.get_tokens_by_index(index, exclude_punctuation)
		# column k holds the tokens shifted left by k, so row i is the ngram starting at position i
		# NOTE: np.roll wraps around, so the last ngram_length - 1 rows continue from the start of the array
		shifted = [np.roll(tokens, -offset) for offset in range(ngram_length)]
		self.ngram_index[cache_key] = np.vstack(shifted).T

	return self.ngram_index[cache_key]

In [None]:
# show ten rows of lower-cased bigrams (rows 100-109), each row a pair of token ids
toy.get_ngrams_by_index(ngram_length=2, index='lower_index')[100:110]

array([[10,  6],
       [ 6, 12],
       [12,  8],
       [ 8, 10],
       [10, 13],
       [13, 15],
       [15, 17],
       [17, 10],
       [10, 11],
       [11, 12]], dtype=uint32)

In [None]:
#| hide
# congress = Corpus().load(f'{save_path}/us-congressional-speeches-subset-500k.corpus')
# sys.getsizeof(congress.get_tokens_by_index('orth_index'))/1024/1024

In [None]:
#| export
@patch
def get_token_positions(self: Corpus, 
					token_sequence: list[np.ndarray], # token sequence to get index for 
					index_id: int, # index to search (i.e. ORTH, LOWER)
					exclude_punctuation: bool = False # exclude punctuation tokens when building the ngrams to search
					) -> list[np.ndarray]: # single-item list holding the array of matching positions
	""" Get the positions of a token sequence in the corpus. 

	`token_sequence` holds one or more variants of equal length (as returned by `tokenize`). 
	A position is returned if the ngram starting there equals any one variant in full. """
	
	start_time = time.time()

	sequence_len = len(token_sequence[0]) # Check when extend tokenization - all variants assumed to share this length

	index = 'orth_index' if index_id == ORTH else 'lower_index'

	# fetch the ngram array once rather than once per variant
	ngrams = self.get_ngrams_by_index(ngram_length = sequence_len, index = index, exclude_punctuation = exclude_punctuation)

	# match each variant as a whole row, then combine variants with OR
	# (combining per column first would accept rows mixing tokens from different variants)
	matches = np.zeros(ngrams.shape[0], dtype=bool)
	for seq in token_sequence:
		matches |= np.all(ngrams == seq, axis=1)
	results = [np.where(matches)[0]]

	logger.info(f'Token indexing ({len(results[0])}) time: {(time.time() - start_time):.5f} seconds')
	return results

In [None]:
# tokenize the query, then find every position in the corpus where it occurs
token_str = 'dog'
token_sequence, index_id = brown.tokenize(token_str, simple_indexing=True)
token_positions = brown.get_token_positions(token_sequence, index_id)
print(token_positions)

[array([  18833,   18870,   18880,   18950,   18957,   37578,   88691,
        125019,  137037,  137687,  137722,  137731,  137775,  143860,
        188374,  248842,  248982,  249204,  249217,  249243,  249311,
        249337,  249397,  249425,  249535,  250476,  250495,  250554,
        250613,  250645,  250699,  250709,  251033,  252740,  253700,
        255256,  255360,  255532,  330282,  359785,  437987,  437991,
        438046,  438051,  463456,  463485,  463507,  521175,  648316,
        694080,  694129,  694289,  694481,  694760,  695139,  695216,
        695313,  861865,  861872,  863503,  863521,  875531,  875573,
        875660,  887598,  994901, 1012130, 1028088, 1050598, 1050607,
       1052032, 1074911, 1084765, 1086020, 1086052, 1086639, 1104994,
       1128317, 1137426])]


In [None]:
#| hide
# 'dog' should be found at three known positions in the toy corpus,
# and the tokens stored at those positions should all be 'dog'
token_str = 'dog'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
assert np.array_equal(token_positions[0], np.array([109, 123, 137])) # validated using toy.token_ids_to_tokens(toy.get_tokens_by_index('orth_index')[100:-100])
assert np.array_equal(toy.token_ids_to_tokens(toy.get_tokens_by_index('orth_index')[token_positions[0]]), np.array(['dog', 'dog', 'dog']))

In [None]:
#| exporti
@patch
def _shift_zeroes_to_end(self:Corpus,
						arr:np.ndarray # 2D Numpy array to process, each column is handled independently
						) -> np.ndarray: # new array of the same shape, the input is not modified
	""" Move 0 value positions for punctuation and space removal, zeroes get moved to the end of each column. """
	# A stable sort on the "is zero" flag keeps non-zero values in their original order
	# and places the zeroes after them; this handles all columns in one vectorized step.
	order = np.argsort(arr == 0, axis=0, kind='stable')
	return np.take_along_axis(arr, order, axis=0)

In [None]:
#| exporti
@patch
def _shift_zeroes_to_start(self:Corpus,
						arr:np.ndarray # 2D Numpy array to process, each column is handled independently
						) -> np.ndarray: # new array of the same shape, the input is not modified
	""" Move 0 value positions for punctuation and space removal to the start of each column """
	# A stable sort on the "is non-zero" flag places the zeroes first and keeps the
	# non-zero values in their original order; this handles all columns in one vectorized step.
	order = np.argsort(arr != 0, axis=0, kind='stable')
	return np.take_along_axis(arr, order, axis=0)

In [None]:
#| exporti
@patch
def _zero_after_value(self:Corpus,
					  arr:np.ndarray, # 2D Numpy array to process, each column is handled independently
					  target: int # Target value to find in the array (e.g., an end-of-file token)
					  ) -> np.ndarray: # copy of arr with the values zeroed, the input is not modified
	""" Set values from first occurrence of target value to 0 in each column (for processing tokens outside text using eof token) """
	# a cumulative OR down each column is True from the first occurrence of target onwards
	from_first_match = np.logical_or.accumulate(arr == target, axis=0)
	result = arr.copy()
	result[from_first_match] = 0
	return result

In [None]:
#| export
@patch
def get_tokens_in_context(self:Corpus,
							   token_positions:list[np.ndarray], # token positions in the corpus, the first item is the Numpy array of positions (as returned by get_token_positions)
							   index:str, # Index to use - lower_index, orth_index
							   context_length:int = 5, # Number of context words to consider on each side of the token
							   position_offset:int = 1, # offset to start retrieving context words - negative is left of node, positive for right - may want to adjust if sequence_len > 1
							   position_offset_step:int = 1, # step to move position offset by, this sets direction, -1 for left, 1 for right
							   exclude_punctuation:bool = True, # ignore punctuation from context retrieved
							   convert_eof:bool = True # if True (for collocation functionality), contexts with end of file tokens will have eof token and tokens after set to zero, otherwise EOF retained (e.g. False used for ngrams)
							   ) -> np.ndarray: # array with context_length rows and one column per token position
	""" Get tokens in context for given token positions, context length and direction, operates one side at a time. """

	start_time = time.time()

	if context_length < 1:
		# return empty result
		return np.zeros((0, 0), dtype=np.int32)

	positions = token_positions[0]
	tokens_for_removal = list(self.punct_tokens) if exclude_punctuation else []

	# fetch the token array once rather than on every iteration
	corpus_tokens = self.get_tokens_by_index(index)

	context_rows = []
	while True:
		# collect one more row of context, one step further from the node
		offset_positions = np.array(positions + position_offset, dtype = positions.dtype)
		context_rows.append(corpus_tokens[offset_positions])
		position_offset += position_offset_step
		if len(context_rows) < context_length:
			continue

		context_tokens = np.array(context_rows, dtype = positions.dtype)
		# shape = (number of rows collected, len(token_positions[0]))
		logger.debug(f"Context tokens collected: {context_tokens.shape}")
		if tokens_for_removal: # blank out punctuation so it does not count towards the context
			context_tokens = np.where(np.isin(context_tokens, tokens_for_removal), 0, context_tokens)
		counts = np.count_nonzero(context_tokens, axis=0)
		# stop once every column has enough tokens left; with no positions there is nothing
		# to fill (and taking the minimum of an empty array would raise)
		if counts.size == 0 or counts.min() >= context_length:
			break

	# compact each column so the removed tokens are at the end, then trim to the requested length
	context_tokens = self._shift_zeroes_to_end(context_tokens)
	context_tokens = context_tokens[:context_length, :]

	if convert_eof and self.EOF_TOKEN in context_tokens:
		# zero the eof token and everything after it in each column
		context_tokens = self._zero_after_value(context_tokens, self.EOF_TOKEN)

	logger.info(f"Context retrieved in {time.time() - start_time:.2f} seconds.")

	return context_tokens

In [None]:
#| export
def build_test_corpora(
		source_path:str, # path to folder with corpora
		save_path:str, # path to save corpora
		force_rebuild:bool = False # force rebuild of corpora, useful for development and testing
		):
	"""(Deprecated - moved to conc.corpora) Always raises DeprecationWarning; no corpora are built."""

	# the arguments are accepted for backwards compatibility only
	deprecation_message = "Calling build_test_corpora via conc.corpus is deprecated and will be removed by v1.0.0, instead import as 'from conc.corpora import build_test_corpora' and use 'build_sample_corpora'."
	raise DeprecationWarning(deprecation_message)

Note: `build_sample_corpora` was accessible via `conc.corpus` as `build_test_corpora` up to version 0.1.1. Calling it this way now raises a `DeprecationWarning`. It will be removed in version 1.0.

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()