# corpus

> Create a conc corpus.

In [None]:
#| default_exp corpus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
# requirements - numpy pandas polars spacy nltk great_tables
# dev requirements - nbdev, jupyterlab, memory_profiler
# TODO check

from __future__ import annotations
import re
import polars as pl
import numpy as np
from great_tables import GT
import os
import glob
import spacy
from spacy.attrs import ORTH, LOWER # TODO - add ENT_TYPE, ENT_IOB? from spacy.attrs import SPACY, POS, TAG, SENT_START, LEMMA
import sys
import string
from fastcore.basics import patch
import time
from slugify import slugify
import msgspec # tested against orjson - with validation was faster, without around the same


In [None]:
#| hide
import shutil

In [None]:
# from memory_profiler import memory_usage
# import pickle
# import gc

In [None]:
#| export
from conc import __version__
from conc.core import logger, set_logger_state, PAGE_SIZE, EOF_TOKEN_STR, REPOSITORY_URL, DOCUMENTATION_URL, CITATION_STR
from conc.result import Result

In [None]:
#| exporti
# Module-wide Polars display settings: hide dtype/shape decorations and widen
# the rendered tables so long token strings are not truncated.
polars_conf = (
    pl.Config
    .set_tbl_hide_column_data_types(True)
    .set_tbl_hide_dataframe_shape(True)
    .set_tbl_rows(50)
    .set_tbl_width_chars(300)
    .set_fmt_str_lengths(300)
)

In [None]:
#| exporti
# Matches any run of one or more whitespace characters.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
# Matches a string that is exactly one character which is neither whitespace
# nor a word character. The previous pattern, [^\s^\w^\d], treated the inner
# '^' as literal carets, so '^' itself was never matched; \d is already part
# of \w. Note that '_' counts as a word character and is therefore not matched.
_RE_PUNCT = re.compile(r"^[^\s\w]$")

In [None]:
#| export
# Value stored in token2doc_index for positions that do not belong to a document
# (the header/footer padding and the end-of-document marker after each text).
NOT_DOC_TOKEN = -1
# Number of EOF tokens written at the start and again at the end of the token
# index during build, to prevent out-of-bounds issues on searches.
INDEX_HEADER_LENGTH = 100

## Corpus metadata validator

In [None]:
#| export
class CorpusMetadata(msgspec.Struct): 
    """ JSON validation schema for corpus metadata """
    # This struct is encoded to corpus.json by Corpus.save_corpus_metadata and
    # decoded (with type validation) by Corpus.load, which copies every field
    # back onto the Corpus instance.

    # descriptive information
    name: str
    description: str
    slug: str
    conc_version: str # conc version that built the corpus

    # corpus-level counts
    document_count: int
    token_count: int
    word_token_count: int
    punct_token_count: int
    space_token_count: int
    unique_tokens: int
    unique_word_tokens: int
    date_created: str # formatted as '%Y-%m-%d %H:%M:%S' by the build process
    #source_path: str

    # tokenizer settings and special token ids
    EOF_TOKEN: int # internal (re-indexed) id of the end-of-file token
    SPACY_EOF_TOKEN: int # spaCy vocab id of the end-of-file token
    SPACY_MODEL: str
    SPACY_MODEL_VERSION: str
    punct_tokens: list[int] # internal ids of punctuation-only tokens
    space_tokens: list[int] # internal ids of whitespace-only tokens



## Corpus class

In [None]:
#| export
class Corpus:
	"""Representation of a text corpus, with methods to build, load and save a corpus from a variety of formats and to work with the corpus data."""
	
	def __init__(self, 
				name: str = '', # name of corpus
				description: str = '' # description of corpus
				):
		# information about corpus
		self.name = name
		self.description = description
		self.slug = None

		# conc version that built the corpus
		self.conc_version = None
		
		# paths
		self.corpus_path = None
		self.source_path = None

		# settings
		self.SPACY_MODEL = None
		self.SPACY_MODEL_VERSION = None
		self.SPACY_EOF_TOKEN = None # set below as nlp.vocab[EOF_TOKEN_STR].orth in build or through load  - EOF_TOKEN_STR starts with space so eof_token can't match anything from corpus
		self.EOF_TOKEN = None

		# special token ids
		self.punct_tokens = None
		self.space_tokens = None

		# metadata for corpus
		self.document_count = None
		self.token_count = None
		self.unique_tokens = None

		self.word_token_count = None
		self.unique_word_tokens = None

		# counts of punctuation and whitespace tokens; these are read when the
		# corpus metadata is saved, so they must always exist on the instance
		self.punct_token_count = None
		self.space_token_count = None

		self.date_created = None

		# token data
		self.orth_index = None
		self.lower_index = None

		# lookup mapping doc_id to every token in doc
		self.token2doc_index = None

		# lookups to get token string or frequency 
		self.vocab = None
		self.frequency_lookup = None

		# offsets for each document in token data
		self.offsets = None

		# punct and space positions in token data
		self.punct_positions = None
		self.space_positions = None

		# metadata for each document
		self.metadata = []

		# lookups to get spacy tokenizer or internal ids
		self.original_to_new = None
		self.new_to_original = None
		
		# temporary data used when processing text, not saved to disk permanently on save
		self.frequency_table = None
		self.ngram_index = {}
		self.results_cache = {}


## Build and save a corpus

In [None]:
#| exporti
@patch
def _init_spacy_model(self: Corpus,
				model: str = 'en_core_web_sm', # spacy model to use for tokenization
				version: str|None = None # version of spacy model expected, if mismatch will raise a warning
				):
	""" Load the spaCy model into self._nlp with non-tokenizer pipes disabled, warning if the model version differs from the expected one. """

	# pipeline components that are switched off after loading the model
	unused_pipes = ['parser', 'ner', 'lemmatizer', 'tagger', 'senter', 'tok2vec', 'attribute_ruler']

	try:
		self._nlp = spacy.load(model)
		self._nlp.disable_pipes(unused_pipes)
		# set max length to a large number to avoid issues with long documents
		self._nlp.max_length = 10_000_000
	except OSError as e:
		logger.error(f'Error loading model {model}. You need to run python -m spacy download YOURMODEL to download the model. See https://spacy.io/models for available models.')
		raise e

	# nothing to compare when no expected version is supplied
	if version is None:
		return

	if self._nlp.meta['version'] != version:
		logger.warning(f'Spacy model version mismatch: expecting {version}, got {self._nlp.meta["version"]}. This may cause issues with tokenization.')

In [None]:
#| exporti
@patch
def _process_punct_positions(self: Corpus):
	""" Find the vocab ids of punctuation-only tokens (punct_tokens) and the positions in lower_index where they occur (punct_positions). """

	# a token counts as punctuation when stripping string.punctuation leaves nothing
	punct_ids = [token_id for token_id, token_str in self.vocab.items() if token_str.strip(string.punctuation) == '']
	self.punct_tokens = np.array(punct_ids)

	# isin builds a boolean mask; only the matching positions are kept as that is smaller to store
	is_punct = np.isin(self.lower_index, self.punct_tokens)
	self.punct_positions = np.nonzero(is_punct)[0]

Punctuation tokens are identified using Python's `string.punctuation`: a token is treated as punctuation when it consists only of those characters.

In [None]:
#| exporti
@patch
def _process_space_positions(self: Corpus):
	""" Process whitespace positions in token data and populates space_tokens and space_positions. """

	# NOTE: the export directive above is required because _create_indices, which
	# is exported, calls this method; without it the exported module lacks it.

	# a token counts as whitespace when stripping whitespace leaves nothing
	self.space_tokens = np.array([token_id for token_id, token_str in self.vocab.items() if token_str.strip() == ''])
	space_mask = np.isin(self.lower_index, self.space_tokens) 	# faster to retrieve with isin than where
	self.space_positions = np.nonzero(space_mask)[0] # storing this as smaller


spaCy includes whitespace tokens in the vocab to support non-destructive tokenisation. The positions of these tokens are stored so they can be filtered out for analysis and reporting.

In [None]:
#| hide
# reminder of string.punctuation characters
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Tokens consisting of only punctuation are defined as punctuation tokens. These can be removed or included in analysis and reporting.

In [None]:
#| exporti
@patch
def _init_build_process(self:Corpus,
						save_path: str, # path to save corpus data 
						):
	""" Record the conc version, derive the slug and corpus_path from the corpus name, and make sure the corpus directory exists. """

	self.conc_version = __version__

	# the slug (with the word 'corpus' removed) names the corpus directory under save_path
	self.slug = slugify(self.name, stopwords=['corpus'])
	self.corpus_path = f'{save_path}/{self.slug}.corpus'

	# create the directory (and any parents) unless it is already there
	os.makedirs(self.corpus_path, exist_ok=True)

In [None]:
#| exporti
@patch
def _update_build_process(self: Corpus, 
                           orth_index: list[np.ndarray], # orthographic token ids
                           lower_index: list[np.ndarray], # lower case token ids
                           token2doc_index: list[np.ndarray], # token to document mapping
                           store_pos: int # current store pos
                           ) -> int: # next store pos
    """ Write in-progress build data to Parquet disk store. """

    # Each argument is a list of arrays accumulated since the last flush; they are
    # concatenated into one column each and written as a single batch file.
    batch_df = pl.DataFrame(
        [np.concatenate(orth_index), np.concatenate(lower_index), np.concatenate(token2doc_index)],
        schema = [('orth_index', pl.UInt64), ('lower_index', pl.UInt64), ('token2doc_index', pl.Int32)],
    )

    # The batch number is zero-padded so that the 'build_*.parquet' glob used to
    # read the batches back sees them in write order even when files are listed
    # alphabetically (unpadded, 'build_10' would sort before 'build_2', which
    # would scramble document order once there are ten or more batches).
    batch_df.write_parquet(f'{self.corpus_path}/build_{store_pos:08d}.parquet')
    return store_pos + 1

In [None]:
#| exporti
@patch
def _complete_build_process(self: Corpus, 
							build_process_cleanup: bool = True # Remove the build files after build is complete, retained for development and testing purposes
							):
	""" Complete the disk-based build to create representation of the corpus.

	Reads the build_*.parquet batches in corpus_path, re-indexes the spaCy token
	ids to compact internal ids starting at 1, writes vocab.parquet, tokens.parquet,
	puncts.parquet and spaces.parquet, and sets the corpus-level counts, EOF_TOKEN,
	punct_tokens, space_tokens and date_created on the instance. """

	logger.memory_usage('init', init=True)
	input_df = pl.scan_parquet(f'{self.corpus_path}/build_*.parquet')
	# combining indexes to reindex: orth ids stacked on top of lower ids in one 'index' column
	combined_df = pl.concat([input_df.select(pl.col('orth_index').alias('index')), input_df.select(pl.col('lower_index').alias('index'))])

	input_length = input_df.select(pl.len()).collect(engine='streaming').item() # tested vs count - len seems to have slight memory overhead, but more correct (i.e. count only counts non-null)
	logger.memory_usage(f'got input length {input_length}')

	# get unique vocab ids (combining orth and lower) and create new index
	# token_id starts at 1 and follows the sorted order of the spaCy ids
	vocab_df  = combined_df.select(pl.col('index').unique().sort().alias('source_id')).with_row_index('token_id', offset=1)
	logger.memory_usage('collected vocab')

	# replace each spaCy id with its new token_id, keeping the original row order
	combined_df = (
		combined_df
		.join(vocab_df, left_on="index", right_on="source_id", how="left", maintain_order="left")
		.drop("index")
		.rename({"token_id": "index"})
		.with_columns(pl.col("index").cast(pl.UInt32).alias("index"))
	)

	# split the stacked column back apart: first input_length rows are orth, the rest are lower
	tokens_df = pl.concat(
									[combined_df.select(pl.col('index').alias('orth_index')).slice(0, input_length), 
									combined_df.select(pl.col('index').alias('lower_index')).slice(input_length),
									input_df.select(pl.col('token2doc_index'))], how='horizontal'
							)
	
	del combined_df
	del input_df
	logger.memory_usage('freed up combined_df and input_df')

	vocab_query = vocab_df.select(pl.col('source_id')).collect(engine='streaming').to_numpy().flatten() # get vocab ids as numpy array for faster processing

	vocab = {k:self._nlp.vocab[k].text for k in vocab_query} # get vocab strings from spacy vocab
	token_strs = list(vocab.values())
	logger.memory_usage('got vocab strings')
	vocab_df = vocab_df.with_columns(pl.Series(token_strs).alias('token'))
	logger.memory_usage('added vocab strings')

	# internal id of the end-of-file marker
	self.EOF_TOKEN = vocab_df.filter(pl.col('source_id') == self.SPACY_EOF_TOKEN).select(pl.col('token_id')).collect(engine='streaming').item()
	
	# token_strs is in token_id order, so list position + 1 is the token_id
	self.punct_tokens = [(k + 1) for k, v in enumerate(token_strs) if v.strip(string.punctuation) == '']
	logger.memory_usage('got punct tokens')
	self.space_tokens = [(k + 1) for k, v in enumerate(token_strs) if v.strip() == '']
	logger.memory_usage('got space tokens')

	del token_strs

	# write the row positions of punctuation and whitespace tokens straight to disk
	tokens_df.select(pl.col('lower_index')).with_row_index('position').filter(pl.col('lower_index').is_in(self.punct_tokens)).select('position').sink_parquet(f'{self.corpus_path}/puncts.parquet')
	logger.memory_usage('saved punct positions')
	tokens_df.select(pl.col('lower_index')).with_row_index('position').filter(pl.col('lower_index').is_in(self.space_tokens)).select('position').sink_parquet(f'{self.corpus_path}/spaces.parquet')
	logger.memory_usage('saved space positions')

	# get counts from tokens_df (EOF markers excluded)
	frequency_lower = tokens_df.filter(pl.col('lower_index') != self.EOF_TOKEN).select(pl.col('lower_index')).group_by('lower_index').agg(pl.count('lower_index').alias('frequency_lower'))
	frequency_orth = tokens_df.filter(pl.col('orth_index') != self.EOF_TOKEN).select(pl.col('orth_index')).group_by('orth_index').agg(pl.count('orth_index').alias('frequency_orth'))
	vocab_df = vocab_df.join(frequency_lower, left_on = 'token_id', right_on = 'lower_index', how='left', maintain_order="left").join(frequency_orth, left_on = 'token_id', right_on = 'orth_index', how='left', maintain_order="left")
	logger.memory_usage('added frequency to vocab')

	self.unique_tokens = frequency_lower.select(pl.len()).collect(engine='streaming').item() # was len(frequency_lower) before used polars streaming # TODO - validate correct - make sure that EOF_TOKEN not included
	# log the value just computed (previously logged document_count, which is not set until later)
	logger.memory_usage(f'got unique tokens {self.unique_tokens}')

	# add column for is_punct and is_space based on punct_tokens and space_tokens and token_id
	vocab_df = vocab_df.with_columns((pl.col("token_id").is_in(self.punct_tokens)).alias("is_punct"))
	vocab_df = vocab_df.with_columns((pl.col("token_id").is_in(self.space_tokens)).alias("is_space"))
	logger.memory_usage('added is_punct is_space to vocab')

	vocab_df.sink_parquet(f'{self.corpus_path}/vocab.parquet')
	logger.memory_usage('wrote vocab to disk')
	tokens_df.sink_parquet(f'{self.corpus_path}/tokens.parquet')
	logger.memory_usage('wrote tokens to disk')

	# NOTE(review): documents are counted from the distinct doc ids present in the
	# token data, so a document that produced no tokens would not be counted - confirm
	self.document_count = tokens_df.select(pl.col('token2doc_index').filter(pl.col('token2doc_index') != NOT_DOC_TOKEN).unique().count()).collect(engine='streaming').item()
	logger.memory_usage(f'got doc count {self.document_count}')
	# adjusting for text breaks and headers at start and end of index
	self.token_count = input_length - self.document_count - INDEX_HEADER_LENGTH - INDEX_HEADER_LENGTH 
	logger.memory_usage('got token count')

	del tokens_df

	self.punct_token_count = pl.scan_parquet(f'{self.corpus_path}/puncts.parquet').select(pl.len()).collect(engine='streaming').item() # TODO - may be more efficient to do this prior to disk write
	logger.memory_usage('got punct token count')
	self.space_token_count = pl.scan_parquet(f'{self.corpus_path}/spaces.parquet').select(pl.len()).collect(engine='streaming').item() # TODO - may be more efficient to do this prior to disk write
	logger.memory_usage('got space token count')
	self.word_token_count = self.token_count - self.punct_token_count - self.space_token_count # TODO - validate correct - make sure that EOF_TOKEN not included
	self.unique_word_tokens = self.unique_tokens - len(self.punct_tokens) - len(self.space_tokens) # TODO - validate correct - make sure that EOF_TOKEN not included
	
	self.date_created = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

	if build_process_cleanup:
		for f in glob.glob(f'{self.corpus_path}/build_*.parquet'):
			os.remove(f)
		logger.memory_usage('removed build files')
	
	logger.memory_usage('done')



In [None]:
#| exporti
@patch
def _create_indices(self: Corpus, 
				   orth_index: list[np.ndarray], # list of np arrays of orth token ids 
				   lower_index: list[np.ndarray], # list of np arrays of lower token ids
				   token2doc_index: list[np.ndarray] # list of np arrays of doc ids
				   ):
	""" (Depreciated) Use Numpy to create internal representation of the corpus for faster analysis and efficient representation on disk. Only used when the disk-based build process is not used. """

	self.token2doc_index = np.concatenate(token2doc_index)
	unique_values, inverse = np.unique(np.concatenate(orth_index + lower_index), return_inverse=True)

	# adding a dummy value at the 0 index to avoid 0 being used as a token id
	unique_values = np.insert(unique_values, 0, 0)
	inverse += 1
	new_values = np.arange(len(unique_values), dtype=np.uint32)
	self.original_to_new = dict(zip(unique_values, new_values))
	self.new_to_original = dict(zip(new_values, unique_values))

	self.orth_index = np.array(np.split(inverse, 2)[0], dtype=np.uint32)
	self.lower_index = np.array(np.split(inverse, 2)[1], dtype=np.uint32)
	del inverse

	vocab = {k:self._nlp.vocab.strings[k] for k in unique_values}
	vocab[0] = 'ERROR: not a token'

	self.vocab = {**{k:vocab[self.new_to_original[k]] for k in new_values}}

	self.EOF_TOKEN = self.original_to_new[self.SPACY_EOF_TOKEN]

	self._process_punct_positions()
	self._process_space_positions()

	self.frequency_lookup = dict(zip(*np.unique(self.lower_index, return_counts=True)))
	del self.frequency_lookup[self.EOF_TOKEN]
	del unique_values



In [None]:
#| export
@patch
def save_corpus_metadata(self: Corpus, 
		 ):
	""" Write corpus.json (validated corpus metadata) and a README.md describing the corpus into corpus_path. """
	
	start_time = time.time()

	# attributes copied from the corpus into the metadata struct
	metadata_fields = ['name', 'description', 'slug', 'conc_version', 'document_count', 'token_count', 'word_token_count', 'punct_token_count', 'space_token_count', 'unique_tokens', 'unique_word_tokens', 'date_created', 'EOF_TOKEN', 'SPACY_EOF_TOKEN', 'SPACY_MODEL', 'SPACY_MODEL_VERSION', 'punct_tokens', 'space_tokens']
	corpus_metadata = CorpusMetadata(**{field: getattr(self, field) for field in metadata_fields})
	json_bytes = msgspec.json.encode(corpus_metadata)

	with open(f'{self.corpus_path}/corpus.json', 'wb') as json_file:
		json_file.write(json_bytes)

	# human-readable summary, written section by section
	with open(f'{self.corpus_path}/README.md', 'w', encoding='utf-8') as readme_file:
		for section in (
			f'# {self.name}\n\n{self.description}\n\n## About\n\nThis directory contains a corpus created by [Conc]({REPOSITORY_URL}) (version {self.conc_version}) on {self.date_created}. \n\nIt was created using spaCy model: {self.SPACY_MODEL} (version {self.SPACY_MODEL_VERSION})\n\n',
			f'## Corpus Information\n\nDocument count: {self.document_count}  \nToken count: {self.token_count}  \nWord token count: {self.word_token_count}  \nUnique tokens: {self.unique_tokens}  \nUnique word tokens: {self.unique_word_tokens}\n\n',
			f'## Using this corpus\n\nConc can be installed with pip using:  \n\nDocumentation with tutorials to get you started is available at {DOCUMENTATION_URL} \n\n',
			f'## Cite Conc\n\n{CITATION_STR}',
		):
			readme_file.write(section)
		
	logger.info(f'Saved corpus metadata time: {(time.time() - start_time):.3f} seconds')

In [None]:
#| export
@patch
def build(self: Corpus, 
		  save_path:str, # directory where corpus will be created, a subdirectory will be automatically created with the corpus content
		  iterator: iter, # iterator of texts
		  model: str='en_core_web_sm', # spacy model to use for tokenisation
		  spacy_batch_size:int=500, # batch size for spacy tokenizer
		  #build_process_path:str|None=None, # path to save an in-progress build to disk to reduce memory usage, default of None disables 
		  build_process_batch_size:int=5000, # save in-progress build to disk every n docs
		  build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
		  ):
	"""Build a corpus from an iterator of texts."""

	# Layout of the token index produced here:
	#   [INDEX_HEADER_LENGTH EOF tokens] doc 0 tokens, EOF, doc 1 tokens, EOF, ... [INDEX_HEADER_LENGTH EOF tokens]
	# token2doc_index holds the document number for document tokens and NOT_DOC_TOKEN elsewhere.

	self._init_spacy_model(model)
	
	self.SPACY_MODEL = model
	self.SPACY_MODEL_VERSION = self._nlp.meta['version']
	self.SPACY_EOF_TOKEN = self._nlp.vocab[EOF_TOKEN_STR].orth
	
	if self.corpus_path is None: # leaving for testing ... this should already be set if build has been initiated in standard way via build_from_csv, build_from_files or whatever other methods are implemented to handle build/imports in future
		self._init_build_process(save_path)
	
	logger.memory_usage('init', init=True)

	start_time = time.time()

	# single-element arrays appended after every document
	eof_arr = np.array([self.SPACY_EOF_TOKEN], dtype=np.uint64)
	not_doc_arr = np.array([NOT_DOC_TOKEN], dtype=np.int16)
	index_header_arr = np.array([self.SPACY_EOF_TOKEN] * INDEX_HEADER_LENGTH, dtype=np.uint64) # this is added to start and end of index to prevent out of bound issues on searches

	# each list starts with the header padding
	orth_index = [index_header_arr]
	lower_index = [index_header_arr]
	token2doc_index = [np.array([NOT_DOC_TOKEN] * len(index_header_arr), dtype=np.int32)]

	offset = INDEX_HEADER_LENGTH
	self.offsets = [] # TODO - check that this is being used  - consider removing

	# number of the next build file to write
	store_pos = 0

	# running document number, also used as the doc id in token2doc_index
	doc_order = 0
	for doc in self._nlp.pipe(iterator, batch_size = spacy_batch_size): # was previously using self._nlp.tokenizer.pipe(iterator, batch_size=batch_size): but this is faster, test other options at some point
		orth_index.append(doc.to_array(ORTH))
		orth_index.append(eof_arr)

		lower_index_tmp = doc.to_array(LOWER)
		lower_index.append(lower_index_tmp)
		lower_index.append(eof_arr)

		token2doc_index.append(np.array([doc_order] * len(lower_index_tmp), dtype=np.int32))
		token2doc_index.append(not_doc_arr)

		# start position of this document; the + 1 accounts for the EOF marker
		self.offsets.append(offset) 
		offset = offset + len(lower_index_tmp) + 1
		doc_order += 1

		# update store every build_process_batch_size docs
		if doc_order % build_process_batch_size == 0:
			#was based on condition build_process_path is not None before disk-based build process
			store_pos = self._update_build_process(orth_index, lower_index, token2doc_index, store_pos)
			# start fresh lists so already-written data can be released
			lower_index, orth_index, token2doc_index = [], [], []
			logger.memory_usage(f'processed {doc_order} documents')
			
	del iterator
	# footer padding, mirroring the header
	orth_index.append(index_header_arr)
	lower_index.append(index_header_arr)
	token2doc_index.append(np.array([NOT_DOC_TOKEN] * len(index_header_arr), dtype=np.int32))

	logger.memory_usage(f'Completing build process')
	if save_path is not None:
		# write whatever remains (at least the footer) and finish the disk-based build
		store_pos = self._update_build_process(orth_index, lower_index, token2doc_index, store_pos)
		self._complete_build_process(build_process_cleanup = build_process_cleanup)
	else:
		# depreciated - leaving for now
		# NOTE(review): this branch does not set punct_token_count or space_token_count,
		# which save_corpus_metadata reads below - confirm whether this path is still usable
		self._create_indices(orth_index, lower_index, token2doc_index)
		self.document_count = len(self.offsets)

		self.token_count = self.lower_index.shape[0] - self.document_count - len(index_header_arr) - len(index_header_arr) 
		self.unique_tokens = len(self.frequency_lookup)

		self.word_token_count = self.token_count - len(self.punct_positions) - len(self.space_positions)
		self.unique_word_tokens = len(self.frequency_lookup) - len(self.punct_tokens) - len(self.space_tokens)

	del orth_index
	del lower_index
	del token2doc_index
	
	logger.memory_usage(f'Completed build process')

	# save corpus metadata
	self.save_corpus_metadata()

	logger.info(f'Build time: {(time.time() - start_time):.3f} seconds')


In [None]:
#| exporti
@patch
def _prepare_files(self: Corpus, 
					source_path: str, # path to folder with text files, path can be a directory, zip or tar/tar.gz file
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf8' # encoding of text files
					):
	"""Prepare text files and metadata for building a corpus. Returns an iterator to get file text for processing.

	This is a generator: nothing runs (including the checks below and the write of
	metadata.parquet to corpus_path) until the first text is requested.

	Raises FileNotFoundError if source_path is not a directory, zip or tar file, if no
	files match file_mask, or if metadata_file is given but does not exist."""

	import fnmatch
	import tarfile
	import zipfile

	# allowing import from zip and tar files
	if os.path.isdir(source_path):
		files = glob.glob(os.path.join(source_path, file_mask))
		source_type = 'folder'
	elif os.path.isfile(source_path) and source_path.endswith('.zip'):
		with zipfile.ZipFile(source_path, 'r') as z:
			files = [f for f in z.namelist() if fnmatch.fnmatch(f, file_mask)]
		source_type = 'zip'
	elif os.path.isfile(source_path) and source_path.endswith(('.tar', '.tar.gz')):
		with tarfile.open(source_path, 'r') as t:
			files = [f for f in t.getnames() if fnmatch.fnmatch(f, file_mask)]
		source_type = 'tar'
	else:
		# also covers a path that does not exist, which previously fell through
		# to an unbound-variable error instead of a clear message
		raise FileNotFoundError(f"Path '{source_path}' is not a directory, zip or tar file")
	
	if not files:
		raise FileNotFoundError(f"No files matching {file_mask} found in '{source_path}'")

	# one metadata row per file, in the same order the texts are yielded below
	metadata = pl.LazyFrame({metadata_file_column: [os.path.basename(p) for p in files]})

	if metadata_file:
		if not os.path.isfile(metadata_file):
			raise FileNotFoundError(f"Metadata file '{metadata_file}' not found")

		# de-duplicate while keeping a stable column order (a set has no defined order)
		selected_columns = list(dict.fromkeys([metadata_file_column] + metadata_columns))

		# ordering metadata based on order of files so token data and metadata aligned;
		# maintain_order makes the join keep the left (file) order explicitly
		metadata = metadata.join(pl.scan_csv(metadata_file).select(selected_columns), on=metadata_file_column, how='left', maintain_order='left')
	
	metadata.sink_parquet(f'{self.corpus_path}/metadata.parquet')

	self.source_path = source_path

	if source_type == 'folder':
		for p in files:
			# read and close the file before handing the text to the consumer
			with open(p, "rb") as text_file:
				text = text_file.read().decode(encoding)
			yield text
	elif source_type == 'zip':
		with zipfile.ZipFile(source_path, 'r') as z:
			for f in files:
				yield z.read(f).decode(encoding)
	elif source_type == 'tar':
		with tarfile.open(source_path, 'r') as t:
			for f in files:
				yield t.extractfile(f).read().decode(encoding)
	


In [None]:
#| export
@patch
def build_from_files(self: Corpus,
					source_path: str, # path to folder with text files 
					save_path: str, # path to save corpus
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf-8', # encoding of text files
					model:str='en_core_web_sm', # spacy model to use for tokenisation
					spacy_batch_size:int=1000, # batch size for spacy tokenizer
					#build_process_path:str=None, # path to save an in-progress build to disk to reduce memory usage
					build_process_batch_size:int=5000, # save in-progress build to disk every n docs
					build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
					):
	"""Build a corpus from text files, reading the texts via _prepare_files and passing them to build. Returns the corpus itself."""
	
	start_time = time.time()

	# corpus_path must exist before the file iterator writes metadata into it
	self._init_build_process(save_path)

	iterator = self._prepare_files(
		source_path,
		file_mask,
		metadata_file,
		metadata_file_column,
		metadata_columns,
		encoding,
	)

	# options passed straight through to build
	build_options = {
		'model': model,
		'spacy_batch_size': spacy_batch_size,
		'build_process_batch_size': build_process_batch_size,
		'build_process_cleanup': build_process_cleanup,
	}
	self.build(save_path = save_path, iterator = iterator, **build_options)

	logger.info(f'Build from files time: {(time.time() - start_time):.3f} seconds')

	return self


In [None]:
# #| hide
# test = Corpus('test')
# texts = []
# for text in test._prepare_files('../test-corpora/source/toy', file_mask='*1.txt'):
# 	texts.append(text)
# assert len(texts) == 1
# assert texts[0] == 'The cat sat on the mat.'

# texts = []
# for text in test._prepare_files('../test-corpora/source/toy', file_mask='*.txt', metadata_file='../test-corpora/source/toy.csv', metadata_file_column = 'source', metadata_columns=['category']):
# 	texts.append(text)

# assert len(texts) == 6
# assert 'The cat sat on the mat.' in texts
# assert test.metadata.shape[0] == 6
# assert test.metadata.columns == ['source', 'category']

# cat_sat_index = texts.index('The cat sat on the mat.') 
# assert test.metadata['source'][cat_sat_index] == '1.txt'
# assert test.metadata['category'][cat_sat_index] == 'feline'

# del test

In [None]:
#| export
@patch
def _prepare_csv(self: Corpus, 
					source_path:str, # path to csv file
					text_column:str='text', # column in csv with text
					metadata_columns:list[str]=[], # list of column names to import from csv
					encoding:str='utf8', # encoding of csv passed to Polars read_csv, see their documentation
					build_process_batch_size:int=5000 # save in-progress build to disk every n rows
					) -> iter: # iterator to return rows for processing
	"""Generator over the text column of a CSV; also writes the selected metadata columns to metadata.parquet in corpus_path."""

	if not os.path.isfile(source_path):
		raise FileNotFoundError(f'Path ({source_path}) is not a file')

	# text column first, so it is always position 0 in each row
	selected_columns = [text_column] + metadata_columns
	csv_frame = pl.scan_csv(source_path, encoding = encoding).select(selected_columns)

	self.source_path = source_path

	metadata_frame = csv_frame.select(metadata_columns)
	metadata_frame.sink_parquet(f'{self.corpus_path}/metadata.parquet')

	collected = csv_frame.collect(engine='streaming')
	for batch in collected.iter_slices(n_rows=build_process_batch_size):
		yield from (row[0] for row in batch.iter_rows())

In [None]:
#| export
@patch
def build_from_csv(self: Corpus, 
				   source_path:str, # path to csv file
				   save_path: str, # path to save corpus
				   text_column:str='text', # column in csv with text
				   metadata_columns:list[str]=[], # list of column names to import from csv
				   encoding:str='utf8', # encoding of csv passed to Polars read_csv, see their documentation
				   model:str='en_core_web_sm', # spacy model to use for tokenisation
				   spacy_batch_size:int=1000, # batch size for Spacy tokenizer
				   #build_process_path:str=None, # path to save an in-progress build to disk to reduce memory usage
				   build_process_batch_size:int=5000, # save in-progress build to disk every n docs
				   build_process_cleanup:bool = True # Remove the build files after build is complete, retained for development and testing purposes
				   ):
	"""Build a corpus from a csv file, reading the texts via _prepare_csv and passing them to build. Returns the corpus itself."""
	
	start_time = time.time()

	# corpus_path must exist before the csv iterator writes metadata into it
	self._init_build_process(save_path)

	iterator = self._prepare_csv(
		source_path = source_path,
		text_column = text_column,
		metadata_columns = metadata_columns,
		encoding = encoding,
		build_process_batch_size = build_process_batch_size,
	)

	# options passed straight through to build
	build_options = {
		'model': model,
		'spacy_batch_size': spacy_batch_size,
		'build_process_batch_size': build_process_batch_size,
		'build_process_cleanup': build_process_cleanup,
	}
	self.build(save_path = save_path, iterator = iterator, **build_options)

	logger.info(f'Build from csv time: {(time.time() - start_time):.3f} seconds')

	return self


In [None]:
# #| hide
# test = Corpus('test')
# texts = []

# for text in test._prepare_csv('../test-corpora/source/toy.csv', text_column='text', metadata_columns=['source', 'category']):
# 	texts.append(text)

# assert len(texts) == 6
# cat_sat_index = 0
# assert texts[cat_sat_index] == 'The cat sat on the mat.'
# assert test.metadata.shape[0] == 6
# assert test.metadata.columns == ['source', 'category']
# assert test.metadata['source'][cat_sat_index] == '1.txt'
# assert test.metadata['category'][cat_sat_index] == 'feline'
# del test

In [None]:
#| hide
# locations of the test corpora: raw sources and built corpora
source_path, save_path = '../test-corpora/source/', '../test-corpora/saved/'

## Load a corpus

In [None]:
#| export
@patch
def load(self: Corpus, 
		 corpus_path: str # path to load corpus
		 ):
	""" Load a saved corpus from `corpus_path` and initialise the spaCy model it was built with. Returns the corpus itself. """

	timer_start = time.time()

	if not os.path.isdir(corpus_path):
		raise FileNotFoundError(f"Path '{corpus_path}' is not a directory")

	# every one of these files must be present for the directory to be loaded
	expected_files = ['corpus.json', 'vocab.parquet', 'tokens.parquet', 'puncts.parquet', 'spaces.parquet']
	missing_files = [name for name in expected_files if not os.path.isfile(os.path.join(corpus_path, name))]
	if len(missing_files) > 0:
		raise FileNotFoundError(f"Path '{corpus_path}' does not contain all expected files: {expected_files}")

	self.corpus_path = corpus_path

	# decode corpus.json, validating it against the CorpusMetadata schema
	with open(f'{self.corpus_path}/corpus.json', 'rb') as metadata_file:
		raw_json = metadata_file.read()
	metadata = msgspec.json.decode(raw_json, type=CorpusMetadata)

	# copy each metadata field onto the corpus instance
	for field_name in metadata.__slots__:
		setattr(self, field_name, getattr(metadata, field_name))

	self._init_spacy_model(self.SPACY_MODEL, version = self.SPACY_MODEL_VERSION)

	logger.info(f'Load time: {(time.time() - timer_start):.3f} seconds')

	return self

In [None]:
#| hide
#| eval: false
# corpora used for testing, keyed by the base name of the source file in source_path
corpora = {}
corpora['toy'] = {'name': 'Toy Corpus', 'slug': 'toy', 'description': 'Toy corpus for testing', 'extension': '.csv.gz'}
corpora['brown'] = {'name': 'Brown Corpus', 'slug': 'brown', 'description': 'A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html', 'extension': '.csv.gz'}
corpora['reuters'] = {'name': 'Reuters Corpus', 'slug': 'reuters', 'description': 'From NLTK TODO', 'extension': '.csv.gz'}
corpora['gutenberg'] = {'name': 'Gutenberg Corpus', 'slug': 'gutenberg', 'description': 'From NLTK TODO', 'extension': '.csv.gz'}
corpora['garden-party-corpus'] = {'name': 'Garden Party Corpus', 'slug': 'garden-party', 'description': 'https://github.com/ucdh/scraping-garden-party', 'extension': '.zip'}

set_logger_state('verbose')
for corpus_name, corpus_details in corpora.items():
	logger.info(f'Loading {corpus_name} corpus')
	try:
		corpus = Corpus().load(f"{save_path}{corpus_details['slug']}.corpus")
	except FileNotFoundError:
		# not built yet: build it from its source file, named from the dict key plus the declared extension
		source_file = f'{source_path}{corpus_name}{corpus_details["extension"]}'
		if 'csv' in corpus_details['extension']:
			corpus = Corpus(name = corpus_details['name'], description = corpus_details['description']).build_from_csv(source_path = source_file, text_column='text', metadata_columns=['source'], save_path = save_path)
		else:
			corpus = Corpus(name = corpus_details['name'], description = corpus_details['description']).build_from_files(source_path = source_file, save_path = save_path)
	# any other exception propagates unchanged
	del corpus
set_logger_state('quiet')


2025-05-27 14:34:12 - INFO - <module> - Loading toy corpus
2025-05-27 14:34:12 - INFO - load - Load time: 0.228 seconds
2025-05-27 14:34:12 - INFO - <module> - Loading brown corpus
2025-05-27 14:34:12 - INFO - load - Load time: 0.281 seconds
2025-05-27 14:34:12 - INFO - <module> - Loading reuters corpus
2025-05-27 14:34:12 - INFO - load - Load time: 0.207 seconds
2025-05-27 14:34:12 - INFO - <module> - Loading gutenberg corpus
2025-05-27 14:34:13 - INFO - load - Load time: 0.200 seconds
2025-05-27 14:34:13 - INFO - <module> - Loading garden-party-corpus corpus
2025-05-27 14:34:13 - INFO - load - Load time: 0.196 seconds


In [None]:
#| hide
#| eval: false
set_logger_state('verbose')
# TODO - add tests for build and save and load
# force a fresh build of the toy corpus: remove any previously saved copy, then build.
# (a load attempt here could never succeed as the saved corpus has just been removed)
toy_corpus_path = f'{save_path}toy.corpus'
if os.path.isdir(toy_corpus_path):
	shutil.rmtree(toy_corpus_path)

toy = Corpus(name = corpora['toy']['name'], description = corpora['toy']['description']).build_from_csv(f'{source_path}toy.csv.gz', save_path = save_path, text_column='text', metadata_columns=['source'])
del toy
set_logger_state('quiet')

2025-05-27 14:34:13 - INFO - memory_usage - init, memory usage: 1335.890625 MB
2025-05-27 14:34:13 - INFO - memory_usage - Completing build process, memory usage: 1337.81640625 MB, difference: 1.92578125 MB
2025-05-27 14:34:13 - INFO - memory_usage - init, memory usage: 1337.81640625 MB
2025-05-27 14:34:13 - INFO - memory_usage - got input length 244, memory usage: 1339.75390625 MB, difference: 1.9375 MB
2025-05-27 14:34:13 - INFO - memory_usage - collected vocab, memory usage: 1339.75390625 MB, difference: 0.0 MB
2025-05-27 14:34:13 - INFO - memory_usage - freed up combined_df and input_df, memory usage: 1339.75390625 MB, difference: 0.0 MB
2025-05-27 14:34:13 - INFO - memory_usage - got vocab strings, memory usage: 1340.50390625 MB, difference: 0.75 MB
2025-05-27 14:34:13 - INFO - memory_usage - added vocab strings, memory usage: 1340.50390625 MB, difference: 0.0 MB
2025-05-27 14:34:13 - INFO - memory_usage - got punct tokens, memory usage: 1341.25390625 MB, difference: 0.75 MB
2025-

In [None]:
#| hide
#| eval: false
set_logger_state('verbose')
# TODO - add tests for build and save and load
# force a fresh build of the brown corpus: remove any previously saved copy, then build.
# (a load attempt here could never succeed as the saved corpus has just been removed)
brown_corpus_path = f'{save_path}brown.corpus'
if os.path.isdir(brown_corpus_path):
	shutil.rmtree(brown_corpus_path)

brown = Corpus(name = corpora['brown']['name'], description = corpora['brown']['description']).build_from_csv(f'{source_path}brown.csv.gz', save_path = save_path, text_column='text', metadata_columns=['source'])
del brown
set_logger_state('quiet')

2025-05-27 14:34:13 - INFO - memory_usage - init, memory usage: 1370.765625 MB
2025-05-27 14:34:15 - INFO - memory_usage - Completing build process, memory usage: 1360.5859375 MB, difference: -10.1796875 MB
2025-05-27 14:34:15 - INFO - memory_usage - init, memory usage: 1360.5859375 MB
2025-05-27 14:34:15 - INFO - memory_usage - got input length 1141605, memory usage: 1360.4609375 MB, difference: -0.125 MB
2025-05-27 14:34:15 - INFO - memory_usage - collected vocab, memory usage: 1360.4609375 MB, difference: 0.0 MB
2025-05-27 14:34:15 - INFO - memory_usage - freed up combined_df and input_df, memory usage: 1360.4609375 MB, difference: 0.0 MB
2025-05-27 14:34:16 - INFO - memory_usage - got vocab strings, memory usage: 1359.11328125 MB, difference: -1.34765625 MB
2025-05-27 14:34:16 - INFO - memory_usage - added vocab strings, memory usage: 1359.11328125 MB, difference: 0.0 MB
2025-05-27 14:34:16 - INFO - memory_usage - got punct tokens, memory usage: 1354.328125 MB, difference: -4.78515

## List available corpora

In [None]:
#| export
def list_corpora(
		path: str # directory to scan for corpora
		) -> pl.DataFrame: # Dataframe with path, corpus, corpus name, document count, token count
	""" Scan `path` for corpora (sub-directories containing a corpus.json) and tabulate their key details. """

	summary_fields = ['name', 'document_count', 'token_count', 'date_created']
	available_corpora = {'path': path, 'corpus': [], 'name': [], 'date_created': [], 'document_count': [], 'token_count': []}
	for entry in os.listdir(path):
		is_corpus_dir = os.path.isdir(os.path.join(path, entry)) and os.path.isfile( os.path.join(path, entry, 'corpus.json'))
		if not is_corpus_dir:
			continue

		with open(os.path.join(path, entry, 'corpus.json'), 'rb') as metadata_file:
			metadata = msgspec.json.decode(metadata_file.read(), type=CorpusMetadata)

		available_corpora['corpus'].append(entry)
		for field in summary_fields:
			value = getattr(metadata, field)
			# integer counts are shown with thousands separators
			available_corpora[field].append(f'{value:,}' if isinstance(value, int) else value)

	return pl.DataFrame(available_corpora)

In [None]:
print(list_corpora(save_path))

┌────────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────────┬─────────────┐
│ path                   ┆ corpus              ┆ name                ┆ date_created        ┆ document_count ┆ token_count │
╞════════════════════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════════╪═════════════╡
│ ../test-corpora/saved/ ┆ gutenberg.corpus    ┆ Gutenberg Corpus    ┆ 2025-05-27 14:30:06 ┆ 18             ┆ 2,777,046   │
│ ../test-corpora/saved/ ┆ garden-party.corpus ┆ Garden Party Corpus ┆ 2025-05-27 14:30:06 ┆ 15             ┆ 79,940      │
│ ../test-corpora/saved/ ┆ brown.corpus        ┆ Brown Corpus        ┆ 2025-05-27 14:34:16 ┆ 500            ┆ 1,140,905   │
│ ../test-corpora/saved/ ┆ toy.corpus          ┆ Toy Corpus          ┆ 2025-05-27 14:34:13 ┆ 6              ┆ 38          │
│ ../test-corpora/saved/ ┆ reuters.corpus      ┆ Reuters Corpus      ┆ 2025-05-27 14:29:56 ┆ 10,788         ┆ 1,726,826   │
└───────

## Information about the corpus

In [None]:
#| export
@patch
def info(self: Corpus, 
		 include_disk_usage:bool = False, # include information of size on disk in output
		 formatted:bool = True # return formatted output
		 ) -> pl.DataFrame: # table of corpus attributes and their values
	""" Return information about the corpus as a two-column (Attribute, Value) dataframe. 

	Integer values are rendered with thousands separators. With `include_disk_usage`, 
	the size in MB of each corpus file found on disk is appended. With `formatted`, 
	attribute names are title-cased with underscores replaced by spaces. 
	"""
	
	attributes = ['name', 'description', 'date_created', 'conc_version', 'corpus_path', 'document_count', 'token_count', 'word_token_count', 'unique_tokens', 'unique_word_tokens']
	result = []
	for attr in attributes:
		value = getattr(self, attr)
		if isinstance(value, bool):
			# bool is a subclass of int, so it has to be handled before the int branch
			result.append('True' if value else 'False')
		elif isinstance(value, int):
			result.append(f'{value:,}')
		else:
			result.append(str(value))

	if include_disk_usage:
		files = {'corpus.json': 'Corpus Metadata', 'metadata.parquet': 'Document Metadata', 'tokens.parquet': 'Tokens', 'vocab.parquet': 'Vocab', 'puncts.parquet': 'Punctuation positions', 'spaces.parquet': 'Space positions'}
		for file, file_descriptor in files.items():
			file_path = f'{self.corpus_path}/{file}'
			# skip files that are not on disk rather than failing - metadata.parquet, 
			# for example, is not among the files load() requires
			if not os.path.isfile(file_path):
				continue
			size = os.path.getsize(file_path)
			attributes.append(file_descriptor + ' (MB)')
			result.append(f'{size/1024/1024:.3f}')

	# maybe add in status of these: 'results_cache', 'ngram_index', 'frequency_table'
	# size = sys.getsizeof(getattr(self, attr))
	
	if formatted:
		attributes = [attr.replace('_', ' ').title() for attr in attributes]

	return pl.DataFrame({'Attribute': attributes, 'Value': result})



In [None]:
#| export
@patch
def summary(self: Corpus, 
			include_memory_usage:bool = False, # legacy name for include_disk_usage, kept for backward compatibility
			include_disk_usage:bool = False # include size on disk of the corpus files in output
			):
	""" Print information about the corpus in a formatted table. """
	# the flag is forwarded as info's include_disk_usage - despite its name, 
	# include_memory_usage has always reported disk usage, so either flag enables it
	corpus_info = self.info(include_disk_usage = bool(include_disk_usage or include_memory_usage))
	result = Result('summary', corpus_info, 'Corpus Summary', '', {}, [])
	result.display()

In [None]:
#| exporti
@patch
def __str__(self: Corpus):
	""" String form of the corpus: the table returned by `info`, rendered as text. """

	corpus_info = self.info()
	return str(corpus_info)



In [None]:
#| exporti
@patch
def _index_name(self: Corpus, index):
	"""Look up the name of a spaCy attribute from its ID in `spacy.attrs.IDS`."""

	attr_names = list(spacy.attrs.IDS.keys())
	attr_ids = list(spacy.attrs.IDS.values())
	# keys and values are listed in the same order, so the position of the ID gives the name
	return attr_names[attr_ids.index(index)]

You can get summary information on your corpus, including the number of documents, the token count and the number of unique tokens as a dataframe using the `info` method. You can also just print the corpus itself.

In [None]:
brown = Corpus().load(f'{save_path}/brown.corpus')
print(brown) # equivalent to print(brown.info())

┌────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ Attribute          ┆ Value                                                                                                                                                                                                                                              │
╞════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Name               ┆ Brown Corpus                                                                                                                                                                 

The `info` method can also provide information on the disk usage of the corpus by setting the `include_disk_usage` parameter to `True`. 

In [None]:
print(brown.info(include_disk_usage=True))

┌────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ Attribute                  ┆ Value                                                                                                                                                                                                                                              │
╞════════════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Name                       ┆ Brown Corpus                                                                                                                                 

You can get the same information in a nicer format by using the `summary` method.

In [None]:
brown.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html"
Date Created,2025-05-27 14:34:16
Conc Version,0.0.1
Corpus Path,../test-corpora/saved//brown.corpus
Document Count,500
Token Count,1140905
Word Token Count,980144
Unique Tokens,42937
Unique Word Tokens,42907


## Anatomy of a corpus

A Conc corpus is a directory containing specific files as follows:

```
corpus-name.corpus/
	README.md - Human readable information about the corpus to aid distribution
	corpus.json - Machine readable information about the corpus, including name, description, various summary statistics, and models used to build the corpus
	vocab.parquet - A table mapping token strings to token IDs and frequency information
	tokens.parquet - A table with indices based on token positions used to query the corpus with tokens represented by numeric IDs
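	metadata.parquet - A table with metadata for each document (if there is any)
	puncts.parquet - A table with the positions of punctuation tokens in the corpus
	spaces.parquet - A table with the positions of space tokens in the corpus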
```

Note: by default the library creates a directory with the `.corpus` suffix. This is not necessary, but this makes corpora on your filesystem easier to find or identify.

To distribute a corpus, send a zip of the directory for others to extract or just share the directory as-is.

### README.md

Below is an example of the README.md file generated by Conc.

In [None]:
#| hide
from IPython.display import Markdown, display

In [None]:
#| echo: true
with open(f'{brown.corpus_path}/README.md', 'rb') as f:
    markdown = '<div class="alert alert-block alert-success">\n\n' + f.read().decode('utf-8') + '\n'
    markdown = markdown.replace('\n#', '\n##') # making headings smaller for display
    markdown += '</div>'
    display(Markdown(markdown))

<div class="alert alert-block alert-success">

## Brown Corpus

A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html

### About

This directory contains a corpus created by [Conc](https://github.com/polsci/conc) (version 0.0.1) on 2025-05-27 14:34:16. 

It was created using spaCy model: en_core_web_sm (version 3.8.0)

### Corpus Information

Document count: 500  
Token count: 1140905  
Word token count: 980144  
Unique tokens: 42937  
Unique word tokens: 42907

### Using this corpus

Conc can be installed with pip using:  

Documentation with tutorials to get you started is available at https://geoffford.nz/conc 

### Cite Conc

If you use Conc in your work, please cite it as follows:
</div>

### corpus.json file

Below is the schema for the `corpus.json` file.

In [None]:
#| echo: true
properties = msgspec.json.schema(CorpusMetadata)['$defs']['CorpusMetadata']['properties']
display(properties)

{'name': {'type': 'string'},
 'description': {'type': 'string'},
 'slug': {'type': 'string'},
 'conc_version': {'type': 'string'},
 'document_count': {'type': 'integer'},
 'token_count': {'type': 'integer'},
 'word_token_count': {'type': 'integer'},
 'punct_token_count': {'type': 'integer'},
 'space_token_count': {'type': 'integer'},
 'unique_tokens': {'type': 'integer'},
 'unique_word_tokens': {'type': 'integer'},
 'date_created': {'type': 'string'},
 'EOF_TOKEN': {'type': 'integer'},
 'SPACY_EOF_TOKEN': {'type': 'integer'},
 'SPACY_MODEL': {'type': 'string'},
 'SPACY_MODEL_VERSION': {'type': 'string'},
 'punct_tokens': {'type': 'array', 'items': {'type': 'integer'}},
 'space_tokens': {'type': 'array', 'items': {'type': 'integer'}}}

### vocab.parquet

In [None]:
#| echo: true
display(pl.scan_parquet(f'{brown.corpus_path}/vocab.parquet').filter(pl.col('frequency_lower') > 0).sort(by = pl.col('frequency_lower'), descending = True).head(5).collect(engine='streaming'))

token_id,source_id,token,frequency_lower,frequency_orth,is_punct,is_space
22848,7425985699627899538,"""the""",63516,62473,False,False
8128,2593208677638477497,""",""",58331,58331,True,False
38309,12646065887601541794,""".""",49907,49907,True,False
2739,886050111519832510,"""of""",36321,36122,False,False
7126,2283656566040971221,"""and""",27787,27633,False,False


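Frequencies are stored for each word form. `frequency_orth` is the count of the token exactly as written, while `frequency_lower` is the count of the token ignoring case. In the example below, `frequency_lower` is recorded against the lowercase form only: `the` has a `frequency_lower` of 63,516, which is the sum of the `frequency_orth` values for `the` (62,473) and `The` (1,043), and `The` has no `frequency_lower` value.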

In [None]:
#| echo: true
display(pl.scan_parquet(f'{brown.corpus_path}/vocab.parquet').filter(pl.col('token').str.to_lowercase() == 'the').head(5).collect(engine='streaming'))

token_id,source_id,token,frequency_lower,frequency_orth,is_punct,is_space
15682,5059648917813135842,"""The""",,1043,False,False
22848,7425985699627899538,"""the""",63516.0,62473,False,False


### tokens.parquet

In [None]:
#| echo: true
pl.scan_parquet(f'{brown.corpus_path}/tokens.parquet').with_row_index('position').filter(pl.col('position').is_between(99, 107)).collect(engine='streaming')

position,orth_index,lower_index,token2doc_index
99,46333,46333,-1
100,27276,27276,0
101,15682,22848,0
102,4361,41672,0
103,14610,29725,0
104,54713,49998,0
105,45742,19078,0
106,53250,53250,0
107,8699,35796,0


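In the table above, `orth_index` and `lower_index` hold the token ID of each token as written and in lowercase respectively. A `token2doc_index` of -1 marks a position that is not part of any document, such as the end-of-file token at position 99; tokens within a document carry that document's index (0 here). The next cell joins `orth_index` to the token IDs in `vocab.parquet` to show the token string at each position.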

In [None]:
#| echo: true
pl.scan_parquet(f'{brown.corpus_path}/tokens.parquet').with_row_index('position').filter(pl.col('position').is_between(99, 107)).join(
    pl.scan_parquet(f'{brown.corpus_path}/vocab.parquet').select(pl.col('token_id'), pl.col('token')),
    left_on='orth_index', right_on='token_id', how='left', maintain_order='left').collect(engine='streaming')

position,orth_index,lower_index,token2doc_index,token
99,46333,46333,-1,""" conc-end-of-file-token"""
100,27276,27276,0,""" 	"""
101,15682,22848,0,"""The"""
102,4361,41672,0,"""Fulton"""
103,14610,29725,0,"""County"""
104,54713,49998,0,"""Grand"""
105,45742,19078,0,"""Jury"""
106,53250,53250,0,"""said"""
107,8699,35796,0,"""Friday"""


### spaces.parquet and puncts.parquet

The format of spaces.parquet and puncts.parquet is the same. Each table contains one field, namely `position`, which indexes the position of punctuation or space tokens in the corpus. Here are the first three rows of a `puncts.parquet` file:

In [None]:
#| echo: true
pl.scan_parquet(f'{brown.corpus_path}/puncts.parquet').head(3).collect(engine='streaming')

position
117
118
121


### metadata.parquet

The `metadata.parquet` file should not be confused with the metadata of the corpus itself, which is accessible in `corpus.json`.

If populated, the `metadata.parquet` file contains metadata for each document in the corpus. 

In [None]:
#| echo: true
corpus = Corpus().load(f'{save_path}/us-congressional-speeches-subset-10k.corpus')
display(pl.scan_parquet(f'{corpus.corpus_path}/metadata.parquet').head(3).collect(engine='streaming'))

speech_id,date,speaker,chamber,state
530182158,"""1895-01-10T00:00:00.000000""","""Mr. COCKRELL""","""S""","""Unknown"""
890274849,"""1966-08-31T00:00:00.000000""","""Mr. LONG of Louisiana""","""S""","""Louisiana"""
880088363,"""1963-09-11T00:00:00.000000""","""Mr. FULBRIGHT""","""S""","""Unknown"""


For corpora created from files, there will always be a field for the source file at the time of creation. This is in the same order as documents are represented in the `tokens.parquet` file.

In [None]:
#| echo: true
corpus = Corpus().load(f'{save_path}/garden-party.corpus')
display(pl.scan_parquet(f'{corpus.corpus_path}/metadata.parquet').head(3).collect(engine='streaming'))

file
"""an-ideal-family.txt"""
"""at-the-bay.txt"""
"""bank-holiday.txt"""


## Working with tokens

In [None]:
#| exporti
@patch
def _init_frequency_table(self: Corpus):
	""" Build the frequency table for the corpus on first use; does nothing if it already exists. """
	# TODO work out case sensitivity issues - currently if do token lookup for The - not there
	if self.frequency_table is not None:
		return

	# note: don't sort this - leave in order of token_id - sorts can be done when required
	start_time = time.time()
	self.frequency_table = pl.DataFrame({'token_id': list(self.frequency_lookup.keys()), 'frequency': list(self.frequency_lookup.values())})  

	# attach the token string for each token_id
	vocab_table = pl.DataFrame({'token_id': list(self.vocab.keys()), 'token': list(self.vocab.values())})
	self.frequency_table = self.frequency_table.join(vocab_table, on='token_id', how='left')

	# flag punctuation and space tokens
	token_ids = self.frequency_table['token_id']
	is_punct = token_ids.is_in(self.punct_tokens).alias('is_punct')
	is_space = token_ids.is_in(self.space_tokens).alias('is_space')
	self.frequency_table = self.frequency_table.with_columns(is_punct).with_columns(is_space)

	# add a 1-based row index named 'rank'
	self.frequency_table = self.frequency_table.with_row_index(name='rank', offset=1)
	logger.info(f'Frequency table created in {(time.time() - start_time):.3f} seconds')

In [None]:
#| exporti
@patch
def _mask_from_positions(self: Corpus, 
						 positions # positions to create mask from
						 ):
	""" Build a boolean mask with the shape of `lower_index` that is True at `positions` and False elsewhere. """
	mask = np.full(self.lower_index.shape, False, dtype=bool)
	mask[positions] = True
	return mask

In [None]:
#| exporti
@patch
def _init_tokens_array(self: Corpus):
	""" Cache the vocab's token strings as a numpy array in results_cache['tokens_array'] (built once). """
	if 'tokens_array' in self.results_cache:
		return

	start_time = time.time()
	token_strings = list(self.vocab.values())
	self.results_cache['tokens_array'] = np.array(token_strings)
	logger.info(f'Create tokens_array in {(time.time() - start_time):.3f} seconds')

In [None]:
#| exporti
@patch
def _init_tokens_sort_order(self: Corpus):
	""" Cache, for each entry of the tokens array, its rank when the tokens are sorted by lowercased string (built once). """
	if 'tokens_sort_order' in self.results_cache:
		return

	self._init_tokens_array()
	# lowercase first so that the ordering ignores case
	lowered_tokens = np.strings.lower(self.results_cache['tokens_array'])
	# argsort gives the sorted order; the argsort of that gives each token's rank within it
	sorted_positions = np.argsort(lowered_tokens)
	self.results_cache['tokens_sort_order'] = np.argsort(sorted_positions)

In [None]:
#| export
@patch
# TODO maybe convert to using tokens_array rather than frequency_table
def token_to_id(self: Corpus, 
				token: str # token to get id for
				) -> int|bool: # return token id or False if not found in the corpus
	""" Look up the id of a token string in the frequency table (exact string match). """

	self._init_frequency_table()
	matching_ids = self.frequency_table.filter(pl.col('token') == token)['token_id']
	if matching_ids.shape[0] > 0:
		# first match is returned
		return matching_ids[0]
	return False

Get the ID of the token 'dog' like this:

In [None]:
brown.token_to_id('dog')

23289

In [None]:
#| export
@patch
def token_ids_to_tokens(self: Corpus, 
						token_ids: np.ndarray|list # token ids to retrieve as tokens
						) -> np.ndarray: # return token strings for token ids
	""" Convert token ids to their token strings. """ 

	self._init_tokens_array()
	# lists are converted so that numpy array indexing can be used for the lookup
	ids = np.array(token_ids) if isinstance(token_ids, list) else token_ids
	tokens_array = self.results_cache['tokens_array']
	return tokens_array[ids]

Internally, conc uses Numpy vector operations where possible. A list or numpy array of Token IDs can be converted to a numpy array of token strings like this:

In [None]:
token_ids = [23288, 24576, 47803]
brown.token_ids_to_tokens(token_ids)

array(['acid', '395,000', 'mckinney'], dtype='<U30')

In [None]:
#| export
@patch
def token_ids_to_sort_order(self: Corpus, 
							token_ids: np.ndarray # token ids to get rank 
							) -> np.ndarray: # rank of token ids
	""" Get the sort-order rank of token ids, i.e. each token's position when the tokens are sorted by lowercased token string. """
	self._init_tokens_sort_order()	

	sort_order = self.results_cache['tokens_sort_order']
	return sort_order[token_ids]

In [None]:
test_token_ids = [
brown.token_to_id('the'),
brown.token_to_id('dog'),
brown.token_to_id('went'),
]

print(test_token_ids)
print(brown.token_ids_to_tokens(test_token_ids))
print(brown.token_ids_to_sort_order(test_token_ids))


[22848, 23289, 18808]
['the' 'dog' 'went']
[50087 15848 54497]


In [None]:
#| export
@patch
def frequency_of(self: Corpus, 
				 token:str|int # token id or string to get frequency for
				 ) -> int|bool: # return frequency of token or False if not found
	""" Get the frequency of a specific token, given as a token string or a token id. """
	# TODO - make work with case insensitive tokens

	start_time = time.time()
	self._init_frequency_table()
	
	if isinstance(token, str):
		token = self.token_to_id(token)
		# identity check: a valid token id of 0 compares equal to False with ==
		if token is False:
			return False

	frequency = int(self.frequency_lookup[token]) if token in self.frequency_lookup else False

	logger.info(f'Token frequency retrieval time: {(time.time() - start_time):.5f} seconds')

	return frequency

In [None]:
token = 'go'
token_id = brown.token_to_id(token)
print(f'Token [id={token_id}, {token}] occurs {brown.frequency_of(token_id)} times.')
print(f'Token [{token}] occurs {brown.frequency_of(token)} times.')

Token [id=24577, go] occurs 625 times.
Token [go] occurs 625 times.


In [None]:
#| hide

# lower_without_punct = test.lower_index[~(test._mask_from_positions(test.punct_positions))]
# lower_without_space = test.lower_index[~(test._mask_from_positions(test.space_positions))]
# lower_without_space_punct = test.lower_index[~(test._mask_from_positions(test.space_positions) | test._mask_from_positions(test.punct_positions))]


## Tokenization

In [None]:
#| export
@patch
def tokenize(self: Corpus, 
			 string:str, # string to tokenize 
			 return_doc = False, # return doc object
			 simple_indexing = False # use simple indexing
             ): # return tokenized string
	""" Tokenize a string using the Spacy tokenizer. 

	Returns a tuple `(token_sequences, index_id)`, with the spaCy doc appended as a 
	third item when `return_doc` is set. `token_sequences` is a list with one tuple 
	of reindexed token ids per tokenized string and `index_id` is the spaCy attribute 
	used for the ids (currently always LOWER).

	Only `simple_indexing` is implemented; NotImplementedError is raised otherwise. 
	"""
	# TODO implement case insensitive tokenization
	# TODO implement wildcard search and multiple strings

	start_time = time.time()
	if not simple_indexing:
		# a plain string cannot be raised (that is itself a TypeError), so raise a real exception
		raise NotImplementedError('only simple_indexing implemented')

	index_id = LOWER
	strings_to_tokenize = [string.strip()]
	# TODO rework - draft of the non-simple indexing, retained for reference
	# placeholder_string = 'zzxxzzplaceholderzzxxzz' # so doesn't split tokens
	# is_wildcard_search = False
	# if '*' in string:
	# 	is_wildcard_search = True
	# 	string = string.replace('*',placeholder_string)
	# if string.islower() == True:
	# 	index_id = LOWER
	# else:
	# 	index_id = ORTH
	# if '|' in string:
	# 	strings_to_tokenize = string.split('|')
	# else:
	# 	strings_to_tokenize = [string.strip()]
	token_sequences = []
	for doc in self._nlp.tokenizer.pipe(strings_to_tokenize):
		token_sequences.append(tuple(doc.to_array(index_id)))
	# if is_wildcard_search == True:
	# 	tmp_token_sequence = []
	# 	sequence_count = 1
	# 	for token in doc:
	# 		tmp_token_sequence.append([])
	# 		if placeholder_string in token.text:
	# 			chunked_string = token.text.split(placeholder_string)
	# 			if len(chunked_string) > 2 or (len(chunked_string) == 2 and chunked_string[0] != '' and chunked_string[1] != ''):
	# 				# use regex
	# 				approach = 'regex'
	# 				regex = re.compile('.*'.join(chunked_string))
	# 			elif chunked_string[0] == '':
	# 				approach = 'endswith'
	# 			else:
	# 				approach = 'startswith'
	# 			for token_id in loaded_corpora[corpus_name]['frequency_lookup']:
	# 				possible_word = False
	# 				word = loaded_corpora[corpus_name]['vocab'][token_id]
	# 				if approach == 'regex':
	# 					if regex.match(word):
	# 						possible_word = word
	# 				elif getattr(word,approach)(''.join(chunked_string)):
	# 					possible_word = word
	# 				if possible_word != False:
	# 					tmp_token_sequence[token.i].append(loaded_corpora[corpus_name]['vocab'][possible_word])
	# 		else:
	# 			tmp_token_sequence[token.i].append(token.orth)
	# 		sequence_count *= len(tmp_token_sequence[token.i])
	# 	rotated_token_sequence = []
	# 	token_repeat = sequence_count
	# 	for pos in range(len(tmp_token_sequence)):
	# 		rotated_token_sequence.append([])
	# 		if len(tmp_token_sequence[pos]) == 1:
	# 			rotated_token_sequence[pos] += sequence_count * [tmp_token_sequence[pos][0]]
	# 		else:
	# 			token_repeat = token_repeat // len(tmp_token_sequence[pos])
	# 			while len(rotated_token_sequence[pos]) < sequence_count:
	# 				for token in tmp_token_sequence[pos]:
	# 					rotated_token_sequence[pos] += token_repeat * [token]
	# 	token_sequences = list(zip(*rotated_token_sequence))
	# 	#for tokens in tmp_token_sequence:
	# 	#    for token in tokens:
	# convert token_sequences to reindexed tokens using original_to_new
	token_sequences = [tuple([self.original_to_new[token] for token in sequence]) for sequence in token_sequences] # TODO - check as may not be portable
	logger.info(f'Tokenization time: {(time.time() - start_time):.5f} seconds')
	if return_doc:
		# doc is the spaCy doc of the (single) string that was tokenized
		return token_sequences, index_id, doc
	return token_sequences, index_id

In [None]:
token_str = 'dog'
brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)

print(brown_token_sequence, brown._index_name(brown_index_id))

## Find positions of tokens

In [None]:
#| export
@patch
def get_token_index(self: Corpus, 
					token_sequence: list[np.ndarray], # token sequence to get index for 
					index_id: int # index to search (i.e. ORTH, LOWER)
					) -> list[np.ndarray]: # single-item list holding the array of start positions of the token sequence
	""" Get the positions of a token sequence in the corpus. 

	`token_sequence` holds one or more variants (sequences of token ids, all of the 
	same length). A position is returned when the tokens starting there match every 
	token of at least one variant. The orth index is searched when `index_id` is ORTH, 
	otherwise the lower index is searched.
	"""
	
	#TODO - refactor token_sequence?
	start_time = time.time()

	sequence_len = len(token_sequence[0])
	index = 'orth_index' if index_id == ORTH else 'lower_index'

	ngram_key = (index, sequence_len)
	if ngram_key not in self.ngram_index:
		# TODO adjust so not just lower below - so need a var to pass to this function with whether islower
		token_index = getattr(self, index)
		# column k holds the token k places after each position, so each row is an n-gram
		self.ngram_index[ngram_key] = np.vstack([np.roll(token_index, -shift) for shift in range(sequence_len)]).T
	ngrams = self.ngram_index[ngram_key]

	# each variant is matched as a whole row, then variants are OR-ed together; matching 
	# column by column across variants would also accept mixtures of different variants
	matches = np.zeros(ngrams.shape[0], dtype=bool)
	for seq in token_sequence:
		matches |= np.all(ngrams == seq, axis=1)

	# np.roll wraps around, so n-grams starting in the last sequence_len - 1 positions 
	# continue from the start of the corpus and are not real sequences
	if sequence_len > 1:
		matches[-(sequence_len - 1):] = False

	results = [np.where(matches)[0]]

	logger.info(f'Token indexing ({len(results[0])}) time: {(time.time() - start_time):.5f} seconds')
	return results

In [None]:
token_str = 'dog'
# tokenize returns the token sequences and the id of the index they relate to; both are passed on to get_token_index
brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)
brown_token_index = brown.get_token_index(brown_token_sequence, brown_index_id)
print(brown_token_index)

[array([  18944,   18981,   18992,   19062,   19069,   37777,   89076,
        125511,  137608,  138261,  138296,  138305,  138349,  144502,
        189104,  249691,  249831,  250054,  250067,  250093,  250161,
        250187,  250247,  250275,  250386,  251335,  251354,  251414,
        251473,  251505,  251559,  251569,  251894,  253602,  254562,
        256120,  256224,  256397,  331441,  360984,  439241,  439245,
        439300,  439305,  464727,  464756,  464778,  522492,  649908,
        695780,  695829,  695989,  696181,  696460,  696839,  696916,
        697014,  863902,  863909,  865540,  865558,  877577,  877619,
        877706,  889653,  997085, 1014338, 1030313, 1052840, 1052849,
       1054274, 1077178, 1087042, 1088300, 1088332, 1088919, 1107306,
       1130649, 1139762])]


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()