# corpus

> Create a conc corpus.

In [None]:
#| default_exp corpus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
# requirements - numpy pandas polars spacy nltk great_tables
# dev requirements - nbdev, jupyterlab, memory_profiler
# TODO check

import re
import polars as pl
import numpy as np
from great_tables import GT
import os
import glob
import spacy
from spacy.attrs import ORTH, LOWER # TODO - add ENT_TYPE, ENT_IOB?
import sys
import pickle
import string
from fastcore.basics import patch
import time


In [None]:
#| export
from conc import __version__
from conc.core import logger, set_logger_state, PAGE_SIZE, EOF_TOKEN_STR
from conc.result import Result


In [None]:
#| exporti
# Display settings for Polars tables: hide the column dtype row and the shape line, show up to 50 rows.
# Only the value returned by the final call is kept in polars_conf.
pl.Config.set_tbl_hide_column_data_types(True)
pl.Config.set_tbl_hide_dataframe_shape(True)
polars_conf = pl.Config.set_tbl_rows(50)

In [None]:
#| exporti
# Matches any run of whitespace (spaces, tabs, newlines).
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
# Matches a string that is exactly one punctuation/symbol character, i.e. a single character
# that is neither whitespace nor a word character (\w already covers digits).
# A "^" inside a character class only negates when it comes first; the extra carets in the
# previous pattern were literal and wrongly excluded "^" itself from matching.
_RE_PUNCT = re.compile(r"^[^\s\w]$")

In [None]:
#| export
# first release will support english and spacy as a backend to parse the text - support for other languages and backends will come later.
try:
	nlp = spacy.load("en_core_web_sm")
except OSError:
	# spaCy raises OSError when the model package cannot be found. Log the hint and re-raise:
	# the rest of the module needs `nlp`, so swallowing the error would only postpone the
	# failure to a less helpful NameError.
	logger.error('Error loading model en_core_web_sm. You probably need to run python -m spacy download en_core_web_sm to download the model.')
	raise
	


In [None]:
#| export
# spaCy id of the end-of-file marker string; build() appends one after every document and uses it for the index padding.
EOF_TOKEN = nlp.vocab[EOF_TOKEN_STR].orth # starts with space so eof_token can't match anything from corpus
# Value stored in token2doc_index for positions that hold padding or end-of-file markers rather than document tokens.
NOT_DOC_TOKEN = -1
# Number of EOF tokens build() adds at the start and at the end of each index to prevent out of bound issues on searches.
INDEX_HEADER_LENGTH = 100

## Corpus class

In [None]:
#| export
class Corpus:
	"""Represention of text corpus, with methods to build, load and save a corpus from a variety of formats and to work with the corpus data."""
	
	def __init__(self, 
				name: str|None = None, # name of corpus
				description: str|None = None # description of corpus
				):
		# information about corpus
		self.name = name
		self.description = description

		# conc version that built the corpus
		self.conc_version = None
		
		# paths
		self.corpus_path = None
		self.source_path = None

		# settings
		self.EOF_TOKEN = None

		# special token ids
		self.punct_tokens = None
		self.space_tokens = None

		# metadata for corpus
		self.document_count = None
		self.token_count = None
		self.unique_tokens = None

		self.word_token_count = None
		self.unique_word_tokens = None

		# token data
		self.orth_index = None
		self.lower_index = None

		# lookup mapping doc_id to every token in doc
		self.token2doc_index = None

		# lookups to get token string or frequency 
		self.vocab = None
		self.frequency_lookup = None

		# offsets for each document in token data
		self.offsets = None

		# punct and space positions in token data
		self.punct_positions = None
		self.space_positions = None

		# metadata for each document
		self.metadata = []

		# lookups to get spacy tokenizer or internal ids
		self.original_to_new = None
		self.new_to_original = None
		
		# temporary data used when processing text, not saved to disk permanently on save
		self.frequency_table = None
		self.ngram_index = {}
		self.results_cache = {}


## Build, load and save a corpus

In [None]:
#| exporti
@patch
def _process_punct_positions(self: Corpus):
	""" Record which token ids are punctuation and where they occur in the token data. """
	# a token is treated as punctuation when nothing remains after stripping punctuation characters
	punct_ids = [token_id for token_id, token_str in self.vocab.items() if token_str.strip(string.punctuation) == '']
	self.punct_tokens = np.array(punct_ids)
	# isin builds a boolean mask over lower_index (faster than where); only the positions are kept as they are smaller to store
	self.punct_positions = np.nonzero(np.isin(self.lower_index, self.punct_tokens))[0]

# spaCy keeps whitespace tokens in the vocab so that tokenisation is non-destructive. Their positions are stored
# so they can be filtered out for processing and analysis.

@patch
def _process_space_positions(self: Corpus):
	""" Record which token ids are whitespace and where they occur in the token data. """
	# a token is treated as whitespace when nothing remains after stripping whitespace
	space_ids = [token_id for token_id, token_str in self.vocab.items() if token_str.strip() == '']
	self.space_tokens = np.array(space_ids)
	# isin builds a boolean mask over lower_index (faster than where); only the positions are kept as they are smaller to store
	self.space_positions = np.nonzero(np.isin(self.lower_index, self.space_tokens))[0]


In [None]:
#| exporti
@patch
def _create_indices(self: Corpus, 
				   orth_index: list[np.ndarray], 
				   lower_index: list[np.ndarray], 
				   token2doc_index: list[np.ndarray]
				   ):
	""" Create the internal representation of the corpus for faster analysis and efficient representation on disk. """

	# map every spaCy token id found in either index onto a position in the sorted list of unique ids
	spacy_ids, compact = np.unique(np.concatenate(orth_index + lower_index), return_inverse=True)
	# put a dummy entry at position 0 and shift everything up, so 0 is never used as a token id
	spacy_ids = np.insert(spacy_ids, 0, 0)
	compact += 1
	compact_ids = np.arange(len(spacy_ids), dtype=np.uint32)
	self.original_to_new = dict(zip(spacy_ids, compact_ids))
	self.new_to_original = dict(zip(compact_ids, spacy_ids))

	# the orth arrays were concatenated before the lower arrays, so the first half is orth and the second half is lower
	orth_half, lower_half = np.split(compact, 2)
	self.orth_index = np.array(orth_half, dtype=np.uint32)
	self.lower_index = np.array(lower_half, dtype=np.uint32)
	del compact, orth_half, lower_half

	# token strings keyed by spaCy id, with the dummy entry given an obvious marker string
	spacy_strings = {spacy_id: nlp.vocab.strings[spacy_id] for spacy_id in spacy_ids}
	spacy_strings[0] = 'ERROR: not a token'

	# the same strings keyed by the corpus's own ids
	self.vocab = {compact_id: spacy_strings[self.new_to_original[compact_id]] for compact_id in compact_ids}

	self.EOF_TOKEN = self.original_to_new[EOF_TOKEN]

	self._process_punct_positions()
	self._process_space_positions()

	# frequencies are counted on the lowercased index; the end-of-file marker is not a real token so it is dropped
	lower_ids, lower_counts = np.unique(self.lower_index, return_counts=True)
	self.frequency_lookup = dict(zip(lower_ids, lower_counts))
	del self.frequency_lookup[self.EOF_TOKEN]

	self.token2doc_index = np.concatenate(token2doc_index)

In [None]:
#| export
@patch
def load(self: Corpus, 
		 corpus_path: str # path to load corpus
		 ):
	""" Load corpus from disk. Raises FileNotFoundError if `corpus_path` is not a file. Returns the corpus itself so calls can be chained. """

	start_time = time.time()
	if not os.path.isfile(corpus_path):
		raise FileNotFoundError(f"Path '{corpus_path}' is not a file")

	# np.load keeps the .npz archive open until it is closed, so read everything inside a context manager
	with np.load(corpus_path) as npz:
		# NOTE: the non-array attributes are stored as a pickle - only load corpus files from a trusted source
		data = pickle.loads(npz['corpus'])
		for k, v in data.items():
			setattr(self, k, v)

		# token data and special-token lookups are stored as separate arrays in the archive
		for array_name in ('orth_index', 'lower_index', 'token2doc_index', 'offsets',
						   'punct_tokens', 'space_tokens', 'punct_positions', 'space_positions'):
			setattr(self, array_name, npz[array_name])

	self.corpus_path = corpus_path
	logger.info(f'Load time: {(time.time() - start_time):.3f} seconds')

	return self

In [None]:
#| export
@patch
def save(self: Corpus, 
		 corpus_path: str # path to save corpus
		 ):
	""" Save corpus to disk, creating the target directory if needed. """
	
	start_time = time.time()
	# a bare filename has no directory part, and os.makedirs('') raises - only create a directory when there is one
	target_dir = os.path.dirname(corpus_path)
	if target_dir:
		os.makedirs(target_dir, exist_ok=True)
	self.corpus_path = corpus_path

	# non-array attributes are pickled into a single entry; temporary data (frequency_table, ngram_index, results_cache) is not saved
	pickled_attributes = ['metadata', 'vocab', 'frequency_lookup', 'original_to_new', 'new_to_original', 'document_count', 'token_count', 'unique_tokens', 'word_token_count', 'unique_word_tokens', 'source_path', 'name', 'description', 'conc_version', 'EOF_TOKEN']
	corpus_bytes = pickle.dumps({k: getattr(self, k) for k in pickled_attributes})
	with open(corpus_path, 'wb') as f:
		np.savez_compressed(f, corpus=corpus_bytes, orth_index=self.orth_index, lower_index=self.lower_index, token2doc_index=self.token2doc_index, offsets=self.offsets, punct_tokens=self.punct_tokens, space_tokens=self.space_tokens, punct_positions=self.punct_positions, space_positions=self.space_positions)
	logger.info(f'Save time: {(time.time() - start_time):.3f} seconds')

In [None]:
#| export
@patch
def build(self: Corpus, 
		  iterator: iter, # iterator of texts
		  batch_size:int=1000 # batch size for spacy tokenizer
		  ):
	"""Build a corpus from an iterator of texts. Each text becomes one document, numbered in iteration order."""
	
	# get from library
	self.conc_version = __version__

	start_time = time.time()
	eof_arr = np.array([EOF_TOKEN], dtype=np.uint64)
	# same dtype as the other token2doc_index pieces
	not_doc_arr = np.array([NOT_DOC_TOKEN], dtype=np.int32)
	index_header_arr = np.array([EOF_TOKEN] * INDEX_HEADER_LENGTH, dtype=np.uint64) # this is added to start and end of index to prevent out of bound issues on searches
	doc_header_arr = np.full(INDEX_HEADER_LENGTH, NOT_DOC_TOKEN, dtype=np.int32) # matching padding for token2doc_index

	orth_index = [index_header_arr]
	lower_index = [index_header_arr]
	token2doc_index = [doc_header_arr]

	offset = INDEX_HEADER_LENGTH
	self.offsets = [] # TODO - check that this is being used  - consider removing

	doc_order = 0
	for doc in nlp.tokenizer.pipe(iterator, batch_size=batch_size): # test varying this TODO
		#TODO  - as corpus size increases memory requirements will increase - consider buffering orth_index, lower_index, token2doc_index and writing to disk periodically
		orth_index.append(doc.to_array(ORTH))
		orth_index.append(eof_arr)

		lower_index_tmp = doc.to_array(LOWER)
		lower_index.append(lower_index_tmp)
		lower_index.append(eof_arr)

		# every token of the document maps to its document number; the trailing EOF marker maps to NOT_DOC_TOKEN
		token2doc_index.append(np.full(len(lower_index_tmp), doc_order, dtype=np.int32))
		token2doc_index.append(not_doc_arr)

		# the document starts at the current offset; the +1 accounts for its EOF marker
		self.offsets.append(offset) 
		offset = offset + len(lower_index_tmp) + 1
		doc_order += 1

	orth_index.append(index_header_arr)
	lower_index.append(index_header_arr)
	token2doc_index.append(doc_header_arr)

	self._create_indices(orth_index, lower_index, token2doc_index)

	self.document_count = len(self.offsets)
	# adjusting for text breaks and headers at start and end of index
	self.token_count = self.lower_index.shape[0] - self.document_count - len(index_header_arr) - len(index_header_arr) 
	self.unique_tokens = len(self.frequency_lookup)

	# start from token_count (not the raw index length) so padding and EOF markers are not counted as words
	self.word_token_count = self.token_count - len(self.punct_positions) - len(self.space_positions)
	self.unique_word_tokens = len(self.frequency_lookup) - len(self.punct_tokens) - len(self.space_tokens)

	del orth_index
	del lower_index
	del token2doc_index

	logger.info(f'Build time: {(time.time() - start_time):.3f} seconds')


In [None]:
#| exporti
@patch
def _prepare_files(self: Corpus, 
					source_path: str, # path to folder with text files 
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf8' # encoding of text files
					):
	"""Prepare text files and metadata for building a corpus. Returns an iterator to get file text for processing.

	`source_path` may be a folder, a .zip file or a .tar/.tar.gz file. This is a generator, so nothing
	(including the checks and setting `self.metadata` / `self.source_path`) happens until iteration starts.
	Raises FileNotFoundError if the source is not usable, no files match, or the metadata file is missing.
	"""
	import fnmatch
	import tarfile
	import zipfile

	# allowing import from zip and tar files
	if os.path.isdir(source_path):
		source_type = 'folder'
		files = glob.glob(os.path.join(source_path, file_mask))
	elif os.path.isfile(source_path) and source_path.endswith('.zip'):
		source_type = 'zip'
		with zipfile.ZipFile(source_path, 'r') as z:
			files = [f for f in z.namelist() if fnmatch.fnmatch(f, file_mask)]
	elif os.path.isfile(source_path) and source_path.endswith(('.tar', '.tar.gz')):
		source_type = 'tar'
		with tarfile.open(source_path, 'r') as t:
			files = [f for f in t.getnames() if fnmatch.fnmatch(f, file_mask)]
	else:
		# also reached when the path does not exist at all
		raise FileNotFoundError(f"Path '{source_path}' is not a directory, zip or tar file")
	
	if not files:
		raise FileNotFoundError(f"No files matching {file_mask} found in '{source_path}'")

	order = pl.DataFrame({metadata_file_column: [os.path.basename(p) for p in files]})

	if metadata_file:
		if not os.path.isfile(metadata_file):
			raise FileNotFoundError(f"Metadata file '{metadata_file}' not found")
		# de-duplicate the column names while keeping the requested order (a set would lose the order)
		columns = list(dict.fromkeys([metadata_file_column] + list(metadata_columns)))

		# ordering metadata based on order of files so token data and metadata aligned
		metadata = pl.read_csv(metadata_file).select(columns)
		self.metadata = order.join(metadata, on=metadata_file_column, how='left')
	else:
		self.metadata = order

	self.source_path = source_path

	if source_type == 'folder':
		for p in files:
			# read and close each file before yielding so no handle stays open while the consumer works
			with open(p, "rb") as fh:
				raw = fh.read()
			yield raw.decode(encoding)
	elif source_type == 'zip':
		with zipfile.ZipFile(source_path, 'r') as z:
			for f in files:
				yield z.read(f).decode(encoding)
	elif source_type == 'tar':
		with tarfile.open(source_path, 'r') as t:
			for f in files:
				yield t.extractfile(f).read().decode(encoding)
	


In [None]:
#| export
@patch
def build_from_files(self: Corpus,
					source_path: str, # path to folder with text files 
					file_mask:str='*.txt', # mask to select files 
					metadata_file: str|None=None, # path to a CSV with metadata
					metadata_file_column:str = 'file', # column in metadata file with file names to align texts with metadata
					metadata_columns:list[str]=[], # list of column names to import from metadata
					encoding:str='utf-8', # encoding of text files
					batch_size:int=1000 # batch size for spacy tokenizer
					):
	"""Build a corpus from text files in a folder. Returns the corpus itself so calls can be chained."""
	
	began = time.time()
	# _prepare_files yields the text of each file and sets up the metadata as it is consumed by build
	texts = self._prepare_files(
		source_path,
		file_mask,
		metadata_file,
		metadata_file_column,
		metadata_columns,
		encoding,
	)
	self.build(texts, batch_size)
	elapsed = time.time() - began
	logger.info(f'Build from files time: {elapsed:.3f} seconds')

	return self


In [None]:
#| hide
test = Corpus('test')

# a mask matching a single file yields just that file's text
texts = list(test._prepare_files('../test-corpora/source/toy', file_mask='*1.txt'))
assert len(texts) == 1
assert texts[0] == 'The cat sat on the mat.'

# all files, with metadata aligned to the files via the 'source' column
texts = list(test._prepare_files('../test-corpora/source/toy', file_mask='*.txt', metadata_file='../test-corpora/source/toy.csv', metadata_file_column = 'source', metadata_columns=['category']))

assert len(texts) == 6
assert 'The cat sat on the mat.' in texts
assert test.metadata.shape[0] == 6
assert test.metadata.columns == ['source', 'category']

# the metadata row at the same position as the text must describe that text
cat_sat_index = texts.index('The cat sat on the mat.') 
assert test.metadata['source'][cat_sat_index] == '1.txt'
assert test.metadata['category'][cat_sat_index] == 'feline'

del test

In [None]:
#| export
@patch
def _prepare_csv(self: Corpus, 
					source_path:str, # path to csv file
					text_column:str='text', # column in csv with text
					metadata_columns:list[str]=[], # list of column names to import from csv
					encoding:str='utf8' # encoding of csv passed to Polars read_csv, see their documentation
					) -> iter: # iterator to return rows for processing
	"""Prepare to import from CSV, including metadata. Returns an iterator over the values of the text column."""

	if not os.path.isfile(source_path):
		raise FileNotFoundError(f'Path ({source_path}) is not a file')

	# a missing column surfaces as Polars' ColumnNotFoundError, which is left to propagate
	wanted_columns = [text_column] + metadata_columns
	df = pl.read_csv(source_path, encoding = encoding).select(wanted_columns)

	self.source_path = source_path
	self.metadata = df.select(metadata_columns)

	# the text column was selected first, so it is the first value of every row
	yield from (row[0] for row in df.iter_rows())

In [None]:
#| export
@patch
def build_from_csv(self: Corpus, 
				   source_path:str, # path to csv file
				   text_column:str='text', # column in csv with text
				   metadata_columns:list[str]=[], # list of column names to import from csv
				   encoding:str='utf8', # encoding of csv passed to Polars read_csv, see their documentation
				   batch_size:int=1000 # batch size for Spacy tokenizer
				   ):
	"""Build a corpus from a csv file. Returns the corpus itself so calls can be chained."""
	
	began = time.time()
	# _prepare_csv yields the text of each row and sets up the metadata as it is consumed by build
	texts = self._prepare_csv(source_path, text_column, metadata_columns, encoding)
	self.build(texts, batch_size)
	elapsed = time.time() - began
	logger.info(f'Build from csv time: {elapsed:.3f} seconds')

	return self


In [None]:
#| hide
test = Corpus('test')

# texts come back in row order, with the metadata columns stored on the corpus
texts = list(test._prepare_csv('../test-corpora/source/toy.csv', text_column='text', metadata_columns=['source', 'category']))

assert len(texts) == 6
cat_sat_index = 0
assert texts[cat_sat_index] == 'The cat sat on the mat.'
assert test.metadata.shape[0] == 6
assert test.metadata.columns == ['source', 'category']
assert test.metadata['source'][cat_sat_index] == '1.txt'
assert test.metadata['category'][cat_sat_index] == 'feline'
del test

In [None]:
#| hide
# Folders (relative to this notebook) with the source data and the saved corpora used by the cells below.
# Both end with a slash, so file names can be appended directly.
source_path = '../test-corpora/source/'
save_path = '../test-corpora/saved/'

In [None]:
#| hide
#| eval: false
# Corpora used in the documentation and tests: display name, description and the extension of the source file.
corpora = {
	'toy': {'name': 'Toy Corpus', 'description': 'Toy corpus for testing', 'extension': '.csv.gz'},
	'brown': {'name': 'Brown Corpus', 'description': 'A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html', 'extension': '.csv.gz'},
	'reuters': {'name': 'Reuters Corpus', 'description': 'From NLTK TODO', 'extension': '.csv.gz'},
	'gutenberg': {'name': 'Gutenberg Corpus', 'description': 'From NLTK TODO', 'extension': '.csv.gz'},
	'garden-party-corpus': {'name': 'Garden Party Corpus', 'description': 'https://github.com/ucdh/scraping-garden-party', 'extension': '.zip'},
}

set_logger_state('verbose')
for corpus_name, corpus_details in corpora.items():
	saved_corpus = f'{save_path}{corpus_name}.corpus'
	try:
		# reuse a previously saved corpus when there is one
		corpus = Corpus().load(saved_corpus)
	except FileNotFoundError:
		# otherwise build it from its source (csv or archive of text files) and save it for next time
		new_corpus = Corpus(name = corpus_details['name'], description = corpus_details['description'])
		if 'csv' in corpus_details['extension']:
			corpus = new_corpus.build_from_csv(f'{source_path}{corpus_name}.csv.gz', text_column='text', metadata_columns=['source'])
		else:
			corpus = new_corpus.build_from_files(f'{source_path}{corpus_name}{corpus_details["extension"]}')
		corpus.save(saved_corpus)
set_logger_state('quiet')


2025-03-11 10:58:57 - INFO - load - Load time: 0.002 seconds
2025-03-11 10:58:57 - INFO - load - Load time: 0.113 seconds
2025-03-11 10:58:57 - INFO - load - Load time: 0.132 seconds
2025-03-11 10:58:57 - INFO - load - Load time: 0.149 seconds
2025-03-11 10:58:57 - INFO - build - Build time: 0.260 seconds
2025-03-11 10:58:57 - INFO - build_from_files - Build from files time: 0.260 seconds
2025-03-11 10:58:57 - INFO - save - Save time: 0.080 seconds


In [None]:
# TODO - add tests for build and save and load

# Load the saved toy corpus, building and saving it first if it has not been saved yet.
try:
	toy = Corpus().load(f'{save_path}/toy.corpus')
except FileNotFoundError:
	# the result must be bound to `toy` (later cells use it) and the source is the gzipped csv
	toy = Corpus(name = corpora['toy']['name'], description = corpora['toy']['description']).build_from_csv(f'{source_path}toy.csv.gz', text_column='text', metadata_columns=['source'])
	toy.save(f'{save_path}/toy.corpus')

In [None]:
# TODO - add tests for build and save and load

# Load the saved Brown corpus, building and saving it first if it has not been saved yet.
try:
	brown = Corpus().load(f'{save_path}/brown.corpus')
except FileNotFoundError:
	# the source is the gzipped csv; a path without the extension is not a file and could never be built
	brown = Corpus(name = corpora['brown']['name'], description = corpora['brown']['description']).build_from_csv(f'{source_path}brown.csv.gz', text_column='text', metadata_columns=['source'])
	brown.save(f'{save_path}/brown.corpus')

## Information about the corpus

In [None]:
#| export
@patch
def info(self: Corpus, 
		 include_memory_usage:bool = False, # include memory usage in output
		 formatted:bool = True # return formatted output
		 ) -> pl.DataFrame: # table with an Attribute and a Value column describing the corpus
	""" Return information about the corpus as a dataframe with one row per attribute. """
	
	attributes = ['name', 'description', 'conc_version', 'corpus_path', 'source_path', 'document_count', 'token_count', 'unique_tokens', 'word_token_count', 'unique_word_tokens']
	result = []
	for attr in attributes:
		value = getattr(self, attr)
		# counts get thousands separators; bool is a subclass of int, so it is excluded and shown as True/False
		if isinstance(value, int) and not isinstance(value, bool):
			result.append(f'{value:,}')
		else:
			result.append(str(value))

	if include_memory_usage:
		size_attributes = ['orth_index', 'lower_index', 'token2doc_index', 'vocab', 'frequency_lookup', 'offsets', 'metadata', 'original_to_new', 'new_to_original', 'results_cache', 'ngram_index', 'frequency_table']
		for attr in size_attributes:
			# NOTE: sys.getsizeof is shallow - for dicts and lists it does not include the objects they contain
			size = sys.getsizeof(getattr(self, attr))
			attributes.append(attr + ' (MB)')
			result.append(f'{size/1024/1024:.3f}')
	
	if formatted:
		# turn attribute names into display labels, e.g. token_count -> Token Count
		attributes = [attr.replace('_', ' ').title() for attr in attributes]

	return pl.DataFrame({'Attribute': attributes, 'Value': result})



In [None]:
#| export
@patch
def summary(self: Corpus, 
			include_memory_usage:bool = False # include memory usage in output
			):
	""" Display the corpus information as a formatted table. """
	info_table = self.info(include_memory_usage)
	Result('summary', info_table, 'Corpus Summary', '', {}, []).display()

In [None]:
#| exporti
@patch
def __str__(self: Corpus):
	""" String form of the corpus: the table returned by info(). """
	
	info_table = self.info()
	return str(info_table)



In [None]:
#| exporti
@patch
def _index_name(self: Corpus, index):
	"""Get the spaCy attribute name (e.g. ORTH, LOWER) for an attribute id."""

	# spacy.attrs.IDS maps names to ids; find the position of the id, then take the name at that position
	attr_names = list(spacy.attrs.IDS.keys())
	attr_ids = list(spacy.attrs.IDS.values())
	return attr_names[attr_ids.index(index)]

You can get summary information on your corpus, including the number of documents, the token count and the number of unique tokens as a dataframe using the `info` method. You can also just print the corpus itself.

In [None]:
# print() uses Corpus.__str__, which renders the table returned by info()
print(brown)

┌────────────────────┬─────────────────────────────────┐
│ Attribute          ┆ Value                           │
╞════════════════════╪═════════════════════════════════╡
│ Name               ┆ Brown Corpus                    │
│ Description        ┆ A Standard Corpus of Present-D… │
│ Conc Version       ┆ 0.0.1                           │
│ Corpus Path        ┆ ../test-corpora/saved//brown.c… │
│ Source Path        ┆ ../test-corpora/source/brown.c… │
│ Document Count     ┆ 500                             │
│ Token Count        ┆ 1,140,905                       │
│ Unique Tokens      ┆ 42,937                          │
│ Word Token Count   ┆ 980,844                         │
│ Unique Word Tokens ┆ 42,907                          │
└────────────────────┴─────────────────────────────────┘


In [None]:
#| hide
# same summary table for the toy corpus
print(toy)

┌────────────────────┬─────────────────────────────────┐
│ Attribute          ┆ Value                           │
╞════════════════════╪═════════════════════════════════╡
│ Name               ┆ Toy Corpus                      │
│ Description        ┆ Toy corpus for testing          │
│ Conc Version       ┆ 0.0.1                           │
│ Corpus Path        ┆ ../test-corpora/saved//toy.cor… │
│ Source Path        ┆ ../test-corpora/source/toy.csv… │
│ Document Count     ┆ 6                               │
│ Token Count        ┆ 38                              │
│ Unique Tokens      ┆ 15                              │
│ Word Token Count   ┆ 238                             │
│ Unique Word Tokens ┆ 14                              │
└────────────────────┴─────────────────────────────────┘


You can get the same information in a nicer format by using the `summary` method.

In [None]:
# displays the info() table through a Result object, titled 'Corpus Summary'
brown.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html"
Conc Version,0.0.1
Corpus Path,../test-corpora/saved//brown.corpus
Source Path,../test-corpora/source/brown.csv.gz
Document Count,500
Token Count,1140905
Unique Tokens,42937
Word Token Count,980844
Unique Word Tokens,42907


## Anatomy of a corpus

Explain the various indices.

## Working with tokens

In [None]:
#| exporti
@patch
def _init_frequency_table(self: Corpus):
	""" Prepare the frequency table for the corpus. """
	# TODO work out case sensitivity issues - currently if do token lookup for The - not there
	if self.frequency_table is None:
		# note: don't sort this - leave in order of token_id - sorts can be done when required
		start_time = time.time()
		self.frequency_table = pl.DataFrame({'token_id': list(self.frequency_lookup.keys()), 'frequency': list(self.frequency_lookup.values())})  
		self.frequency_table = self.frequency_table.join(pl.DataFrame({'token_id': list(self.vocab.keys()), 'token': list(self.vocab.values())}), on='token_id', how='left')
		self.frequency_table = self.frequency_table.with_columns(self.frequency_table['token_id'].is_in(self.punct_tokens).alias('is_punct')).with_columns(self.frequency_table['token_id'].is_in(self.space_tokens).alias('is_space'))	
		self.frequency_table = self.frequency_table.with_row_index(name='rank', offset=1)
		logger.info(f'Frequency table created in {(time.time() - start_time):.3f} seconds')

In [None]:
#| exporti
@patch
def _mask_from_positions(self: Corpus, 
						 positions # positions to create mask from
						 ):
	""" Build a boolean mask the same shape as lower_index that is True at the given positions. """
	mask = np.zeros_like(self.lower_index, dtype=bool)
	mask[positions] = True
	return mask

In [None]:
#| exporti
@patch
def _init_tokens_array(self: Corpus):
	""" Cache the vocab's token strings as a numpy array on first use. """
	if 'tokens_array' in self.results_cache:
		return
	started = time.time()
	# values are taken in the vocab's own order
	token_strings = list(self.vocab.values())
	self.results_cache['tokens_array'] = np.array(token_strings)
	logger.info(f'Create tokens_array in {(time.time() - started):.3f} seconds')

In [None]:
#| exporti
@patch
def _init_tokens_sort_order(self: Corpus):
	""" Cache the sort position of every token (by lowercased token string) on first use. """
	if 'tokens_sort_order' in self.results_cache:
		return
	self._init_tokens_array()
	lowered = np.strings.lower(self.results_cache['tokens_array'])
	# the first argsort gives the order that sorts the tokens; argsort of that gives each token's position in the sorted order
	sorting_order = np.argsort(lowered)
	self.results_cache['tokens_sort_order'] = np.argsort(sorting_order)

In [None]:
#| export
@patch
def token_to_id(self: Corpus, 
				token: str # token to get id for
				) -> int|bool: # return token id or False if not found in the corpus
	""" Look up the id of a token string in the frequency table. """
	# TODO maybe convert to using tokens_array rather than frequency_table

	self._init_frequency_table()
	matching_ids = self.frequency_table.filter(pl.col('token') == token)['token_id']
	if matching_ids.shape[0] > 0:
		return matching_ids[0]
	return False

Get the ID of the token 'dog' like this:

In [None]:
# returns the token id, or False when the token is not in the corpus
brown.token_to_id('dog')

23289

In [None]:
#| export
@patch
def token_ids_to_tokens(self: Corpus, 
						token_ids: np.ndarray|list # token ids to retrieve as tokens
						) -> np.ndarray: # return token strings for token ids
	""" Get token strings for a list or array of token ids. """ 

	self._init_tokens_array()
	if isinstance(token_ids, list):
		# set an integer dtype explicitly: an empty list would otherwise become a float array,
		# which numpy refuses to use as an index
		token_ids = np.array(token_ids, dtype=np.intp)
	return self.results_cache['tokens_array'][token_ids]

Internally, conc uses NumPy vector operations where possible. A list or NumPy array of token IDs can be converted to a NumPy array of token strings like this:

In [None]:
# a plain list of token ids is accepted as well as a numpy array
token_ids = [23288, 24576, 47803]
brown.token_ids_to_tokens(token_ids)

array(['acid', '395,000', 'mckinney'], dtype='<U30')

In [None]:
#| export
@patch
def token_ids_to_sort_order(self: Corpus, 
							token_ids: np.ndarray|list # token ids to get sort order for
							) -> np.ndarray: # sort position of each token id
	""" Get the alphabetical sort position of token ids.

	The value for a token is its position when all token strings in the vocab are lowercased and sorted.
	It is not related to token frequency.
	"""
	self._init_tokens_sort_order()	

	return self.results_cache['tokens_sort_order'][token_ids]

In [None]:
# look up the ids of a few tokens, then convert the ids back to strings and to sort positions
test_token_ids = [brown.token_to_id(word) for word in ('the', 'dog', 'went')]

print(test_token_ids)
print(brown.token_ids_to_tokens(test_token_ids))
print(brown.token_ids_to_sort_order(test_token_ids))


[22848, 23289, 18808]
['the' 'dog' 'went']
[50087 15848 54497]


In [None]:
#| export
@patch
def frequency_of(self: Corpus, 
				 token:str|int # token id or string to get frequency for
				 ) -> int|bool: # return frequency of token or False if not found
	""" Get the frequency of a specific token, given either its string or its token id. """
	# TODO - make work with case insensitive tokens

	start_time = time.time()
	self._init_frequency_table()
	
	# isinstance is the idiomatic type check and also covers str subclasses
	if isinstance(token, str):
		token = self.token_to_id(token)
		# token_to_id signals "not found" with False; test identity so a numeric id can never be mistaken for it
		if token is False:
			return False

	logger.info(f'Token frequency retrieval time: {(time.time() - start_time):.5f} seconds')

	if token in self.frequency_lookup:
		return int(self.frequency_lookup[token])
	return False

In [None]:
# frequency_of accepts either the token id or the token string
token = 'go'
token_id = brown.token_to_id(token)
print(f'Token [id={token_id}, {token}] occurs {brown.frequency_of(token_id)} times.')
print(f'Token [{token}] occurs {brown.frequency_of(token)} times.')

Token [id=24577, go] occurs 625 times.
Token [go] occurs 625 times.


In [None]:
#| hide

# lower_without_punct = test.lower_index[~(test._mask_from_positions(test.punct_positions))]
# lower_without_space = test.lower_index[~(test._mask_from_positions(test.space_positions))]
# lower_without_space_punct = test.lower_index[~(test._mask_from_positions(test.space_positions) | test._mask_from_positions(test.punct_positions))]


## Tokenization

In [None]:
#| export
@patch
def tokenize(self: Corpus, 
			 string:str, # string to tokenize 
			 return_doc = False, # return doc object
			 simple_indexing = False # use simple indexing
			 ): # return tokenized string
	""" Tokenize a string using the Spacy tokenizer.

	Returns a list of token id sequences (tuples of corpus token ids) and the id of the spaCy attribute
	used (LOWER); when `return_doc` is set the spaCy doc is returned as a third item.
	Only `simple_indexing=True` is implemented; anything else raises NotImplementedError.
	Raises KeyError if the string contains a token that is not in the corpus.
	"""
	# TODO implement case insensitive tokenization
	# TODO implement wildcard search ('*') and multiple strings ('|'), choosing ORTH or LOWER based on the case of the query

	start_time = time.time()
	if not simple_indexing:
		# a proper exception type: raising a plain string is itself a TypeError
		raise NotImplementedError('only simple_indexing implemented')

	index_id = LOWER
	strings_to_tokenize = [string.strip()]

	token_sequences = []
	for doc in nlp.tokenizer.pipe(strings_to_tokenize):
		token_sequences.append(tuple(doc.to_array(index_id)))

	# convert token_sequences to reindexed tokens using original_to_new
	token_sequences = [tuple(self.original_to_new[token] for token in sequence) for sequence in token_sequences]
	logger.info(f'Tokenization time: {(time.time() - start_time):.5f} seconds')
	if return_doc:
		return token_sequences, index_id, doc
	return token_sequences, index_id

In [None]:
# tokenize returns the token id sequences and the id of the spaCy attribute they refer to
token_str = 'dog'
brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)

# _index_name turns the attribute id back into its name for display
print(brown_token_sequence, brown._index_name(brown_index_id))

[(np.uint32(23289),)] LOWER


## Find positions of tokens

In [None]:
#| export
@patch
def get_token_index(self: Corpus, 
					token_sequence: list[np.ndarray], # token sequence to get index for 
					index_id: int # index to search (i.e. ORTH, LOWER)
					) -> list[np.ndarray]: # single-item list with the array of start positions
	""" Get the positions of a token sequence in the corpus.

	`token_sequence` is a list of alternative sequences (variants), all assumed to have the length of the
	first one. A position is returned when the tokens starting there equal any one variant in full.
	"""
	
	#TODO - refactor token_sequence?
	start_time = time.time()

	sequence_len = len(token_sequence[0])
	index = 'orth_index' if index_id == ORTH else 'lower_index'
	cache_key = (index, sequence_len)

	if cache_key not in self.ngram_index:
		# row i holds the tokens at positions i .. i+sequence_len-1; np.roll wraps around at the end,
		# the EOF padding at both ends of the index keeps wrapped rows from matching real sequences
		tokens = getattr(self, index)
		self.ngram_index[cache_key] = np.vstack([np.roll(tokens, -shift) for shift in range(sequence_len)]).T
	ngrams = self.ngram_index[cache_key]

	# a row matches when it equals a whole variant; combining per-position matches across variants
	# would also accept rows that mix tokens from different variants
	matches = np.zeros(ngrams.shape[0], dtype=bool)
	for variant in token_sequence:
		matches |= np.all(ngrams == variant, axis=1)
	results = [np.nonzero(matches)[0]]

	logger.info(f'Token indexing ({len(results[0])}) time: {(time.time() - start_time):.5f} seconds')
	return results

In [None]:
token_str = 'dog'
# the second value returned by tokenize is the index id (e.g. LOWER), which get_token_index needs
brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)
brown_token_index = brown.get_token_index(brown_token_sequence, brown_index_id)
print(brown_token_index)

[array([  18944,   18981,   18992,   19062,   19069,   37777,   89076,
        125511,  137608,  138261,  138296,  138305,  138349,  144502,
        189104,  249691,  249831,  250054,  250067,  250093,  250161,
        250187,  250247,  250275,  250386,  251335,  251354,  251414,
        251473,  251505,  251559,  251569,  251894,  253602,  254562,
        256120,  256224,  256397,  331441,  360984,  439241,  439245,
        439300,  439305,  464727,  464756,  464778,  522492,  649908,
        695780,  695829,  695989,  696181,  696460,  696839,  696916,
        697014,  863902,  863909,  865540,  865558,  877577,  877619,
        877706,  889653,  997085, 1014338, 1030313, 1052840, 1052849,
       1054274, 1077178, 1087042, 1088300, 1088332, 1088919, 1107306,
       1130649, 1139762])]


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()