# core

> Fill in a module description here

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [None]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [None]:
#| export
# requirements - numpy pandas spacy nltk
# dev requirements - nbdev, jupyterlab, memory_profiler

import re
import pandas as pd
import polars as pl # alternative to pandas
import numpy as np
import os
import glob
#import dill as pickle
import math
import json
from collections import Counter
from distutils.dir_util import copy_tree # for combine_corpora
import spacy
from spacy.attrs import ORTH, LOWER # remove? - add ENT_TYPE, ENT_IOB
from datetime import datetime
import sys
import pickle
import gzip
import csv
import pyarrow.csv
from fastcore.basics import patch
import logging
import time


In [None]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

In [None]:
pl.Config.set_tbl_hide_column_data_types(True)
pl.Config.set_tbl_hide_dataframe_shape(True)
pl.Config.set_tbl_rows(50)

polars.config.Config

In [None]:
#| export
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
_RE_PUNCT = re.compile(r"^[^\s^\w^\d]$")

In [None]:
#| export
# check if can load model - if exception then probably not there - improve message and / or setup
try:
	nlp = spacy.load("en_core_web_sm")
except:
	print('Error loading model en_core_web_sm. You probably need to run python -m spacy download en_core_web_sm to download the model.')	
	raise

In [None]:
#| export
PAGE_SIZE = 20
EOF_TOKEN = nlp.vocab[' context-end-of-file-token'].orth # starts with space so eof_token can't match anything from corpus
NOT_DOC_TOKEN = -1
INDEX_HEADER_LENGTH = 100

In [None]:
get_sample_data = True
if get_sample_data == True:
	data = []
	data.append(['1.txt', 'The cat sat on the mat.', 'feline', 'cat'])
	data.append(['2.txt', 'The dog sat on the mat.', 'canine', 'dog'])
	data.append(['3.txt', 'The cat is meowing.', 'feline', 'cat'])
	data.append(['4.txt', 'The dog is barking.', 'canine', 'dog'])
	data.append(['5.txt', 'The cat is climbing a tree.', 'feline', 'cat'])
	data.append(['6.txt', 'The dog is digging a hole.', 'canine', 'dog'])

	os.makedirs('../test-corpora/source/toy', exist_ok=True)
	for row in data:
		with open(f'../test-corpora/source/toy/{row[0]}', 'w', encoding='utf-8') as f:
			f.write(row[1])

	df = pl.DataFrame(data, orient='row', schema=(('source', str), ('text', str), ('category', str), ('species', str)))
	df.write_csv('../test-corpora/source/toy.csv')
	df.write_csv('../test-corpora/source/toy.csv.gz')

	# print contents of toy corpus
	print('Contents of toy csv:')
	with open('../test-corpora/source/toy.csv', 'r') as f:
		print(f.read())



Contents of toy csv:
source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog



In [None]:
get_sample_data = False
if get_sample_data == True:
	# get brown corpus from nltk
	import nltk
	#nltk.download('gutenberg')
	#nltk.download('brown')
	#nltk.download('reuters')
	from nltk.corpus import gutenberg
	from nltk.corpus import reuters
	from nltk.corpus import brown
	len(reuters.fileids())

	# words just look like this ...
	#Under/in committee/nn rules/nns ,/, it/pps went/vbd automatically/rb to/in a/at subcommittee/nn for/in one/cd week/nn ./.

	# function to clean the annotation /etc 
	def clean_text(text):
		# to match words/punc that followed by /tags
		pattern = re.compile(r"(\S+)(/[^ ]+)") # match non-space followed by / and non-space
		return pattern.sub(r"\1", text)

	os.makedirs('../test-corpora/source/brown', exist_ok=True)
	for fileid in brown.fileids():
		with open(f'../test-corpora/source/brown/{fileid}.txt', 'w', encoding='utf-8') as f:
			f.write(clean_text(brown.raw(fileid)))

	os.makedirs('../test-corpora/source/gutenberg', exist_ok=True)
	for fileid in gutenberg.fileids():
		with open(f'../test-corpora/source/gutenberg/{fileid}.txt', 'w', encoding='utf-8') as f:
			f.write(clean_text(gutenberg.raw(fileid)))

	# save files to ../test-corpora/brown
	os.makedirs('../test-corpora/source/reuters', exist_ok=True)
	for fileid in reuters.fileids():
		fileid_name = fileid.split('/')[1]
		with open(f'../test-corpora/source/reuters/{fileid_name}.txt', 'w', encoding='utf-8') as f:
			f.write(clean_text(reuters.raw(fileid)))





In [None]:
create_sample_data = False
if create_sample_data == True:
	# load standard csv library
	import csv
	import gzip

	for max_i in [10000, 100000, 200000, 500000]:
		max_i_label = int(max_i / 1000)
		# create version with just first 100000 rows
		with gzip.open('../test-corpora/source/rnz.csv.gz', 'rt') as f:
			reader = csv.DictReader(f)
			with gzip.open(f'../test-corpora/source/rnz-{max_i_label}k.csv.gz', 'wt') as f_out:
				writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
				writer.writeheader()
				for i, row in enumerate(reader):
					if i > max_i - 1:
						break
					writer.writerow(row)
			print(f'Created file rnz-{max_i_label}k.csv.gz')


In [None]:
# class for a corpus with methods to load_from_files, load_from_csv
# the loaders get a path and return a generator
# class is initiated with a corpus name

#| export
class Corpus:
	"""Class for a corpus with methods to load texts from files, csv, metadata from csv, json. Class is initiated with a corpus name."""
	
	def __init__(self, name):
		# information about corpus
		self.name = name
		self.corpus_path = None
		self.source_path = None

		# settings
		self.EOF_TOKEN = None

		# metadata for corpus
		self.document_count = None
		self.token_count = None
		self.unique_tokens = None

		# token data
		self.orth_index = None
		self.lower_index = None

		# lookup mapping doc_id to every token in doc
		self.token2doc_index = None

		# lookups to get token string or frequency 
		self.vocab = None
		self.frequency_lookup = None

		# offsets for each document in token data
		self.offsets = None

		# metadata for each document
		self.metadata = []

		# lookups to get spacy tokenizer or internal ids
		self.original_to_new = None
		self.new_to_original = None
		
		# temporary data used when processing text, not saved to disk permanently on save
		self.frequency_table = None
		self.ngram_index = {}
		self.results_cache = {}

	def load_from_files(self, source_path, file_mask='*.txt', metadata_file=None, metadata_file_column = 'file', metadata_columns=[]):
		"""Load texts from files in path with file_mask."""
		# TOOD - allow input from a compressed folder

		if not os.path.isdir(source_path):
			raise FileNotFoundError(f"Path '{source_path}' is not a directory")
		files = glob.glob(os.path.join(source_path, file_mask))
		if not files:
			raise FileNotFoundError(f"No files found in '{source_path}'")

		order = pl.DataFrame({metadata_file_column: [os.path.basename(p) for p in files]})

		if metadata_file:
			if not os.path.isfile(metadata_file):
				raise FileNotFoundError(f"Metadata file '{metadata_file}' not found")
			try:
				metadata_columns = set([metadata_file_column] + metadata_columns)
				
				# ordering metadata based on order of files so token data and metadata aligned
				metadata = pl.read_csv(metadata_file).select(metadata_columns)
				self.metadata = order.join(metadata, on=metadata_file_column, how='left')
			except pl.exceptions.ColumnNotFoundError as e:
				raise
		else:
			self.metadata = order

		self.source_path = source_path

		for p in files:
			yield open(p, "rb").read().decode("utf-8")
	
	def load_from_csv(self, source_path, text_column='text', metadata_columns=[]):
		"""Load texts from csv in path with text_column."""

		# TODO - used polars instead of pandas, but loads full csv into memory - consider streaming this another way or loading in chunks

		if not os.path.isfile(source_path):
			raise FileNotFoundError(f"Path '{source_path}' is not a file")
		
		try:
			df = pl.read_csv(source_path).select([text_column] + metadata_columns)
		except pl.exceptions.ColumnNotFoundError as e:
			raise

		self.source_path = source_path
		self.metadata = df.select(metadata_columns)

		for row in df.iter_rows():
			yield row[0]
		

	def build(self, iterator, batch_size=1000):
		start_time = time.time()
		eof_arr = np.array([EOF_TOKEN], dtype=np.uint64)
		not_doc_arr = np.array([NOT_DOC_TOKEN], dtype=np.int16)
		# this is added to start and end of index to prevent out of bound issues on searches
		index_header_arr = np.array([EOF_TOKEN] * INDEX_HEADER_LENGTH, dtype=np.uint64)

		#corpus['texts'] = texts
		#corpus['docs'] = []
		orth_index = [index_header_arr]
		lower_index = [index_header_arr]
		token2doc_index = [np.array([NOT_DOC_TOKEN] * len(index_header_arr), dtype=np.int32)]

		offset = INDEX_HEADER_LENGTH
		self.offsets = [] # TODO - check that this is being used  - consider removing

		doc_order = 0
		for doc in nlp.tokenizer.pipe(iterator, batch_size=batch_size): # test varying this TODO
			#TODO  - as corpus size increases memory requirements will increase - consider buffering orth_index, lower_index, token2doc_index and writing to disk periodically
			orth_index.append(doc.to_array(ORTH))
			orth_index.append(eof_arr)

			lower_index_tmp = doc.to_array(LOWER)
			lower_index.append(lower_index_tmp)
			lower_index.append(eof_arr)

			#token2doc_index.append(np.full(np.shape(lower_index_tmp),doc_order, dtype=np.int32))
			# speed up using list
			token2doc_index.append(np.array([doc_order] * len(lower_index_tmp), dtype=np.int32))
			token2doc_index.append(not_doc_arr)

			self.offsets.append(offset) 
			offset = offset + len(lower_index_tmp) + 1
			doc_order += 1

		orth_index.append(index_header_arr)
		lower_index.append(index_header_arr)
		token2doc_index.append(np.array([NOT_DOC_TOKEN] * len(index_header_arr), dtype=np.int32))

		self.create_indices(orth_index, lower_index, token2doc_index)

		self.document_count = len(self.offsets)
		self.token_count = self.lower_index.shape[0] - self.document_count - len(index_header_arr) - len(index_header_arr) # adjust for text breaks and start and end of index
		self.unique_tokens = len(self.frequency_lookup)
		# cleanup all tmp arrays
		del orth_index
		del lower_index
		del token2doc_index
		logging.info(f'Build time: {(time.time() - start_time):.3f} seconds')

	def create_indices(self, orth_index, lower_index, token2doc_index):
		""" Takes as input a list of token ids (np.uint64) and reindexes that outputting a lookup and the reindexed token_index. """
		unique_values, inverse = np.unique(np.concatenate(orth_index + lower_index), return_inverse=True)
		new_values = np.arange(len(unique_values), dtype=np.uint32)
		self.original_to_new = dict(zip(unique_values, new_values))
		self.new_to_original = dict(zip(new_values, unique_values))

		# the order_index and lower_index are first and second half of inverse - so split it
		self.orth_index = np.array(np.split(inverse, 2)[0], dtype=np.uint32)
		self.lower_index = np.array(np.split(inverse, 2)[1], dtype=np.uint32)
		del inverse

		vocab = {k:nlp.vocab.strings[k] for k in unique_values}
		#reindexed_vocab = {**{original_to_new[k]:vocab[k] for k in vocab}, **{vocab[k]:original_to_new[k] for k in vocab}}
		# speed up reindexed_vocab by using the new_to_original dict
		#
		# make self.vocab a two-way lookup - one line to join both - so can look up token or token_id
		self.vocab = {**{k:vocab[self.new_to_original[k]] for k in new_values}}
	
		# del self.vocab[self.original_to_new[EOF_TOKEN]]
		self.EOF_TOKEN = self.original_to_new[EOF_TOKEN]

		self.frequency_lookup = dict(zip(*np.unique(self.lower_index, return_counts=True)))
		del self.frequency_lookup[self.EOF_TOKEN]

		self.token2doc_index = np.concatenate(token2doc_index)
	
	def info(self, include_memory_usage = False):
		result = []
		attributes = ['name', 'corpus_path', 'source_path', 'token_count', 'unique_tokens', 'document_count']
		for attr in attributes:
			value = getattr(self, attr)
			# if int
			if isinstance(value, int):
				result.append(f'{value:,}')
			else:
				result.append(str(value))
		if include_memory_usage:
			size_attributes = ['orth_index', 'lower_index', 'token2doc_index', 'vocab', 'frequency_lookup', 'offsets', 'metadata', 'original_to_new', 'new_to_original', 'results_cache', 'ngram_index', 'frequency_table']
			for attr in size_attributes:
				size = sys.getsizeof(getattr(self, attr))
				attributes.append(attr + ' (MB)')
				result.append(f'{size/1024/1024:.3f}')
		return str(pl.DataFrame({'Attribute': attributes, 'Value': result}))

	def __str__(self):
		return self.info()

	def index_name(self, index):
		"""Return name of index."""
		return list(spacy.attrs.IDS.keys())[list(spacy.attrs.IDS.values()).index(index)]
	

In [None]:
@patch
def save(self: Corpus, corpus_path):
	start_time = time.time()
	if not os.path.isdir(corpus_path):
		os.makedirs(corpus_path)
	self.corpus_path = corpus_path
	np.savez_compressed(os.path.join(corpus_path, 'arrays.npz'), orth_index=self.orth_index, lower_index=self.lower_index, token2doc_index=self.token2doc_index, offsets=self.offsets)
	# save vocab, frequency_lookup, original_to_new, new_to_original, document_count, token_count, unique_tokens, source_path, name, EOF_TOKEN, 
	with gzip.open(os.path.join(corpus_path, 'corpus.pkl.gz'), 'wb') as f:
		# save only the necessary data
		pickle.dump({k: getattr(self, k) for k in ['metadata', 'vocab', 'frequency_lookup', 'original_to_new', 'new_to_original', 'document_count', 'token_count', 'unique_tokens', 'source_path', 'name', 'EOF_TOKEN']}, f)
	logging.info(f'Save time: {(time.time() - start_time):.3f} seconds')

In [None]:
@patch
def compare_saved_to_source(self: Corpus):
	# get size of directory or file at source_path and compare to size of directory at corpus_path
	print('Source text data:')
	source_size = 0
	if os.path.isdir(self.source_path):
		# loop through files in source_path and get size
		i = 0
		for root, dirs, files in os.walk(self.source_path):
			for file in files:
				i += 1
				file_path = os.path.join(root, file)
				source_size += os.path.getsize(file_path)
				if i < 5:
					print(f'\t{os.path.basename(file_path)}: {os.path.getsize(file_path)/1024/1024:.3f} MB')
				elif i == 5:
					print('\t...')
	else:
		# show file
		print(f'\t{os.path.basename(self.source_path)}: {os.path.getsize(self.source_path)/1024/1024:.3f} MB')
		source_size = os.path.getsize(self.source_path)
	print(f'Source total size: {source_size/1024/1024:.3f} MB')
	print()

	# print files in corpus_path and size of each - nice formatting
	print('Saved corpus:')
	corpus_size = 0
	for root, dirs, files in os.walk(self.corpus_path):
		for file in files:
			file_path = os.path.join(root, file)
			corpus_size += os.path.getsize(file_path)
			print(f'\t{os.path.basename(file_path)}: {os.path.getsize(file_path)/1024/1024:.3f} MB')
	print(f'Corpus total size: {corpus_size/1024/1024:.3f} MB')



In [None]:
@patch
def load(self: Corpus, corpus_path):
	start_time = time.time()
	if not os.path.isdir(corpus_path):
		raise FileNotFoundError(f"Path '{corpus_path}' is not a directory")
	self.corpus_path = corpus_path
	with gzip.open(os.path.join(corpus_path, 'corpus.pkl.gz'), 'rb') as f:
		data = pickle.load(f)
		for k, v in data.items():
			setattr(self, k, v)
	npz = np.load(os.path.join(corpus_path, 'arrays.npz'))
	self.orth_index = npz['orth_index']
	self.lower_index = npz['lower_index']
	self.token2doc_index = npz['token2doc_index']
	self.offsets = npz['offsets']
	logging.info(f'Load time: {(time.time() - start_time):.3f} seconds')

In [None]:
rebuild = False

toy = Corpus('toy')
rnz = Corpus('rnz-10k')
rnz100 = Corpus('rnz-100k')
rnz200 = Corpus('rnz-200k')
rnz500 = Corpus('rnz-500k')

if rebuild == True:
	toy.build(toy.load_from_files('../test-corpora/source/toy'))
	rnz.build(rnz.load_from_csv('../test-corpora/source/rnz-10k.csv.gz', text_column='description'))
	rnz100.build(rnz100.load_from_csv('../test-corpora/source/rnz-100k.csv.gz', text_column='description'))
	rnz200.build(rnz200.load_from_csv('../test-corpora/source/rnz-200k.csv.gz', text_column='description'))
	rnz500.build(rnz500.load_from_csv('../test-corpora/source/rnz-500k.csv.gz', text_column='description'))

	toy.save('../test-corpora/saved/toy')
	rnz.save('../test-corpora/saved/rnz-10k')
	rnz100.save('../test-corpora/saved/rnz-100k')
	rnz200.save('../test-corpora/saved/rnz-200k')
	rnz500.save('../test-corpora/saved/rnz-500k')
else:
	toy.load('../test-corpora/saved/toy')
	rnz.load('../test-corpora/saved/rnz-10k')
	rnz100.load('../test-corpora/saved/rnz-100k')
	rnz200.load('../test-corpora/saved/rnz-200k')
	rnz500.load('../test-corpora/saved/rnz-500k')

2025-02-13 11:03:38 - INFO - Load time: 0.003 seconds
2025-02-13 11:03:38 - INFO - Load time: 0.087 seconds
2025-02-13 11:03:38 - INFO - Load time: 0.322 seconds
2025-02-13 11:03:39 - INFO - Load time: 0.583 seconds
2025-02-13 11:03:40 - INFO - Load time: 1.554 seconds


In [None]:
print(toy.info(include_memory_usage=True))
print(rnz.info(include_memory_usage=True))
print(rnz100.info(include_memory_usage=True))
print(rnz200.info(include_memory_usage=True))
print(rnz500.info(include_memory_usage=True))

rnz500.compare_saved_to_source()

┌───────────────────────┬────────────────────────────┐
│ Attribute             ┆ Value                      │
╞═══════════════════════╪════════════════════════════╡
│ name                  ┆ toy                        │
│ corpus_path           ┆ ../test-corpora/saved/toy  │
│ source_path           ┆ ../test-corpora/source/toy │
│ token_count           ┆ 38                         │
│ unique_tokens         ┆ 15                         │
│ document_count        ┆ 6                          │
│ orth_index (MB)       ┆ 0.001                      │
│ lower_index (MB)      ┆ 0.001                      │
│ token2doc_index (MB)  ┆ 0.001                      │
│ vocab (MB)            ┆ 0.001                      │
│ frequency_lookup (MB) ┆ 0.001                      │
│ offsets (MB)          ┆ 0.000                      │
│ metadata (MB)         ┆ 0.000                      │
│ original_to_new (MB)  ┆ 0.001                      │
│ new_to_original (MB)  ┆ 0.001                      │
│ results_

In [None]:
@patch
def create_frequency_table(self: Corpus):
	# note: don't sort this - leave in order of token_id - sorts can be done when required
	start_time = time.time()
	self.frequency_table = pl.DataFrame({'token_id': list(self.frequency_lookup.keys()), 'frequency': list(self.frequency_lookup.values())}) #'token': list([''] * self.unique_tokens), 
	self.frequency_table = self.frequency_table.join(pl.DataFrame({'token_id': list(self.vocab.keys()), 'token': list(self.vocab.values())}), on='token_id', how='left')
	self.frequency_table = self.frequency_table.with_row_index(name='rank', offset=1)
	logging.info(f'Frequency table created in {(time.time() - start_time):.3f} seconds')

In [None]:
@patch
def frequencies(self: Corpus, n=None, show_token_id=False, normalize_by=False, sort_by='frequency'):
	start_time = time.time()
	if self.frequency_table is None:
		self.create_frequency_table()

	columns = ['rank', 'token', 'frequency']
	if show_token_id == True:
		columns = ['rank', 'token_id', 'token', 'frequency']

	if normalize_by != False:
		# if a number is passed then normalize by that number
		if type(normalize_by) != int:
			raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')
		self.frequency_table = self.frequency_table.with_columns((pl.col('frequency') * normalize_by / self.token_count).alias('normalized_frequency'))
		columns.append('normalized_frequency')

	# TODO - add back if needed
	# if sort_by in ['frequency', 'normalized_frequency']:
	# 	self.frequency_table = self.frequency_table.sort(sort_by, descending=True)
	# 	self.frequency_table = self.frequency_table.drop('rank').with_row_index(name='rank', offset=1)

	logging.info(f'Frequencies report time: {(time.time() - start_time):.5f} seconds')
	if n:
		return self.frequency_table.sort('frequency', descending=True)[columns].head(n).to_pandas().set_index('rank')
	else:
		return self.frequency_table.sort('frequency', descending=True)[columns].to_pandas().set_index('rank')

In [None]:
display(toy.frequencies(n=3, normalize_by=1000))
display(rnz.frequencies(n=3, normalize_by=1000))
display(rnz100.frequencies(n=3, normalize_by=1000))
display(rnz200.frequencies(n=3, normalize_by=1000))
display(rnz500.frequencies(n=3, normalize_by=1000))

2025-02-13 11:03:51 - INFO - Frequency table created in 0.002 seconds
2025-02-13 11:03:51 - INFO - Frequencies report time: 0.00325 seconds


Unnamed: 0_level_0,token,frequency,normalized_frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,the,8,210.526316
14,.,6,157.894737
2,is,4,105.263158


2025-02-13 11:03:51 - INFO - Frequency table created in 0.022 seconds
2025-02-13 11:03:51 - INFO - Frequencies report time: 0.02356 seconds


Unnamed: 0_level_0,token,frequency,normalized_frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8543,the,19545,57.204153
14546,.,13966,40.875579
3047,",",9178,26.862098


2025-02-13 11:03:51 - INFO - Frequency table created in 0.057 seconds
2025-02-13 11:03:51 - INFO - Frequencies report time: 0.05805 seconds


Unnamed: 0_level_0,token,frequency,normalized_frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21551,the,186137,59.399384
36623,.,115595,36.88827
34506,a,76002,24.253491


2025-02-13 11:03:51 - INFO - Frequency table created in 0.063 seconds
2025-02-13 11:03:51 - INFO - Frequencies report time: 0.06519 seconds


Unnamed: 0_level_0,token,frequency,normalized_frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30913,the,360131,60.681226
52610,.,227333,38.305075
3730,of,142599,24.027596


2025-02-13 11:03:51 - INFO - Frequency table created in 0.093 seconds
2025-02-13 11:03:51 - INFO - Frequencies report time: 0.09483 seconds


Unnamed: 0_level_0,token,frequency,normalized_frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48480,the,939251,61.28475
82289,.,577156,37.658582
5848,of,378584,24.702051


In [None]:
@patch
def token_ids_to_tokens(self: Corpus, token_ids):
	if 'tokens_array' not in self.results_cache:
		start_time = time.time()
		self.results_cache['tokens_array'] = np.array(list(self.vocab.values()))
		logging.info(f'Create tokens_array in {(time.time() - start_time):.3f} seconds')
	return self.results_cache['tokens_array'][token_ids]

In [None]:
# destroy token_ids_to_tokens cache
if 'tokens_array' in toy.results_cache:
	del toy.results_cache['tokens_array']
if 'tokens_array' in rnz.results_cache:
	del rnz.results_cache['tokens_array']
if 'tokens_array' in rnz100.results_cache:
	del rnz100.results_cache['tokens_array']
if 'tokens_array' in rnz200.results_cache:
	del rnz200.results_cache['tokens_array']
if 'tokens_array' in rnz500.results_cache:
	del rnz500.results_cache['tokens_array']

# warm up tokens_array
warm_up_array = np.array(range(0,5))
toy.token_ids_to_tokens(warm_up_array)
rnz.token_ids_to_tokens(warm_up_array)
rnz100.token_ids_to_tokens(warm_up_array)
rnz200.token_ids_to_tokens(warm_up_array)
rnz500.token_ids_to_tokens(warm_up_array)

# testing token range at start of array using warm_up_array
%timeit toy.token_ids_to_tokens(warm_up_array)
%timeit rnz.token_ids_to_tokens(warm_up_array)
%timeit rnz100.token_ids_to_tokens(warm_up_array)
%timeit rnz200.token_ids_to_tokens(warm_up_array)
%timeit rnz500.token_ids_to_tokens(warm_up_array)

# testing token range at end of array
toy_ids = np.array(range(toy.unique_tokens-1, toy.unique_tokens-6, -1))
%timeit toy.token_ids_to_tokens(toy_ids)
rnz_ids = np.array(range(rnz.unique_tokens-1, rnz.unique_tokens-6, -1))
%timeit rnz.token_ids_to_tokens(rnz_ids)
rnz100_ids = np.array(range(rnz100.unique_tokens-1, rnz100.unique_tokens-6, -1))
%timeit rnz100.token_ids_to_tokens(rnz100_ids)
rnz200_ids = np.array(range(rnz200.unique_tokens-1, rnz200.unique_tokens-6, -1))
%timeit rnz200.token_ids_to_tokens(rnz200_ids)
rnz500_ids = np.array(range(rnz500.unique_tokens-1, rnz500.unique_tokens-6, -1))
%timeit rnz500.token_ids_to_tokens(rnz500_ids)

# testing random token ids
toy_ids = np.random.randint(0, toy.unique_tokens, 5)
%timeit toy.token_ids_to_tokens(toy_ids)
rnz_ids = np.random.randint(0, rnz.unique_tokens, 5)
%timeit rnz.token_ids_to_tokens(rnz_ids)
rnz100_ids = np.random.randint(0, rnz100.unique_tokens, 5)
%timeit rnz100.token_ids_to_tokens(rnz100_ids)
rnz200_ids = np.random.randint(0, rnz200.unique_tokens, 5)
%timeit rnz200.token_ids_to_tokens(rnz200_ids)
rnz500_ids = np.random.randint(0, rnz500.unique_tokens, 5)


2025-02-13 11:14:37 - INFO - Create tokens_array in 0.000 seconds
2025-02-13 11:14:37 - INFO - Create tokens_array in 0.011 seconds
2025-02-13 11:14:37 - INFO - Create tokens_array in 0.023 seconds
2025-02-13 11:14:37 - INFO - Create tokens_array in 0.042 seconds
2025-02-13 11:14:37 - INFO - Create tokens_array in 0.053 seconds


325 ns ± 19.3 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
623 ns ± 16.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
632 ns ± 20.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
622 ns ± 14.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
633 ns ± 29.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
405 ns ± 11.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
637 ns ± 13 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
643 ns ± 32.3 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
819 ns ± 170 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
662 ns ± 26.3 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
408 ns ± 14.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
710 ns ± 68.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
710 ns ± 50.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
67

In [None]:
@patch
# maybe convert to using tokens_array rather than frequency_table
def token_to_id(self: Corpus, token):
	if self.frequency_table is None:
		self.create_frequency_table()
	token = self.frequency_table.filter(pl.col('token') == token)['token_id']
	if token.shape[0] == 0:
		return False
	else:
		token = token[0]
	return token

In [None]:
rnz500.token_to_id('dog')

83723

In [None]:
@patch
def frequency_of(self: Corpus, token):
	start_time = time.time()
	if self.frequency_table is None:
		self.create_frequency_table()
	
	if type(token) == str:
		token = self.token_to_id(token)
		if token == False:
			return 0

	logging.info(f'Token frequency retrieval time: {(time.time() - start_time):.5f} seconds')

	if token in self.frequency_lookup:
		return int(self.frequency_lookup[token])
	else:
		return 0

In [None]:
token = 'dog'
print(token, toy.frequency_of(token))
print(token, rnz.frequency_of(token))
print(token, rnz100.frequency_of(token))
print(token, rnz200.frequency_of(token))
print(token, rnz500.frequency_of(token))

token_id = toy.token_to_id(token)
print(token, token_id, toy.frequency_of(token_id))
token_id = rnz.token_to_id(token)
print(token, token_id, rnz.frequency_of(token_id))
token_id = rnz100.token_to_id(token)
print(token, token_id, rnz100.frequency_of(token_id))
token_id = rnz200.token_to_id(token)
print(token, token_id, rnz200.frequency_of(token_id))
token_id = rnz500.token_to_id(token)
print(token, token_id, rnz500.frequency_of(token_id))


2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00116 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00107 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00154 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00126 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00108 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00000 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00000 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00000 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00000 seconds
2025-02-13 11:16:55 - INFO - Token frequency retrieval time: 0.00000 seconds


dog 3
dog 31
dog 202
dog 363
dog 894
dog 10 3
dog 13162 31
dog 34833 202
dog 52073 363
dog 83723 894


In [None]:
@patch
def tokenize(self: Corpus, string, return_doc = False, simple_indexing = False): #TODO test speed on this using pipe for one doc
	start_time = time.time()
	placeholder_string = 'zzxxzzplaceholderzzxxzz' # so doesn't split tokens
	is_wildcard_search = False
	if simple_indexing == True:
		index_id = LOWER
		strings_to_tokenize = [string.strip()]
	else:
		raise('only simple_indexing implemented')
		# TODO rework
		# if '*' in string:
		# 	is_wildcard_search = True
		# 	string = string.replace('*',placeholder_string)
		# if string.islower() == True:
		# 	index_id = LOWER
		# else:
		# 	index_id = ORTH
		# if '|' in string:
		# 	strings_to_tokenize = string.split('|')
		# else:
		# 	strings_to_tokenize = [string.strip()]
	token_sequences = []
	for doc in nlp.tokenizer.pipe(strings_to_tokenize):
		token_sequences.append(tuple(doc.to_array(index_id)))
	# if is_wildcard_search == True:
	# 	tmp_token_sequence = []
	# 	sequence_count = 1
	# 	for token in doc:
	# 		tmp_token_sequence.append([])
	# 		if placeholder_string in token.text:
	# 			chunked_string = token.text.split(placeholder_string)
	# 			if len(chunked_string) > 2 or (len(chunked_string) == 2 and chunked_string[0] != '' and chunked_string[1] != ''):
	# 				# use regex
	# 				approach = 'regex'
	# 				regex = re.compile('.*'.join(chunked_string))
	# 			elif chunked_string[0] == '':
	# 				approach = 'endswith'
	# 			else:
	# 				approach = 'startswith'
	# 			for token_id in loaded_corpora[corpus_name]['frequency_lookup']:
	# 				possible_word = False
	# 				word = loaded_corpora[corpus_name]['vocab'][token_id]
	# 				if approach == 'regex':
	# 					if regex.match(word):
	# 						possible_word = word
	# 				elif getattr(word,approach)(''.join(chunked_string)):
	# 					possible_word = word
	# 				if possible_word != False:
	# 					tmp_token_sequence[token.i].append(loaded_corpora[corpus_name]['vocab'][possible_word])
	# 		else:
	# 			tmp_token_sequence[token.i].append(token.orth)
	# 		sequence_count *= len(tmp_token_sequence[token.i])
	# 	rotated_token_sequence = []
	# 	token_repeat = sequence_count
	# 	for pos in range(len(tmp_token_sequence)):
	# 		rotated_token_sequence.append([])
	# 		if len(tmp_token_sequence[pos]) == 1:
	# 			rotated_token_sequence[pos] += sequence_count * [tmp_token_sequence[pos][0]]
	# 		else:
	# 			token_repeat = token_repeat // len(tmp_token_sequence[pos])
	# 			while len(rotated_token_sequence[pos]) < sequence_count:
	# 				for token in tmp_token_sequence[pos]:
	# 					rotated_token_sequence[pos] += token_repeat * [token]
	# 	token_sequences = list(zip(*rotated_token_sequence))
	# 	#for tokens in tmp_token_sequence:
	# 	#    for token in tokens:
	# covert token_sequences to reindexed tokens using original_to_new
	token_sequences = [tuple([self.original_to_new[token] for token in sequence]) for sequence in token_sequences]
	logging.info(f'Tokenization time: {(time.time() - start_time):.5f} seconds')
	if return_doc == True:
		return token_sequences, index_id, doc
	else:
		return token_sequences, index_id

In [None]:
token_str = 'dog'
toy_token_sequence, toy_index_id = toy.tokenize(token_str, simple_indexing=True)
rnz_token_sequence, rnz_index_id = rnz.tokenize(token_str, simple_indexing=True)
rnz100_token_sequence, rnz100_index_id = rnz100.tokenize(token_str, simple_indexing=True)
rnz200_token_sequence, rnz200_index_id = rnz200.tokenize(token_str, simple_indexing=True)
rnz500_token_sequence, rnz500_index_id = rnz500.tokenize(token_str, simple_indexing=True)

print(toy_token_sequence, toy.index_name(toy_index_id))


2025-02-13 11:16:57 - INFO - Tokenization time: 0.00026 seconds
2025-02-13 11:16:57 - INFO - Tokenization time: 0.00009 seconds
2025-02-13 11:16:57 - INFO - Tokenization time: 0.00011 seconds
2025-02-13 11:16:57 - INFO - Tokenization time: 0.00009 seconds
2025-02-13 11:16:57 - INFO - Tokenization time: 0.00008 seconds


[(np.uint32(10),)] LOWER


In [None]:
@patch
def get_token_index(self: Corpus, token_sequence, index_id): #TEST - refactor token_sequence
	start_time = time.time()
	results = []

	sequence_len = len(token_sequence[0])
	variants_len = len(token_sequence)

	if index_id == ORTH:
		index = 'orth_index'
	else:
		index = 'lower_index'

	if (index, sequence_len) not in self.ngram_index:
		slices = [] # TODO adjust so not just lower below - so need a var to pass to this function with whether islower
		[slices.append(np.roll(getattr(self, index), shift)) for shift in -np.arange(sequence_len)]
		seq = np.vstack(slices).T
		self.ngram_index[(index, sequence_len)] = seq

	if variants_len == 1:
		results.append(np.where(np.all(self.ngram_index[(index, sequence_len)] == token_sequence[0], axis=1))[0])
	else:
		condition_list = []
		choice_list = variants_len * [True]
		for seq in token_sequence:
			condition_list.append(self.ngram_index[(index, sequence_len)] == seq)
		results.append(np.where(np.all(np.select(condition_list, choice_list),axis=1))[0])

	logging.info(f'Token indexing ({len(results[0])}) time: {(time.time() - start_time):.5f} seconds')
	return results

In [None]:
toy_token_index = toy.get_token_index(toy_token_sequence, toy_index_id)
rnz_token_index = rnz.get_token_index(rnz_token_sequence, rnz_index_id)
rnz100_token_index = rnz100.get_token_index(rnz100_token_sequence, rnz100_index_id)
rnz200_token_index = rnz200.get_token_index(rnz200_token_sequence, rnz200_index_id)
rnz500_token_index = rnz500.get_token_index(rnz500_token_sequence, rnz500_index_id)

2025-02-13 11:16:59 - INFO - Token indexing (3) time: 0.00048 seconds
2025-02-13 11:16:59 - INFO - Token indexing (31) time: 0.00094 seconds
2025-02-13 11:16:59 - INFO - Token indexing (202) time: 0.01015 seconds
2025-02-13 11:16:59 - INFO - Token indexing (363) time: 0.01740 seconds
2025-02-13 11:16:59 - INFO - Token indexing (894) time: 0.15002 seconds


In [None]:
@patch

# speed up by:
# 1. keep as columns
# 2. optimize removal to only search relevant columns
# 3. return numpy array

def get_ngrams(self: Corpus, token_sequence, index_id, token_index, ngram_length = 2, ngram_word_position = 'LEFT'): #TEST refactor token_sequence
	start_time = time.time()
	sequence_len = len(token_sequence[0])
	variants_len = len(token_sequence)
	token_index_len = len(token_index[0])

	if index_id == ORTH:
		index = 'orth_index'
	else:
		index = 'lower_index'

	if ngram_word_position == 'RIGHT':
		ngram_range = range(-1 * ngram_length + sequence_len, sequence_len)
	elif ngram_word_position == 'MIDDLE':
		ngram_range = range(-1 * ngram_length + sequence_len + 1, sequence_len + 1)
	else:
		ngram_range = range(0, ngram_length)

	ngrams = []
	for pos in ngram_range:
		if variants_len == 1 and pos > -1 and pos < sequence_len:
			# create numpy array with the same token (token_sequence[pos]) for length of token_index[0]
			ngrams.append(np.full(token_index_len, token_sequence[0][pos]))
		else:
			seq = token_index[0] + pos
			ngrams.append(getattr(self, index)[seq])

	ngrams = np.stack(ngrams)

	# get column positions to search for EOF_TOKEN
	columns = (np.array(ngram_range)[:, None] != np.arange(sequence_len)).all(axis=1)

	ngrams = np.delete(ngrams, np.where(ngrams[columns] == self.EOF_TOKEN)[1], axis=1)
	logging.info(f'Ngrams ({ngrams.shape[1]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return ngrams


In [None]:
result = toy.get_ngrams(toy_token_sequence, toy_index_id, toy_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
print(result)
result = rnz.get_ngrams(rnz_token_sequence, rnz_index_id, rnz_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
result = rnz100.get_ngrams(rnz100_token_sequence, rnz100_index_id, rnz100_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
result = rnz200.get_ngrams(rnz200_token_sequence, rnz200_index_id, rnz200_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
result = rnz500.get_ngrams(rnz500_token_sequence, rnz500_index_id, rnz500_token_index, ngram_length = 2, ngram_word_position = 'LEFT')

2025-02-13 11:17:05 - INFO - Ngrams (3) retrieval time: 0.00025 seconds
2025-02-13 11:17:05 - INFO - Ngrams (31) retrieval time: 0.00017 seconds
2025-02-13 11:17:05 - INFO - Ngrams (202) retrieval time: 0.00024 seconds
2025-02-13 11:17:05 - INFO - Ngrams (363) retrieval time: 0.00031 seconds
2025-02-13 11:17:05 - INFO - Ngrams (894) retrieval time: 0.00257 seconds


[[10 10 10]
 [11  1  1]]


In [None]:
@patch
def ngrams(self: Corpus, token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0, pretty = True):
	token_sequence, index_id = self.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_word_position])

	if cache_id in self.results_cache:
		logging.info('Using cached ngrams results')
		ngrams_report = self.results_cache[cache_id]
	else:
		token_index = self.get_token_index(token_sequence, index_id)
		
		if len(token_index[0]) == 0:
			logging.info('No tokens found')
			return None, 0

		logging.info('Generating ngrams results')
		ngrams = self.get_ngrams(token_sequence, index_id, token_index, ngram_length = ngram_length, ngram_word_position = ngram_word_position)
		schema = [f'token_{i+1}' for i in range(ngram_length)]
		ngrams_report = pl.DataFrame(ngrams.T, schema=schema).to_struct(name = 'ngram_token_ids').value_counts(sort=True).rename({"count": "frequency"})
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		self.results_cache[cache_id] = ngrams_report

	total_count = len(ngrams_report)

	resultset_start = page_size*page_current
	resultset_end = page_size*(page_current+1)


	# get specific chunk of report into pandas based on resultset_start:
	ngrams_report_page = ngrams_report.slice(resultset_start, page_size).unnest('ngram_token_ids')
	token_strs = []
	for i in range(ngram_length):
		token_strs.append(self.token_ids_to_tokens(ngrams_report_page[f'token_{i+1}'].to_numpy()))
	token_strs = np.array(token_strs)
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text))
	ngrams_report_page = ngrams_report_page.to_pandas().set_index('rank')

	# sort column display - add normalized_frequency and rank, optional ngram_token_ids
	logging.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')
	if pretty == True:
		return ngrams_report_page[['ngram', 'frequency']], total_count
	else:
		return ngrams_report_page, total_count




In [None]:
pretty = True

toy.results_cache = {}
toy_ngrams, toy_total_count = toy.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty)
display(toy_ngrams)
print(toy_total_count)

rnz.results_cache = {}
rnz_ngrams, rnz_total_count = rnz.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty)
display(rnz_ngrams)
print(rnz_total_count)

rnz100.results_cache = {}
rnz100_ngrams, rnz100_total_count = rnz100.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty)
display(rnz100_ngrams)
print(rnz100_total_count)

rnz200.results_cache = {}
rnz200_ngrams, rnz200_total_count = rnz200.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty)
display(rnz200_ngrams)
print(rnz200_total_count)

rnz500.results_cache = {}
rnz500_ngrams, rnz500_total_count = rnz500.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty)
display(rnz500_ngrams)
print(rnz500_total_count)


2025-02-13 11:17:10 - INFO - Tokenization time: 0.00014 seconds
2025-02-13 11:17:10 - INFO - Token indexing (3) time: 0.00008 seconds
2025-02-13 11:17:10 - INFO - Generating ngrams results
2025-02-13 11:17:10 - INFO - Ngrams (3) retrieval time: 0.00062 seconds
2025-02-13 11:17:10 - INFO - Create tokens_array in 0.000 seconds
2025-02-13 11:17:10 - INFO - Ngrams report time: 0.01070 seconds


Unnamed: 0_level_0,ngram,frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog is,2
2,dog sat,1


2025-02-13 11:17:10 - INFO - Tokenization time: 0.00013 seconds
2025-02-13 11:17:10 - INFO - Token indexing (31) time: 0.00098 seconds
2025-02-13 11:17:10 - INFO - Generating ngrams results
2025-02-13 11:17:10 - INFO - Ngrams (31) retrieval time: 0.00025 seconds
2025-02-13 11:17:10 - INFO - Create tokens_array in 0.015 seconds
2025-02-13 11:17:10 - INFO - Ngrams report time: 0.02486 seconds


2


Unnamed: 0_level_0,ngram,frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog to,4
2,dog .,2
3,dog attack,2
4,dog died,2
5,"dog """,2
6,dog while,1
7,dog had,1
8,dog as,1
9,dog owners,1
10,dog can,1


2025-02-13 11:17:10 - INFO - Tokenization time: 0.00013 seconds
2025-02-13 11:17:10 - INFO - Token indexing (202) time: 0.00730 seconds
2025-02-13 11:17:10 - INFO - Generating ngrams results
2025-02-13 11:17:10 - INFO - Ngrams (202) retrieval time: 0.00042 seconds
2025-02-13 11:17:10 - INFO - Create tokens_array in 0.037 seconds
2025-02-13 11:17:10 - INFO - Ngrams report time: 0.05472 seconds


24


Unnamed: 0_level_0,ngram,frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog .,19
2,dog owners,14
3,dog in,10
4,dog attack,8
5,dog trial,7
6,dog to,4
7,dog died,4
8,dog -,4
9,dog that,4
10,dog control,4


2025-02-13 11:17:10 - INFO - Tokenization time: 0.00010 seconds
2025-02-13 11:17:10 - INFO - Token indexing (363) time: 0.00511 seconds
2025-02-13 11:17:10 - INFO - Generating ngrams results
2025-02-13 11:17:10 - INFO - Ngrams (363) retrieval time: 0.00023 seconds
2025-02-13 11:17:10 - INFO - Create tokens_array in 0.044 seconds
2025-02-13 11:17:10 - INFO - Ngrams report time: 0.05525 seconds


97


Unnamed: 0_level_0,ngram,frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog .,38
2,dog owners,16
3,dog in,14
4,dog control,14
5,dog attack,11
6,dog trial,9
7,"dog ,",9
8,dog and,8
9,dog has,8
10,dog to,7


2025-02-13 11:17:10 - INFO - Tokenization time: 0.00008 seconds
2025-02-13 11:17:10 - INFO - Token indexing (894) time: 0.01575 seconds
2025-02-13 11:17:10 - INFO - Generating ngrams results
2025-02-13 11:17:10 - INFO - Ngrams (894) retrieval time: 0.00098 seconds


156


2025-02-13 11:17:10 - INFO - Create tokens_array in 0.103 seconds
2025-02-13 11:17:10 - INFO - Ngrams report time: 0.12732 seconds


Unnamed: 0_level_0,ngram,frequency
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog .,63
2,"dog ,",30
3,dog owners,29
4,dog in,29
5,dog attack,27
6,dog has,23
7,dog -,22
8,dog that,22
9,dog control,22
10,dog and,19


271


Testing the cache using a query with lots of results. Note: this has been optimised to remove reliance on np.unique with_counts for 10x faster retrieval of large result set. Caching speeds this up further.

In [None]:
rnz500.results_cache = {}
# warm up tokens_array cache
result = rnz500.token_ids_to_tokens([1, 2, 3])

rnz500_ngrams, rnz500_total_count = rnz500.ngrams('the', ngram_length = 2, ngram_word_position = 'LEFT')
rnz500_ngrams, rnz500_total_count = rnz500.ngrams('the', ngram_length = 2, ngram_word_position = 'LEFT')
rnz500_ngrams, rnz500_total_count = rnz500.ngrams('the', ngram_length = 2, ngram_word_position = 'LEFT', page_current=10)

#rnz500.results_cache = {}
# # warm up tokens_array cache
# result = rnz500.token_ids_to_tokens([1, 2, 3])
# %lprun -f Corpus.ngrams rnz500.ngrams('the', ngram_length = 2, ngram_word_position = 'LEFT')

2025-02-13 11:19:04 - INFO - Create tokens_array in 0.054 seconds
2025-02-13 11:19:04 - INFO - Tokenization time: 0.00009 seconds
2025-02-13 11:19:04 - INFO - Token indexing (939251) time: 0.03140 seconds
2025-02-13 11:19:04 - INFO - Generating ngrams results
2025-02-13 11:19:04 - INFO - Ngrams (939244) retrieval time: 0.02559 seconds
2025-02-13 11:19:04 - INFO - Ngrams report time: 0.12569 seconds
2025-02-13 11:19:04 - INFO - Tokenization time: 0.00009 seconds
2025-02-13 11:19:04 - INFO - Using cached ngrams results
2025-02-13 11:19:04 - INFO - Ngrams report time: 0.00436 seconds
2025-02-13 11:19:04 - INFO - Tokenization time: 0.00007 seconds
2025-02-13 11:19:04 - INFO - Using cached ngrams results
2025-02-13 11:19:04 - INFO - Ngrams report time: 0.00259 seconds


In [None]:
@patch
def get_concordance(self: Corpus, token_sequence, token_index, context_words = 5, index_id = None): #TEST refactor token_sequence
	start_time = time.time()
	# TODO could build the concordance_range part when actually doing the display - i.e. all we really need is the concordance + sort columns

	if index_id == LOWER:
		index = 'lower_index'
	else:
		index = 'orth_index'

	concordance = []
	sequence_len = len(token_sequence[0])
	concordance_range = range(-1 * context_words, context_words + sequence_len)

	#concordance.append(token_index[0])

	for pos in concordance_range:
		seq = list(token_index[0]+pos)
		concordance.append(getattr(self, index)[seq])

	concordance = np.vstack(concordance).T
	#concordance = pd.DataFrame(data=concordance, index=token_index[0], columns=concordance_range)
	logging.info(f'Concordance results ({concordance.shape[0]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return list(concordance_range), concordance

In [None]:
positional_columns, concordance = toy.get_concordance(toy_token_sequence, toy_token_index, context_words = 5)
print('positional_columns', positional_columns)
print('concordance (token ids)\n', concordance)
print(len(concordance))

2025-02-13 11:19:08 - INFO - Concordance results (3) retrieval time: 0.00020 seconds


positional_columns [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
concordance (token ids)
 [[16 16 16 16  3 10 11  7  9 12 14]
 [ 9 12 14 16  3 10  1  8 13 15 14]
 [13  4 14 16  3 10  1  2 14 16  3]]
3


In [None]:
result = toy.get_concordance(toy_token_sequence, toy_token_index, context_words = 5)
result = rnz.get_concordance(rnz_token_sequence, rnz_token_index, context_words = 5)
result = rnz100.get_concordance(rnz100_token_sequence, rnz100_token_index, context_words = 5)
result = rnz200.get_concordance(rnz200_token_sequence, rnz200_token_index, context_words = 5)
result = rnz500.get_concordance(rnz500_token_sequence, rnz500_token_index, context_words = 5)


2025-02-13 11:19:11 - INFO - Concordance results (3) retrieval time: 0.00017 seconds
2025-02-13 11:19:11 - INFO - Concordance results (31) retrieval time: 0.00014 seconds
2025-02-13 11:19:11 - INFO - Concordance results (202) retrieval time: 0.00033 seconds
2025-02-13 11:19:11 - INFO - Concordance results (363) retrieval time: 0.00051 seconds
2025-02-13 11:19:11 - INFO - Concordance results (894) retrieval time: 0.00268 seconds


In [None]:
@patch
def get_concordance_revised(self: Corpus, token_sequence, token_index, context_words = 5, index_id = None): #TEST refactor token_sequence
	start_time = time.time()
	# TODO could build the concordance_range part when actually doing the display - i.e. all we really need is the concordance + sort columns

	if index_id == LOWER:
		index = 'lower_index'
	else:
		index = 'orth_index'

	concordance = []
	sequence_len = len(token_sequence[0])
	concordance_range = range(-1 * context_words, context_words + sequence_len)

	#concordance.append(token_index[0])

	for pos in concordance_range:
		seq = list(token_index[0]+pos)
		concordance.append(getattr(self, index)[seq])

	#concordance = np.vstack(concordance).T
	concordance = np.stack(concordance)

	logging.info(f'Concordance results ({concordance.shape[0]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return list(concordance_range), concordance

In [None]:
positional_columns, concordance = toy.get_concordance_revised(toy_token_sequence, toy_token_index, context_words = 5)
print('positional_columns', positional_columns)
print('concordance (token ids)\n', concordance)
print(len(concordance))
result = rnz.get_concordance_columns(rnz_token_sequence, rnz_token_index, context_words = 5)
result = rnz100.get_concordance_columns(rnz100_token_sequence, rnz100_token_index, context_words = 5)
result = rnz200.get_concordance_columns(rnz200_token_sequence, rnz200_token_index, context_words = 5)
result = rnz500.get_concordance_columns(rnz500_token_sequence, rnz500_token_index, context_words = 5)


2025-02-13 11:19:14 - INFO - Concordance results (11) retrieval time: 0.00020 seconds


positional_columns [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
concordance (token ids)
 [[16  9 13]
 [16 12  4]
 [16 14 14]
 [16 16 16]
 [ 3  3  3]
 [10 10 10]
 [11  1  1]
 [ 7  8  2]
 [ 9 13 14]
 [12 15 16]
 [14 14  3]]
11


AttributeError: 'Corpus' object has no attribute 'get_concordance_columns'

In [None]:
@patch
def concordance(self: Corpus, token_str, context_words = 5, order='1R2R3R', page_size=PAGE_SIZE, page_current=0): #TEST refactor token_sequence
	#  TODO: shift to polars dataframes
	token_sequence, index_id = self.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	sequence_len = len(token_sequence[0])

	cache_id = tuple(['concordance'] + list(token_sequence) + [context_words, order])
	if cache_id in self.results_cache:
		logging.info('Using cached concordance results')
		positional_columns = self.results_cache[cache_id][0]
		concordance = self.results_cache[cache_id][1]
		total_count = self.results_cache[cache_id][2]
		total_docs = self.results_cache[cache_id][3]
	else:
		token_index = self.get_token_index(token_sequence, index_id)

		if len(token_index[0]) == 0:
			logging.info('No tokens found')
			return None, 0, False, False

		logging.info('Generating concordance results')
		positional_columns, concordance = self.get_concordance(token_sequence, token_index, context_words = context_words)
		# get doc from token_index positions in token2doc_index

		if order == '1L2L3L':
			sort_columns = [-1,-2,-3]
		elif order == '3L2L1L':
			sort_columns = [-3,-2,-1]
		elif order == '2L1L1R':
			sort_columns = [-2,-1,sequence_len + 1 - 1]
		elif order == '1L1R2R':
			sort_columns = [-1,sequence_len + 1 - 1,sequence_len + 2 - 1]
		else:
			sort_columns = [sequence_len + 1 - 1,sequence_len + 2 - 1,sequence_len + 3 - 1]

		sort_columns_data = {}
		for x in range(len(sort_columns)):
			sort_columns_data[x] = concordance[:,[positional_columns.index(sort_columns[x])]].astype(np.str_)
			for y in range(len(sort_columns_data[x])):
				# TODO - this was updated to use .item() for numpy compatibility, but the x*y iteration is very slow - apply a method that avoids the loop
				sort_columns_data[x][y] = self.vocab[int(sort_columns_data[x][y].item())] 

		concordance = pd.DataFrame(data=concordance, index=token_index[0], columns=positional_columns)
		concordance['sort0'] = sort_columns_data[0]
		concordance['sort1'] = sort_columns_data[1]
		concordance['sort2'] = sort_columns_data[2]

		concordance['document_id'] = ''
		concordance['left'] = ''
		concordance['keyword'] = ''
		concordance['right'] = ''

		concordance = concordance.sort_values(['sort0','sort1','sort1'], ascending=[True, True, True])

		total_count = len(concordance)
		total_docs = 0
		total_docs = len(np.unique(self.token2doc_index[list(token_index[0])]))

		self.results_cache[cache_id] = [positional_columns, concordance, total_count, total_docs]

	resultset_start = page_size*page_current
	resultset_end = page_size*(page_current+1)

	offsets_arr = np.array(self.offsets,dtype=np.uint64)

	for concordance_index, concordance_row in concordance.iloc[resultset_start:resultset_end].iterrows():
		document_id = np.searchsorted(offsets_arr,concordance_index, side = 'right') - 1
		# TODO fix with metadata
		concordance.at[concordance_index, 'document_id'] = document_id#str(document_id) + '(' + str(concordance_index - int(offsets_arr[document_id])) + ', ' + str(sequence_len) + ')' #'tmp'#loaded_corpora[corpus_name]['texts']['id'][text_index] + ':' + str(concordance_index - int(offsets_arr[text_index])) + ':' + str(sequence_len)

		concordance_left = []
		concordance_right = []
		concordance_keyword = []
		for pos in positional_columns:
			token_id = concordance.at[concordance_index,pos]
			token = self.vocab[token_id]
			if pos < 0:
				concordance_left.append(token)
				if token_id == self.EOF_TOKEN:
					concordance_left = []
			elif pos == 0 or pos < sequence_len:
				concordance_keyword.append(token)
			else:
				if token_id == self.EOF_TOKEN:
					break
				else:
					concordance_right.append(token)

		concordance.at[concordance_index, 'left'] = ' '.join(concordance_left)
		concordance.at[concordance_index, 'keyword'] = ' '.join(concordance_keyword)
		concordance.at[concordance_index, 'right'] = ' '.join(concordance_right)

	logging.info(f'Concordance report time: {(time.time() - start_time):.5f} seconds')
	return concordance.iloc[resultset_start:resultset_end].sort_values(['sort0','sort1','sort1'], ascending=[True, True, True]), total_count, total_docs


In [None]:
rnz.results_cache = {}
concordance_report, total_count, total_docs = rnz.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows', total_docs, 'docs')


2025-02-13 11:19:16 - INFO - Tokenization time: 0.00010 seconds
2025-02-13 11:19:16 - INFO - Token indexing (31) time: 0.00091 seconds
2025-02-13 11:19:16 - INFO - Generating concordance results
2025-02-13 11:19:16 - INFO - Concordance results (31) retrieval time: 0.00019 seconds
2025-02-13 11:19:16 - INFO - Concordance report time: 0.02063 seconds


Unnamed: 0,document_id,left,keyword,right
12649,383,,Dog,owners in Hawke 's Bay
140175,3657,"'s KiwiBuild scheme a """,dog,""" and questions how some"
140682,3665,"the KiwiBuild scheme a """,dog,""" and says houses are"
79815,2161,", Keanu Reeves '",dog,- loving hitman is up
222459,6126,Theatre 's season of ',Dog,' .
145153,3755,face much higher fines .,Dog,and Lemon Guide editor Clive
97261,2541,soon became a Red Cross,dog,.
154580,4020,Documentary and the Pound Hound,dog,rescue show .
22844,658,Owning a,dog,can turn back the hands
8435,265,being set upon by a,dog,as its handler urges the


31 rows 27 docs


In [None]:
show_concordances = False
toy.results_cache = {}
concordance_report, total_count, total_docs = toy.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0 and show_concordances == True:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows')

rnz.results_cache = {}
concordance_report, total_count, total_docs = rnz.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0 and show_concordances == True:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows')

rnz100.results_cache = {}
concordance_report, total_count, total_docs = rnz100.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0 and show_concordances == True:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows')

rnz200.results_cache = {}
concordance_report, total_count, total_docs = rnz200.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0 and show_concordances == True:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows')

rnz500.results_cache = {}
concordance_report, total_count, total_docs = rnz500.concordance(token_str, context_words = 5, order='1L2L3L')
if total_count > 0 and show_concordances == True:
	display(concordance_report[['document_id', 'left','keyword','right']])
	print(total_count,'rows')

2025-02-13 11:19:17 - INFO - Tokenization time: 0.00011 seconds
2025-02-13 11:19:17 - INFO - Token indexing (3) time: 0.00008 seconds
2025-02-13 11:19:17 - INFO - Generating concordance results
2025-02-13 11:19:17 - INFO - Concordance results (3) retrieval time: 0.00030 seconds
2025-02-13 11:19:17 - INFO - Concordance report time: 0.01295 seconds
2025-02-13 11:19:17 - INFO - Tokenization time: 0.00012 seconds
2025-02-13 11:19:17 - INFO - Token indexing (31) time: 0.00066 seconds
2025-02-13 11:19:17 - INFO - Generating concordance results
2025-02-13 11:19:17 - INFO - Concordance results (31) retrieval time: 0.00014 seconds
2025-02-13 11:19:17 - INFO - Concordance report time: 0.02051 seconds
2025-02-13 11:19:17 - INFO - Tokenization time: 0.00008 seconds
2025-02-13 11:19:17 - INFO - Token indexing (202) time: 0.00441 seconds
2025-02-13 11:19:17 - INFO - Generating concordance results
2025-02-13 11:19:17 - INFO - Concordance results (202) retrieval time: 0.00040 seconds
2025-02-13 11:19:

Testing the cache using a query with lots of results ...

In [None]:
rnz500.results_cache = {}
concordance_report, total_count, total_docs = rnz500.concordance('the', context_words = 5, order='1L2L3L')
concordance_report, total_count, total_docs = rnz500.concordance('the', context_words = 5, order='1L2L3L')
concordance_report, total_count, total_docs = rnz500.concordance('the', context_words = 5, order='1L2L3L', page_current=5000)


2025-02-13 11:19:52 - INFO - Tokenization time: 0.00012 seconds
2025-02-13 11:19:52 - INFO - Token indexing (939251) time: 0.03488 seconds
2025-02-13 11:19:52 - INFO - Generating concordance results
2025-02-13 11:19:53 - INFO - Concordance results (939251) retrieval time: 1.29022 seconds
2025-02-13 11:19:58 - INFO - Concordance report time: 5.75223 seconds
2025-02-13 11:19:58 - INFO - Tokenization time: 0.00008 seconds
2025-02-13 11:19:58 - INFO - Using cached concordance results
2025-02-13 11:19:58 - INFO - Concordance report time: 0.02924 seconds
2025-02-13 11:19:58 - INFO - Tokenization time: 0.00008 seconds
2025-02-13 11:19:58 - INFO - Using cached concordance results
2025-02-13 11:19:58 - INFO - Concordance report time: 0.04076 seconds


In [None]:
# # test load from file
# toy = Corpus('toy')
# for i, text in enumerate(toy.load_from_files('../test-corpora/toy/', file_mask='*.txt', metadata_file='../test-corpora/toy.csv', metadata_lookup_column = 'source', metadata_columns=['source', 'category'])):
# #for i, text in enumerate(toy.load_from_files('../test-corpora/toy/')):
# 	print(text)
# display(toy.metadata.to_pandas())
# print()
# # load from csv
# toy = Corpus('toy')
# for i, text in enumerate(toy.load_from_csv('../test-corpora/toy.csv', metadata_columns=['source', 'category'])):
# 	print(text)
# display(toy.metadata.to_pandas())
# print()

In [None]:
# corpus = Corpus('rnz')
# corpus.build(corpus.load_from_csv('../test-corpora/rnz-200k.csv.gz', text_column='description'))
# corpus = Corpus('rnz')
# %lprun -f corpus.build  corpus.build(corpus.load_from_csv('../test-corpora/rnz-10k.csv.gz', text_column='description'))
# corpus = Corpus('reuters')
# corpus.build(corpus.load_from_files('../test-corpora/reuters'))
#%lprun -f corpus.build  corpus.build(corpus.load_from_csv('../test-corpora/rnz-200k.csv.gz', text_column='description'))
#%memit corpus.build(corpus.load_from_files('../test-corpora/toy'))

## Development notes

Investigate storage efficiency of exploiting repetition using run-length encoding (probably just more efficient to compress)  
* https://stackoverflow.com/questions/3098907/how-to-efficiently-store-a-matrix-with-highly-redundant-values  
* https://pypi.org/project/python-rle/  
* https://gist.github.com/nvictus/66627b580c13068589957d6ab0919e66  
* https://trimsh.org/trimesh.voxel.runlength.html#trimesh.voxel.runlength.rle_gatherer_1d  
* https://github.com/mikedh/trimesh/blob/63e35a5652c9525a6a8070271c2bac8f4d13105b/trimesh/voxel/runlength.py#L345  


## Testing optimisations

Testing speed of CSV reads.

In [None]:
file = '../test-corpora/rnz-200k.csv.gz'
# standard csv library
def load_gzip_csv(file):
	with gzip.open(file, 'rt') as f:
		reader = csv.DictReader(f)
		for row in reader:
			yield row

# dtype set to string for all columns to suppress error
def load_gzip_pandas(file):
	df = pd.read_csv(file, dtype=str)
	for i, row in df.iterrows():
		yield row
	del df

# dtype set to string for all columns to suppress error
def load_gzip_pyarrow(file):
	table = pyarrow.csv.read_csv(file, parse_options=pyarrow.csv.ParseOptions(invalid_row_handler=skip_comment))
	for row in table:
		yield row
	del table

# polars
def load_gzip_polars(file):
	df = pl.read_csv(file)
	for row in df.iter_rows():
		yield row
	del df

def skip_comment(row):
	return 'skip'

# %time _ = [row for row in load_gzip_csv()]
# %memit _ = [row for row in load_gzip_csv()]

# %time _ = [row for row in load_gzip_pandas()]
# %memit _ = [row for row in load_gzip_pandas()]

%time _ = [row for row in load_gzip_pyarrow(file)]
%memit _ = [row for row in load_gzip_pyarrow(file)]

%time _ = [row for row in load_gzip_polars(file)]
%memit _ = [row for row in load_gzip_polars(file)]


CPU times: user 1.97 s, sys: 332 ms, total: 2.3 s
Wall time: 1.27 s
peak memory: 2733.00 MiB, increment: 224.18 MiB
CPU times: user 2.03 s, sys: 405 ms, total: 2.44 s
Wall time: 1.68 s
peak memory: 3523.83 MiB, increment: 495.31 MiB


Generating the frequency lookup using Counter or numpy. 

Result: numpy, at least 10x speedup.

In [None]:
print('Generating frequency lookup using Counter')
%timeit Counter(corpus.lower_index)
print()
print('Generating frequency lookup using numpy')
%timeit dict(zip(*np.unique(corpus.lower_index, return_counts=True)))

Generating frequency lookup using Counter
795 ms ± 89.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Generating frequency lookup using numpy
76.4 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Testing the speed of Spacy's vocab stringstore vs a dict.  

Result: dict, at least 2x speedup.

In [None]:
print('Retrieving from Spacy vocab stringstore:')
%timeit nlp.vocab.strings[7425985699627899538]
%timeit nlp.vocab.strings['the']
vocab = {**{k:nlp.vocab.strings[k] for k in nlp.vocab.strings}, **{nlp.vocab.strings[k]:k for k in nlp.vocab.strings}}

print()

print('Retrieving from dict:')
%timeit the = vocab['the']
%timeit the = vocab[7425985699627899538]


Retrieving from Spacy vocab stringstore:
148 ns ± 12.3 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
265 ns ± 11.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

Retrieving from dict:
65.5 ns ± 10.3 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
59.5 ns ± 8.85 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


Problem: Spacy uses unint64 for string hashes. Working with these hashes is fast, but requires more memory/storage.  Vocabulary sizes of a corpus are much smaller than the max allowed by uint64. 

Solution: Reindex the hashes to create an internal id to substantially reduce memory/storage. Preserve lookup to Spacy hashes if needed. 

Optimisations to ngram retrieval - slightly slower in micro-seconds for rare token sequences, but faster in milliseconds for common ones and format is better for using columnar approach to speed up main ngrams function.

In [None]:
def get_ngrams_old(Corpus, token_sequence, index_id, token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False): #TEST refactor token_sequence
	sequence_len = len(token_sequence[0])

	if index_id == ORTH:
		index = 'orth_index'
	else:
		index = 'lower_index'

	if ngram_word_position == 'RIGHT':
		ngram_range = range(-1 * ngram_length + sequence_len, sequence_len)
	elif ngram_word_position == 'MIDDLE':
		ngram_range = range(-1 * ngram_length + sequence_len + 1, sequence_len + 1)
	else:
		ngram_range = range(0, ngram_length)

	ngrams = []
	for pos in ngram_range:
		seq = token_index[0] + pos
		ngrams.append(getattr(Corpus, index)[seq])

	ngrams = np.vstack(ngrams).T

	#remove any ngrams that include EOF_TOKEN (this is a marker for text separation)
	ngrams = np.delete(ngrams, np.where(ngrams == Corpus.EOF_TOKEN)[0], axis=0)
	return ngrams

# speed up by:
# 1. keep as columns
# 2. optimize removal to only search relevant columns
# 3. return numpy array
def get_ngrams_new(Corpus, token_sequence, index_id, token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False): #TEST refactor token_sequence
	sequence_len = len(token_sequence[0])
	variants_len = len(token_sequence)
	token_index_len = len(token_index[0])

	if index_id == ORTH:
		index = 'orth_index'
	else:
		index = 'lower_index'

	if ngram_word_position == 'RIGHT':
		ngram_range = range(-1 * ngram_length + sequence_len, sequence_len)
	elif ngram_word_position == 'MIDDLE':
		ngram_range = range(-1 * ngram_length + sequence_len + 1, sequence_len + 1)
	else:
		ngram_range = range(0, ngram_length)

	ngrams = []
	if log: logging.info(ngram_range)
	for pos in ngram_range:
		if variants_len == 1 and pos > -1 and pos < sequence_len:
			# create numpy array with the same token (token_sequence[pos]) for length of token_index[0]
			ngrams.append(np.full(token_index_len, token_sequence[0][pos]))
		else:
			seq = token_index[0] + pos
			ngrams.append(getattr(Corpus, index)[seq])

	ngrams = np.stack(ngrams)

	# get column positions to search for EOF_TOKEN
	columns = (np.array(ngram_range)[:, None] != np.arange(sequence_len)).all(axis=1)
	if log: logging.info(columns)
	ngrams = np.delete(ngrams, np.where(ngrams[columns] == Corpus.EOF_TOKEN)[1], axis=1)
	return ngrams

run_time_test = True

the_token_str = 'the'
the_token_sequence, the_index_id = rnz500.tokenize(the_token_str, simple_indexing=True)
rnz500_the_token_index = rnz500.get_token_index(the_token_sequence, the_index_id)
# print(the_token_sequence)
# print(the_index_id)

if run_time_test:
	%timeit result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False)
# print(result.shape)
# print(result[:5])
if run_time_test:
	%timeit result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False)


the_token_str = 'the government'
the_token_sequence, the_index_id = rnz500.tokenize(the_token_str, simple_indexing=True)
rnz500_the_token_index = rnz500.get_token_index(the_token_sequence, the_index_id)
# print(the_token_sequence)
# print(the_index_id)

if run_time_test:
	%timeit result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT', log=False)
# print(result.shape)
# print(result[:5])
if run_time_test:
	%timeit result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT', log=False)


the_token_str = 'dog'
the_token_sequence, the_index_id = rnz500.tokenize(the_token_str, simple_indexing=True)
rnz500_the_token_index = rnz500.get_token_index(the_token_sequence, the_index_id)
# print(the_token_sequence)
# print(the_index_id)

if run_time_test:
	%timeit result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False)
# print(result.shape)
# print(result[:5])
if run_time_test:
	%timeit result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 2, ngram_word_position = 'LEFT', log=False)

the_token_str = 'the dog'
the_token_sequence, the_index_id = rnz500.tokenize(the_token_str, simple_indexing=True)
rnz500_the_token_index = rnz500.get_token_index(the_token_sequence, the_index_id)
# print(the_token_sequence)
# print(the_index_id)

if run_time_test:
	%timeit result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_old(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT', log=False)
# print(result.shape)
# print(result[:5])
if run_time_test:
	%timeit result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT')
else:
	%time result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT', log=False)



# print(result.shape)
# # preview first 5 entries in each column
# print(result)

#%lprun -f get_ngrams_new result = get_ngrams_new(rnz500, the_token_sequence, the_index_id, rnz500_the_token_index, ngram_length = 3, ngram_word_position = 'LEFT', log=False)

# columns = [0, 1]
# # # find rnz500.EOF_TOKEN in result[columns]
# # print(result[columns])
# #print(rnz500.EOF_TOKEN)
# eof_index = np.where(result[columns] == rnz500.EOF_TOKEN)
# print(len(eof_index[0]))
# print(len(eof_index[1]))
# print(939251 -787900 )
# print(eof_index)
# print(result[0][125])

2025-02-12 14:54:32 - INFO - Tokenization time: 0.00008 seconds
2025-02-12 14:54:33 - INFO - Token indexing (939251) time: 0.02845 seconds


41.5 ms ± 5.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


2025-02-12 14:54:38 - INFO - Tokenization time: 0.00010 seconds
2025-02-12 14:54:38 - INFO - Token indexing (24369) time: 0.13101 seconds


31.1 ms ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.61 ms ± 222 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


2025-02-12 14:55:01 - INFO - Tokenization time: 0.00010 seconds
2025-02-12 14:55:01 - INFO - Token indexing (894) time: 0.01946 seconds


1.13 ms ± 142 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
61.1 μs ± 2.64 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


2025-02-12 14:55:13 - INFO - Tokenization time: 0.00009 seconds
2025-02-12 14:55:13 - INFO - Token indexing (72) time: 0.02603 seconds


87.1 μs ± 3.64 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
29.2 μs ± 2.01 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
54.3 μs ± 2.59 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


Speed up to ngram reporting produces 10x speed for large results. Making better use of column formats and faster unique using polars struc counts. Probably worth testing frequent and infrequent tokens. Note: caching turned off in these.

In [None]:
def ngrams_old(Corpus, token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0):
	token_sequence, index_id = Corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_word_position])
	if False: #cache_id in Corpus.results_cache:
		logging.info('Using cached ngrams results')
		ngrams_report = Corpus.results_cache[cache_id]
	else:
		token_index = Corpus.get_token_index(token_sequence, index_id)
		
		if len(token_index[0]) == 0:
			logging.info('No tokens found')
			return None, 0

		logging.info('Generating ngrams results')
		ngrams = get_ngrams_old(Corpus, token_sequence, index_id, token_index, ngram_length = ngram_length, ngram_word_position = ngram_word_position)

		unique_ngrams, counts = np.unique(ngrams, axis=0, return_counts=True)
		ngrams_report = []
		unique_ngrams = list(map(tuple, unique_ngrams))
		ngrams_report = pl.DataFrame({'ngram_token_ids': pl.Series("ngram_token_ids", unique_ngrams, dtype=pl.Object) , 'frequency': counts, 'ngram': ''}).sort('frequency', descending=True)
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		Corpus.results_cache[cache_id] = ngrams_report

	total_count = len(ngrams_report)

	resultset_start = page_size*page_current
	resultset_end = page_size*(page_current+1)

	# get specific chunk of report into pandas based on resultset_start:
	ngrams_report_page = ngrams_report.slice(resultset_start, page_size).to_pandas().set_index('ngram_token_ids')
	for ngram_token_ids, row in ngrams_report_page.iterrows():
		ngram_text = []
		for token in ngram_token_ids:
			ngram_text.append(Corpus.vocab[token])
		ngrams_report_page.at[ngram_token_ids, 'ngram'] = ' '.join(ngram_text)

	# set index back to column and set new rank index
	ngrams_report_page['token_ids'] = ngrams_report_page.index
	ngrams_report_page = ngrams_report_page.set_index('rank')
	
	# sort column display - add normalized_frequency and rank, optional ngram_token_ids
	logging.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')
	return ngrams_report_page[['ngram', 'frequency']], total_count

def ngrams_new(Corpus, token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0):
	token_sequence, index_id = Corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_word_position])
	if False: #cache_id in Corpus.results_cache:
		logging.info('Using cached ngrams results')
		ngrams_report = Corpus.results_cache[cache_id]
	else:
		token_index = Corpus.get_token_index(token_sequence, index_id)
		
		if len(token_index[0]) == 0:
			logging.info('No tokens found')
			return None, 0

		logging.info('Generating ngrams results')
		ngrams = get_ngrams_new(Corpus, token_sequence, index_id, token_index, ngram_length = ngram_length, ngram_word_position = ngram_word_position)
		# get schema token_1, token_2 etc based on ngram_length
		schema = [f'token_{i+1}' for i in range(ngram_length)]
		ngrams_report = pl.DataFrame(ngrams.T, schema=schema).to_struct(name = 'ngram_token_ids').value_counts(sort=True).rename({"count": "frequency"})
		# add ngram column with str type
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		Corpus.results_cache[cache_id] = ngrams_report

	total_count = len(ngrams_report)

	resultset_start = page_size*page_current
	resultset_end = page_size*(page_current+1)

	# get specific chunk of report into pandas based on resultset_start:
	ngrams_report_page = ngrams_report.slice(resultset_start, page_size).unnest('ngram_token_ids')#.to_pandas().set_index('ngram_token_ids')
	token_strs = []
	for i in range(ngram_length):
		token_ids = ngrams_report_page[f'token_{i+1}'].to_numpy()
		Corpus.results_cache['token_ids_for_test'] = token_ids
		_token_strs = Corpus.token_ids_to_tokens(token_ids)
		token_strs.append(_token_strs)
		#token_strs.append()
	token_strs = np.array(token_strs)
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text))
	ngrams_report_page = ngrams_report_page.to_pandas().set_index('rank')
	
	# sort column display - add normalized_frequency and rank, optional ngram_token_ids
	logging.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')
	return ngrams_report_page[['ngram', 'frequency']], total_count

#REMEMBER - disabled result cache in these functions 
# 
run_time_test = False

the_token_str = 'the'
if run_time_test:
	%timeit result = ngrams_old(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = ngrams_old(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0)
result = ngrams_old(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0)
print(result[0][:5])

if run_time_test:
	%timeit result = ngrams_new(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT')
else:
	%time result = ngrams_new(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0)
result = ngrams_new(rnz500, the_token_str, ngram_length = 2, ngram_word_position = 'LEFT', page_size = PAGE_SIZE, page_current = 0)
print(result[0][:5])



2025-02-12 15:29:52 - INFO - Tokenization time: 0.00009 seconds


2025-02-12 15:29:52 - INFO - Token indexing (939251) time: 0.03751 seconds
2025-02-12 15:29:52 - INFO - Generating ngrams results
2025-02-12 15:29:53 - INFO - Ngrams report time: 1.79165 seconds
2025-02-12 15:29:53 - INFO - Tokenization time: 0.00008 seconds
2025-02-12 15:29:53 - INFO - Token indexing (939251) time: 0.02977 seconds
2025-02-12 15:29:53 - INFO - Generating ngrams results


CPU times: user 1.78 s, sys: 20.3 ms, total: 1.8 s
Wall time: 1.79 s


2025-02-12 15:29:55 - INFO - Ngrams report time: 1.72422 seconds
2025-02-12 15:29:55 - INFO - Tokenization time: 0.00007 seconds
2025-02-12 15:29:55 - INFO - Token indexing (939251) time: 0.03745 seconds
2025-02-12 15:29:55 - INFO - Generating ngrams results
2025-02-12 15:29:55 - INFO - Ngrams report time: 0.14558 seconds
2025-02-12 15:29:55 - INFO - Tokenization time: 0.00008 seconds
2025-02-12 15:29:55 - INFO - Token indexing (939251) time: 0.03609 seconds
2025-02-12 15:29:55 - INFO - Generating ngrams results


               ngram  frequency
rank                           
1     the government      24369
2            the new      22748
3         the latest      17959
4        the country      15560
5          the first      12916
CPU times: user 151 ms, sys: 23 μs, total: 151 ms
Wall time: 148 ms


2025-02-12 15:29:55 - INFO - Ngrams report time: 0.18574 seconds


               ngram  frequency
rank                           
1     the government      24369
2            the new      22748
3         the latest      17959
4        the country      15560
5          the first      12916


In [None]:
def reindex(token_index):
	""" Takes as input a list of token ids (np.uint64) and reindexes that outputting a lookup and the reindexed token_index. """
	unique_values = np.unique(token_index)
	original_to_new = dict(zip(unique_values, range(len(unique_values))))
	new_to_original = dict(zip(range(len(unique_values)), unique_values))
	reindexed_array = np.array([original_to_new[x] for x in token_index], dtype=np.uint32)
	vocab = {k:nlp.vocab.strings[k] for k in unique_values}
	reindexed_vocab = {**{int(original_to_new[k]):nlp.vocab.strings[k] for k in vocab}, **{nlp.vocab.strings[k]:int(original_to_new[k]) for k in vocab}}
	return reindexed_array, reindexed_vocab, original_to_new, new_to_original

original_array = corpus.lower_index
reindexed_array, reindexed_vocab, original_to_new, new_to_original = reindex(original_array)

results = []
results.append(['original array', original_array.nbytes / 1024 / 1024])
results.append(['reindexed array', reindexed_array.nbytes / 1024 / 1024])
results.append(['original_to_new', sys.getsizeof(original_to_new) / 1024 / 1024])
results.append(['new_to_original', sys.getsizeof(new_to_original) / 1024 / 1024])
results.append(['original vocab', sys.getsizeof(vocab) / 1024 / 1024])
results.append(['reindexed vocab', sys.getsizeof(reindexed_vocab) / 1024 / 1024])

print('Memory usage:')
display(pd.DataFrame(results, columns=['data structure', 'size (MB)']))

with gzip.open('../test-corpora/rnz-vocab.pkl.gz', 'wb') as f:
	pickle.dump(vocab, f)
with gzip.open('../test-corpora/rnz-reindexed-vocab.pkl.gz', 'wb') as f:
	pickle.dump(reindexed_vocab, f)
with gzip.open('../test-corpora/rnz-original-to-new.pkl.gz', 'wb') as f:
	pickle.dump(original_to_new, f)
with gzip.open('../test-corpora/rnz-new-to-original.pkl.gz', 'wb') as f:
	pickle.dump(new_to_original, f)
np.savez_compressed('../test-corpora/rnz-original-array.npz', original_array)
np.savez_compressed('../test-corpora/rnz-reindexed-array.npz', reindexed_array)

results = []
results.append(['corpus source', os.path.getsize(corpus.path) / 1024 / 1024]) 

results.append(['vocab (pkl.gz)', os.path.getsize('../test-corpora/rnz-vocab.pkl.gz') / 1024 / 1024])
results.append(['original array (npz)', os.path.getsize('../test-corpora/rnz-original-array.npz') / 1024 / 1024])

results.append(['reindexed vocab (pkl.gz)', os.path.getsize('../test-corpora/rnz-reindexed-vocab.pkl.gz') / 1024 / 1024])
results.append(['original_to_new (pkl.gz)', os.path.getsize('../test-corpora/rnz-original-to-new.pkl.gz') / 1024 / 1024])
results.append(['new_to_original (pkl.gz)', os.path.getsize('../test-corpora/rnz-new-to-original.pkl.gz') / 1024 / 1024])
results.append(['reindexed array (npz)', os.path.getsize('../test-corpora/rnz-reindexed-array.npz') / 1024 / 1024])

print('File sizes:')
display(pd.DataFrame(results, columns=['data structure', 'size (MB)']))


Memory usage:


Unnamed: 0,data structure,size (MB)
0,original array,24.672913
1,reindexed array,12.336456
2,original_to_new,2.500084
3,new_to_original,2.500084
4,original vocab,10.000084
5,reindexed vocab,5.000076


File sizes:


Unnamed: 0,data structure,size (MB)
0,corpus source,38.819672
1,vocab (pkl.gz),3.710016
2,original array (npz),9.917922
3,reindexed vocab (pkl.gz),0.781233
4,original_to_new (pkl.gz),0.507931
5,new_to_original (pkl.gz),0.507943
6,reindexed array (npz),4.914656


In [None]:
#| export
def foo(): pass

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()