# core

> Helper functions and classes for Conc.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import re
import os
import logging
from great_tables import GT
import polars as pl
import msgspec
import spacy
from memory_profiler import _get_memory

In [None]:
#| export
# Module-level constants shared across Conc.
PAGE_SIZE = 20 # NOTE(review): presumably the default number of rows shown per page of results - confirm against callers
EOF_TOKEN_STR = ' conc-end-of-file-token' # the literal starts with a space
ERR_TOKEN_STR = 'ERROR: not a token' # NOTE(review): presumably a placeholder string for ids that do not map to a token - confirm against callers

In [None]:
#| export
# Project links and citation text.
DOCUMENTATION_URL = 'https://geoffford.nz/conc'
REPOSITORY_URL = 'https://github.com/polsci/conc'
PYPI_URL = '' # currently an empty string
CITATION_STR = '''If you use Conc in your work, please cite it as follows:''' # NOTE(review): the text ends after "as follows:" with no citation details - confirm whether they are appended elsewhere

In [None]:
#| hide
# Notebook-only paths for test data, rooted in the user's home directory.
# os.path.expanduser('~') is used rather than os.environ.get("HOME") because the latter returns None
# when HOME is not set, which would silently produce paths such as 'None/data/'.
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

In [None]:
#| hide
# Widen polars' printed output for this notebook: table width and string column length are both set to 300 characters.
# The second assignment replaces the value bound by the first; both calls are made for their effect on pl.Config.
polars_conf = pl.Config.set_tbl_width_chars(300)
polars_conf = pl.Config.set_fmt_str_lengths(300)

## Logging

In [None]:
#| exporti
class ConcLogger(logging.Logger):
	""" Custom logger for conc module.

	On creation the logger attaches a console handler (and a file handler when `log_file` is given).
	It adds `set_state` for switching between quiet and verbose output and `memory_usage` for
	logging the memory reading returned by memory_profiler.
	"""
	def __init__(self, name, level=logging.WARNING, log_file=None):
		super().__init__(name, level)
		self._setup_handler(log_file)
		self.last_memory_usage = None # last reading taken by memory_usage, None until one has been taken

	def _setup_handler(self, log_file = None):
		""" Attach a console handler, plus a file handler if log_file is set; both use the same format. """
		formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(funcName)s - %(message)s', 
									  datefmt='%Y-%m-%d %H:%M:%S')

		console_handler = logging.StreamHandler()
		console_handler.setFormatter(formatter)
		self.addHandler(console_handler)

		if log_file is not None:
			file_handler = logging.FileHandler(log_file)
			file_handler.setFormatter(formatter)
			self.addHandler(file_handler)

	def set_state(self, state:str # 'quiet' or 'verbose'
				  ):
		""" Set the logging level: 'quiet' logs WARNING and above, 'verbose' logs DEBUG and above.

		Raises ValueError for any other state.
		"""
		if state == 'quiet':
			level = logging.WARNING
		elif state == 'verbose':
			level = logging.DEBUG
		else:
			raise ValueError(f"Invalid state: {state}")
		
		self.setLevel(level)

	def memory_usage(self, message = '', init=False):
		""" Log current memory usage at INFO level, with the difference from the previous reading when there is one.

		`message` is prepended to the logged text. Pass `init=True` to discard the previous reading so
		that no difference is reported for this call.
		"""
		if init:
			self.last_memory_usage = None
		usage = _get_memory(-1, 'psutil', include_children=True)
		if self.last_memory_usage is not None:
			difference = usage - self.last_memory_usage
			memory_message = f', memory usage: {usage} MB, difference: {difference} MB'
		else:
			memory_message = f', memory usage: {usage} MB'
		# stacklevel=2 attributes the record to the caller of memory_usage, so the %(funcName)s field
		# in the formatter names the function being profiled instead of always showing 'memory_usage'
		self.info(f"{message}{memory_message}", stacklevel=2)
		self.last_memory_usage = usage


In [None]:
#| export
# Register ConcLogger before the logger is requested so that getLogger instantiates it as a ConcLogger.
# NOTE(review): setLoggerClass applies to every logger instantiated afterwards in the process, not only
# conc's - confirm that is intended, since each ConcLogger attaches its own console handler.
logging.setLoggerClass(ConcLogger)

logger = logging.getLogger(__name__)


In [None]:
#| export
def set_logger_state(state:str # 'quiet' or 'verbose'
					 ) -> None:
	""" Set the state of the conc logger to either 'quiet' or 'verbose'

	Delegates to the module-level `logger`'s `set_state` method, which maps 'quiet' to the WARNING
	level and 'verbose' to the DEBUG level and raises ValueError for any other value.
	"""
	logger.set_state(state)

## spaCy

In [None]:
#| hide
# This is a quick reminder of the available spacy attributes that can be output for a doc (depending on the model and pipe settings)
# print each named spaCy attribute with its id, leaving out the empty name and the FLAG* entries
for attr_name, attr_id in spacy.attrs.IDS.items():
	if not attr_name or attr_name.startswith('FLAG'):
		continue
	print(f'{attr_name}: {attr_id}')

IS_ALPHA: 1
IS_ASCII: 2
IS_DIGIT: 3
IS_LOWER: 4
IS_PUNCT: 5
IS_SPACE: 6
IS_TITLE: 7
IS_UPPER: 8
LIKE_URL: 9
LIKE_NUM: 10
LIKE_EMAIL: 11
IS_STOP: 12
IS_OOV_DEPRECATED: 13
IS_BRACKET: 14
IS_QUOTE: 15
IS_LEFT_PUNCT: 16
IS_RIGHT_PUNCT: 17
IS_CURRENCY: 18
ID: 64
ORTH: 65
LOWER: 66
NORM: 67
SHAPE: 68
PREFIX: 69
SUFFIX: 70
LENGTH: 71
LEMMA: 73
POS: 74
TAG: 75
DEP: 76
ENT_IOB: 77
ENT_TYPE: 78
ENT_ID: 454
ENT_KB_ID: 452
HEAD: 79
SENT_START: 80
SPACY: 81
LANG: 83
MORPH: 453
IDX: 455


In [None]:
#| export
def spacy_attribute_name(index):
	"""Return the spaCy attribute name whose id equals index (ValueError if there is none)."""

	attribute_ids = spacy.attrs.IDS
	names = list(attribute_ids.keys())
	# keys and values are listed in the same order, so the position of the id locates its name
	position = list(attribute_ids.values()).index(index)
	return names[position]

## Corpus metadata schema

In [None]:
#| export
class CorpusMetadata(msgspec.Struct): 
    """ JSON validation schema for corpus metadata

    Passed as the `type` when decoding a corpus's `corpus.json` with msgspec. No field declares a
    default, so each one has to be present in the JSON and match its annotated type.
    """
    # descriptive fields
    name: str
    description: str
    slug: str
    conc_version: str
    # counts (NOTE(review): presumably recorded when the corpus is built - confirm against the build code)
    document_count: int
    token_count: int
    word_token_count: int
    punct_token_count: int
    space_token_count: int
    unique_tokens: int
    unique_word_tokens: int
    date_created: str
    #source_path: str
    # token ids and spaCy model details (NOTE(review): meanings inferred from the names - confirm against the build code)
    EOF_TOKEN: int
    SPACY_EOF_TOKEN: int
    SPACY_MODEL: str
    SPACY_MODEL_VERSION: str
    punct_tokens: list[int]
    space_tokens: list[int]



In [None]:
#| echo: true
# Show the per-field JSON schema that msgspec generates from the CorpusMetadata struct.
properties = msgspec.json.schema(CorpusMetadata)['$defs']['CorpusMetadata']['properties']
display(properties)

{'name': {'type': 'string'},
 'description': {'type': 'string'},
 'slug': {'type': 'string'},
 'conc_version': {'type': 'string'},
 'document_count': {'type': 'integer'},
 'token_count': {'type': 'integer'},
 'word_token_count': {'type': 'integer'},
 'punct_token_count': {'type': 'integer'},
 'space_token_count': {'type': 'integer'},
 'unique_tokens': {'type': 'integer'},
 'unique_word_tokens': {'type': 'integer'},
 'date_created': {'type': 'string'},
 'EOF_TOKEN': {'type': 'integer'},
 'SPACY_EOF_TOKEN': {'type': 'integer'},
 'SPACY_MODEL': {'type': 'string'},
 'SPACY_MODEL_VERSION': {'type': 'string'},
 'punct_tokens': {'type': 'array', 'items': {'type': 'integer'}},
 'space_tokens': {'type': 'array', 'items': {'type': 'integer'}}}

## List available corpora

In [None]:
#| export
def list_corpora(
		path: str # path to load corpus
		) -> pl.DataFrame: # Dataframe with path, corpus, corpus name, document count, token count
	""" List the corpora found directly under a directory, one row per sub-directory containing a corpus.json. """

	metadata_fields = ['name', 'document_count', 'token_count', 'date_created']
	available_corpora = {'corpus': [], 'name': [], 'date_created': [], 'document_count': [], 'token_count': []}

	for entry in os.listdir(path):
		corpus_dir = os.path.join(path, entry)
		metadata_file = os.path.join(corpus_dir, 'corpus.json')
		# only directories holding a corpus.json count as corpora
		if not (os.path.isdir(corpus_dir) and os.path.isfile(metadata_file)):
			continue

		with open(metadata_file, 'rb') as f:
			metadata = msgspec.json.decode(f.read(), type=CorpusMetadata)

		available_corpora['corpus'].append(entry)
		for field in metadata_fields:
			value = getattr(metadata, field)
			# integer counts are stored as strings with thousands separators for display
			available_corpora[field].append(f'{value:,}' if isinstance(value, int) else value)

	return pl.DataFrame(available_corpora)

In [None]:
print(list_corpora(save_path))

shape: (8, 5)
┌──────────────────────────────────────────────┬───────────────────────────────────────┬─────────────────────┬────────────────┬─────────────┐
│ corpus                                       ┆ name                                  ┆ date_created        ┆ document_count ┆ token_count │
│ ---                                          ┆ ---                                   ┆ ---                 ┆ ---            ┆ ---         │
│ str                                          ┆ str                                   ┆ str                 ┆ str            ┆ str         │
╞══════════════════════════════════════════════╪═══════════════════════════════════════╪═════════════════════╪════════════════╪═════════════╡
│ introduce-yourself.corpus                    ┆ Introduce Yourself                    ┆ 2025-06-03 12:06:23 ┆ 28             ┆ 10,034      │
│ gutenberg.corpus                             ┆ Gutenberg Corpus                      ┆ 2025-06-03 12:05:42 ┆ 18             ┆ 2,777,

## Get word lists

In [None]:
#| export
def get_stop_words(save_path:str, # directory to save stop words to, file name will be created based on spaCy model name
				   spacy_model:str = 'en_core_web_sm' # model to get stop words for
					):
	""" Get stop words from spaCy and cache to disk

	Returns a set of stop word strings. If `<save_path>/<spacy_model>_stop_words.txt` exists, the
	words are read from it (one per line) and spaCy is not loaded. Otherwise the model is loaded,
	its default stop words are written to that file and returned.
	"""

	filename = f'{spacy_model}_stop_words.txt'
	save_to = os.path.join(save_path, filename)

	# try the cache first; a missing file just means the cache has not been built yet
	try:
		with open(save_to, 'r', encoding='utf-8') as f:
			return set(f.read().splitlines())
	except FileNotFoundError:
		pass

	nlp = spacy.load(spacy_model)
	# copy, so callers receive their own set rather than the model's shared default stop word set
	stop_words = set(nlp.Defaults.stop_words)
	del nlp

	os.makedirs(save_path, exist_ok=True)

	# sorted so the cache file has the same content every time it is generated
	with open(save_to, 'w', encoding='utf-8') as f:
		f.writelines(f'{word}\n' for word in sorted(stop_words))

	return stop_words

In [None]:
print(get_stop_words(save_path = save_path, spacy_model='en_core_web_sm'))

{'thru', 'also', 'whereafter', 'perhaps', 'behind', 'me', 'again', 'while', 'yourself', 'n’t', 'your', 'to', 'into', '’ve', 'would', 'each', 'see', 'other', 'am', 'somewhere', 'five', 'ever', 'you', 'besides', 'those', 'is', 'will', 'do', 'in', 'two', 'from', 'everywhere', 'an', 'off', 'therefore', 'already', 'sometimes', 'may', 'whose', 'why', 'has', 'it', 'up', 'her', 'keep', 'a', 'next', "'d", 'name', 'are', 'something', 'rather', 'does', 'eleven', 'part', 'herself', 'thence', 'we', 'now', 'moreover', 'out', 'of', 'due', 'mine', 'everything', 'as', 'ca', 'under', 'former', 'could', 'nine', 'what', 'without', 'n‘t', 'on', 'both', 'nothing', 'third', 'him', 'most', 'their', 'until', 'wherever', 'myself', 'often', 'among', 'whereby', 'never', 'many', 'say', '’re', 'regarding', 'seems', 'wherein', 'make', 'and', 'anyhow', 'put', 'beside', 'thereafter', 'always', 'afterwards', 'against', 'only', 'who', 'been', 'upon', 'by', 'enough', 'us', 'four', 'really', 'yourselves', 'few', 'none', '

## Get data sources

In [None]:
#| exporti

# Toy corpus rows: source file name, text, category, species
toy_data = [
	['1.txt', 'The cat sat on the mat.', 'feline', 'cat'],
	['2.txt', 'The dog sat on the mat.', 'canine', 'dog'],
	['3.txt', 'The cat is meowing.', 'feline', 'cat'],
	['4.txt', 'The dog is barking.', 'canine', 'dog'],
	['5.txt', 'The cat is climbing a tree.', 'feline', 'cat'],
	['6.txt', 'The dog is digging a hole.', 'canine', 'dog'],
]

In [None]:
#| hide
# checking on counts above
# Independent count of the toy corpus using a simple regex tokenizer (words, or single punctuation characters)
toy_data_test = [doc[1] for doc in toy_data] 
toy_data_test = [re.findall(r'\b\w+\b|[^\w\s]', text) for text in toy_data_test]
# flatten to one lowercased token list
toy_data_test = [token.lower() for sublist in toy_data_test for token in sublist if token.strip()]
#print(toy_data_test)
# token count
print(len(toy_data_test)) # should be 38
# word token count (the only punctuation in the toy texts is the full stop)
print(len(toy_data_test) - sum([1 for token in toy_data_test if token == '.'])) # should be 32
toy_data_test_unique = set(toy_data_test)
# unique tokens - should be 15
print(len(toy_data_test_unique))
toy_data_test_unique_word = set([token for token in toy_data_test_unique if token != '.'])
# unique word tokens - should be 14
print(len(toy_data_test_unique_word))

# based on this - toy corpus should have ... 
# document_count = 6
# token_count = 38
# word_token_count = 32
# unique_tokens = 15
# unique_word_tokens = 14

38
32
15
14


In [None]:
#| export
def create_toy_corpus_sources(source_path:str # path to location of sources for building corpora
							 ):
	""" Write the toy corpus as one txt file per document plus csv files, for testing corpus builds. """

	toy_dir = os.path.join(source_path, 'toy')
	if not os.path.exists(toy_dir):
		os.makedirs(toy_dir, exist_ok=True)

	# one txt file per document, named by the row's source column
	for filename, text, *_ in toy_data:
		with open(f'{source_path}/toy/{filename}', 'w', encoding='utf-8') as f:
			f.write(text)

	columns = (('source', str), ('text', str), ('category', str), ('species', str))
	toy_df = pl.DataFrame(toy_data, orient='row', schema=columns)
	# NOTE(review): both files are written by the same write_csv call with no explicit compression
	# step - confirm whether the .csv.gz file is expected to be gzip-compressed
	for csv_name in ('toy.csv', 'toy.csv.gz'):
		toy_df.write_csv(os.path.join(source_path, csv_name))


In [None]:
#| hide
#| eval: false
create_toy_corpus_sources(source_path)

In [None]:
#| export
def show_toy_corpus(
        csv_path:str # path to location of csv for building corpora
        ) -> None: 
    """ Show toy corpus in a table.

    Reads the csv at `csv_path` and displays it as a Great Tables table. The table is shown
    rather than returned, so the function itself returns None.
    """
    
    toy_corpus_df = pl.read_csv(csv_path)
    # left margin set to 0 on the table before showing it
    GT(toy_corpus_df).tab_options(table_margin_left = 0).show()

In [None]:
show_toy_corpus(os.path.join(source_path, 'toy.csv'))

source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog


In [None]:
#| export
def get_nltk_corpus_sources(source_path:str # path to location of sources for building corpora
							 ):
	""" Get NLTK corpora as sources for development or testing Conc functionality.

	Downloads the Gutenberg, Brown and Reuters corpora through NLTK and writes each one to
	`<source_path>/<corpus>.csv.gz` with the columns `source` and `text`. The Brown texts are
	additionally written as individual .txt files under `<source_path>/brown/`.

	Raises ImportError if NLTK is not installed.
	"""

	try:
		import nltk
	except ImportError as e:
		raise ImportError('This function requires NLTK. To minimise requirements this is not installed by default. You can install NLTK with "pip install nltk"') from e

	for package in ('gutenberg', 'brown', 'reuters'):
		nltk.download(package)
	from nltk.corpus import gutenberg
	from nltk.corpus import reuters
	from nltk.corpus import brown

	# matches a run of non-space characters followed by a /tag suffix; compiled once for all texts
	# NOTE(review): `[^ ]+` also matches newlines and tabs, so a tag at the end of a line can extend
	# up to the next space character. `\S+` may have been intended - confirm before changing, as it
	# would alter the content of the generated test corpora.
	tag_pattern = re.compile(r"(\S+)(/[^ ]+)")

	def clean_text(text):
		# keep the word/punctuation and drop the /tag that follows it
		return tag_pattern.sub(r"\1", text)

	def write_corpus_csv(rows, filename):
		# rows are [source, text] pairs
		df = pl.DataFrame(rows, orient='row', schema=(('source', str), ('text', str)))
		df.write_csv(os.path.join(source_path, filename))

	# creating the brown directory also creates source_path when it does not exist yet
	brown_dir = os.path.join(source_path, 'brown')
	os.makedirs(brown_dir, exist_ok=True)

	corpus_data = []
	for fileid in brown.fileids():
		# clean each text once and reuse it for both the csv row and the .txt file
		text = clean_text(brown.raw(fileid))
		corpus_data.append([fileid, text])
		with open(os.path.join(brown_dir, f'{fileid}.txt'), 'w', encoding='utf-8') as f:
			f.write(text)
	write_corpus_csv(corpus_data, 'brown.csv.gz')

	corpus_data = [[fileid, clean_text(gutenberg.raw(fileid))] for fileid in gutenberg.fileids()]
	write_corpus_csv(corpus_data, 'gutenberg.csv.gz')

	corpus_data = []
	for fileid in reuters.fileids():
		# the source name is the part of the file id after the first '/'
		fileid_name = fileid.split('/')[1]
		corpus_data.append([fileid_name, clean_text(reuters.raw(fileid))])
	write_corpus_csv(corpus_data, 'reuters.csv.gz')


In [None]:
#| hide
#| eval: false
get_nltk_corpus_sources(source_path)

[nltk_data] Downloading package gutenberg to /home/geoff/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /home/geoff/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to /home/geoff/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


The texts for the Brown corpus from NLTK can be used to test Conc functionality. The Reuters and Gutenberg corpora are also prepared by `get_nltk_corpus_sources`. Running the function will download the texts and save each corpus as a .csv.gz file with columns: source and text. The Brown Corpus is also saved as .txt files to test the Corpus.build_from_texts method.

In [None]:
#| export
def get_garden_party(source_path: str #path to location of sources for building corpora
					):
	""" Get corpus of The Garden Party by Katherine Mansfield for development of Conc and testing Conc functionality.

	Downloads garden-party-corpus.zip into `source_path` and creates garden-party-corpus.tar.gz and
	garden-party-corpus.tar from its contents alongside it. `source_path` is created if it does
	not exist.

	Raises ImportError if the requests library is not installed, and the requests HTTP error if
	the download does not succeed.
	"""

	# requests is imported only inside the try block so that a missing install produces the message below
	try:
		import requests
	except ImportError as e:
		raise ImportError('This function requires the requests library. To minimise requirements this is not installed by default. You can install requests with "pip install requests"') from e
	import shutil
	import zipfile

	path = 'https://github.com/ucdh/scraping-garden-party/raw/master/garden-party-corpus.zip'

	os.makedirs(source_path, exist_ok=True)
	zip_path = os.path.join(source_path, 'garden-party-corpus.zip')
	extract_dir = os.path.join(source_path, 'garden-party-corpus')

	r = requests.get(path)
	# fail here on an HTTP error rather than saving an error page as the zip file
	r.raise_for_status()
	with open(zip_path, 'wb') as f:
		f.write(r.content)

	# converting to .tar and tar.gz files for testing
	with zipfile.ZipFile(zip_path, 'r') as z:
		z.extractall(extract_dir)
	try:
		# make_archive appends the extension, giving garden-party-corpus.tar.gz and garden-party-corpus.tar
		shutil.make_archive(extract_dir, 'gztar', extract_dir)
		shutil.make_archive(extract_dir, 'tar', extract_dir)
	finally:
		# the extracted copy is only needed to build the archives
		shutil.rmtree(extract_dir)
	

The `get_garden_party` function downloads a zip file of an example corpus based on Katherine Mansfield's short stories. It also creates .tar and .tar.gz versions of the texts for testing Corpus build methods.

In [None]:
#| eval: false
get_garden_party(source_path)

## Create large corpora for development and testing

In [None]:
#| export
def get_large_dataset(source_path: str #path to location of sources for building corpora
                    ):
    """ Save a 1m row sample of https://huggingface.co/datasets/Eugleo/us-congressional-speeches-subset as a csv for testing. """
    columns = ['speech_id', 'date', 'speaker', 'chamber', 'state', 'text']
    output_file = f'{source_path}/us-congressional-speeches-subset-1m.csv.gz'

    speeches = pl.read_parquet('hf://datasets/Eugleo/us-congressional-speeches-subset/data/train-*.parquet')
    # sample the rows first, then keep only the columns written to the csv
    sampled = speeches.sample(1000000)
    sampled.select(columns).write_csv(output_file)
    del speeches


In [None]:
#| hide
# get_large_dataset(source_path)

In [None]:
#| hide
# Define the chunk size
# chunk_size = 100_000  # Adjust based on your memory constraints

# # Lazily load the CSV file
# df = pl.scan_csv(f'{source_path}us-congressional-speeches-subset-1m.csv.gz')

# # Add the new column 'is_empty'
# df = df.with_columns(
#     (pl.col('text').str.strip_chars().eq('')).alias('is_empty')
# )

# # get length of is_empty where True
# count = df.filter(pl.col("is_empty") == True).collect().height
# print(f"Number of empty rows: {count}")

# any empty?
#len(df[df['text'].is_null()])

# get distribution of date (by year), speaker, chamber, state
# dates are in iso format - extract year and summarize
# df = df.with_columns(pl.col('date').str.slice(0, 4).alias('year'))
# df.group_by('year').agg(pl.count('year').alias('count')).sort('year', descending=True).head(10).collect()

# #df.group_by('speaker').agg(pl.count('speaker').alias('count')).sort('count', descending=True).head(20)
# #df.group_by('chamber').agg(pl.count('chamber').alias('count')).sort('count', descending=True).head(20)
# df.group_by('state').agg(pl.count('state').alias('count')).sort('count', descending=True).head(20)

In [None]:
#| export
def create_large_dataset_sizes(source_path: str, #path to location of sources for building corpora
						sizes: list = [10000, 100000, 200000, 500000] # list of sizes for test data-sets
						):
	""" Create datasets of different sizes from data source retrieved by get_large_dataset for testing.

	For each size a random sample of that many rows is written to
	`<source_path>/us-congressional-speeches-subset-<size in thousands>k.csv.gz`.
	`sizes` is only iterated, never modified. Nothing is read or written when it is empty.
	"""
	if not sizes:
		return

	# the source file does not change between sizes, so read it once rather than once per size
	df = pl.read_csv(f'{source_path}/us-congressional-speeches-subset-1m.csv.gz')
	for max_i in sizes:
		max_i_label = int(max_i / 1000)
		# logged before the work starts so the message matches what is happening
		logger.info(f'Creating dataset of {max_i_label}k rows')
		df.sample(max_i).write_csv(f'{source_path}/us-congressional-speeches-subset-{max_i_label}k.csv.gz')


In [None]:
#| hide
# create_large_dataset_sizes(source_path)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()