# listcorpus

> Representation of frequency information for a corpus, which can be used as a reference corpus for keyword analysis.
- toc: false
- page-layout: full

Note: to generate a frequency table for a corpus, see `Conc.frequencies`.

In [None]:
#| default_exp listcorpus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from fastcore.basics import patch
import shutil
from slugify import slugify
import time
import msgspec
import polars as pl
import os


In [None]:
#| export
from conc import __version__
from conc.core import logger, CorpusMetadata, PAGE_SIZE, EOF_TOKEN_STR, ERR_TOKEN_STR, REPOSITORY_URL, DOCUMENTATION_URL, CITATION_STR, PYPI_URL
from conc.corpus import Corpus
from conc.result import Result

In [None]:
#| hide
from conc.core import set_logger_state, spacy_attribute_name

In [None]:
#| hide
# Test data lives under the user's home directory. expanduser('~') is used instead of
# os.environ.get("HOME") because the latter returns None when HOME is unset, which
# would silently produce the literal path 'None/data/'.
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

## ListCorpus class

In [None]:
#| export
class ListCorpus:
	"""Representation of a corpus based on frequency information, which can be loaded as a reference corpus.

	A new instance only carries the supplied name and description; every other
	attribute starts as None (or an empty container) until it is populated elsewhere.
	"""
	
	def __init__(self, 
				name: str = '', # name of corpus
				description: str = '' # description of corpus
				):
		# descriptive information about the corpus
		self.name, self.description = name, description
		self.slug = None

		# version of conc used to build the corpus
		self.conc_version = None

		# locations on disk
		self.corpus_path = self.source_path = None

		# spaCy settings
		self.SPACY_MODEL = self.SPACY_MODEL_VERSION = None
		# EOF_TOKEN_STR starts with a space so the EOF token can't match anything from the corpus
		self.SPACY_EOF_TOKEN = self.EOF_TOKEN = None

		# ids of special tokens
		self.punct_tokens = self.space_tokens = None

		# corpus-level counts
		self.document_count = self.token_count = self.unique_tokens = None
		self.word_token_count = self.unique_word_tokens = None

		self.date_created = None

		# token data
		self.tokens = self.vocab = None
		self.puncts = self.spaces = None

		# per-document metadata
		self.metadata = None

		# fresh containers per instance
		self.ngram_index = {}
		self.results_cache = {}

		# files that must exist in a list corpus directory, and the parquet tables exposed as attributes
		self.expected_files_ = ['listcorpus.json', 'vocab.parquet']
		self.required_tables_ = ['vocab']


In [None]:
#| export
# Markdown template for the README.md written into a list corpus directory.
# The {placeholders} are filled in with str.format when the list corpus is built.
# Lines ending in two spaces are intentional Markdown hard line breaks.
LISTCORPUS_README_TEMPLATE = """# {name}

## About

This directory contains frequency data for a corpus created using the [Conc]({REPOSITORY_URL}) Python library. 
This can be used as a reference corpus for keyword analysis with Conc. 

## Original Corpus Information

{description}

Date created: {date_created}  
Document count: {document_count}  
Token count: {token_count}  
Word token count: {word_token_count}  
Unique tokens: {unique_tokens}  
Unique word tokens: {unique_word_tokens}  
Conc Version Number: {conc_version}  
spaCy model: {SPACY_MODEL}, version {SPACY_MODEL_VERSION}  

## Using this reference corpus
 
Conc can be installed [via pip]({PYPI_URL}). The [Conc documentation site]({DOCUMENTATION_URL}) 
has tutorials and detailed information to get you started with Conc or to work with the corpus 
data directly.  

## Using the frequency information

You can work with this data directly using the Polars library, or using another 
library that supports the Parquet format. See the [Conc documentation site]({DOCUMENTATION_URL}) 
for more information.  

## Cite Conc

{CITATION_STR}

"""

In [None]:
#| export
@patch
def build_from_corpus(self: ListCorpus, 
				   source_corpus_path:str, # path to a Conc corpus directory
				   save_path:str # directory where corpus will be created, a subdirectory will be automatically created with the corpus content
		  ) -> ListCorpus: # the list corpus that was built (self), so calls can be chained
	""" Build a List Corpus from a Conc corpus. 

	Copies corpus.json (as listcorpus.json) and vocab.parquet from the source corpus into
	`<save_path>/<slug>.listcorpus`, writes a README.md, and rewrites vocab.parquet with
	document frequency columns computed from the source corpus's tokens.parquet.

	Raises FileNotFoundError if the source path is not a directory or the expected
	source files are missing.
	"""
	
	if not os.path.isdir(source_corpus_path):
		raise FileNotFoundError(f"Source corpus path '{source_corpus_path}' does not exist or is not a directory.")
	
	start_time = time.time()

	# normpath drops any trailing separator first: basename('/data/toy.corpus/') is ''
	# and would otherwise produce an output directory called '.listcorpus'
	self.slug = os.path.basename(os.path.normpath(source_corpus_path))
	corpus_suffix = '.corpus'
	if self.slug.endswith(corpus_suffix):
		self.slug = self.slug[:-len(corpus_suffix)]
	self.corpus_path = os.path.join(save_path, f'{self.slug}.listcorpus')

	os.makedirs(self.corpus_path, exist_ok=True)

	try:
		shutil.copy(os.path.join(source_corpus_path, 'corpus.json'), os.path.join(self.corpus_path, 'listcorpus.json'))
		shutil.copy(os.path.join(source_corpus_path, 'vocab.parquet'), os.path.join(self.corpus_path, 'vocab.parquet'))
	except FileNotFoundError as err:
		# keep the original error as the cause so the missing file is visible in the traceback
		raise FileNotFoundError(f"Expected files not found in source corpus path '{source_corpus_path}'. Is this a valid Conc corpus?") from err
	
	# loading populates the metadata attributes used in the README and sets self.vocab
	self.load(corpus_path=self.corpus_path)

	with open(f'{self.corpus_path}/README.md', 'w', encoding='utf-8') as f:
		f.write(LISTCORPUS_README_TEMPLATE.format(
			name=self.name,
			REPOSITORY_URL=REPOSITORY_URL,
			PYPI_URL=PYPI_URL,
			DOCUMENTATION_URL=DOCUMENTATION_URL,
			CITATION_STR=CITATION_STR,
			description=self.description,
			date_created=self.date_created,
			document_count=self.document_count,
			token_count=self.token_count,
			word_token_count=self.word_token_count,
			unique_tokens=self.unique_tokens,
			unique_word_tokens=self.unique_word_tokens,
			conc_version=self.conc_version,
			SPACY_MODEL=self.SPACY_MODEL,
			SPACY_MODEL_VERSION=self.SPACY_MODEL_VERSION
		))

	# adding document counts for each token: number of distinct documents per token id,
	# once for the lower-cased form and once for the orthographic form
	tokens = pl.scan_parquet(os.path.join(source_corpus_path, 'tokens.parquet'))
	for form in ('lower', 'orth'):
		document_counts = tokens.select(pl.col(f'{form}_index').alias('token_id'), pl.col('token2doc_index')).group_by('token_id').agg(pl.col('token2doc_index').n_unique().alias(f'document_frequency_{form}'))
		self.vocab = self.vocab.join(document_counts, on='token_id', how='left', maintain_order='left')
	
	# rewriting the vocab file with doc frequencies
	# collect() materialises the joined table before the file it was scanned from is overwritten
	self.vocab.collect().write_parquet(os.path.join(self.corpus_path, 'vocab.parquet'))

	# re-scan so self.vocab points at the rewritten file
	self._init_corpus_dataframes()

	logger.info(f"Built ListCorpus {os.path.basename(self.corpus_path)} {time.time() - start_time:.2f} seconds.")

	return self


In [None]:
#| exporti
@patch
def _init_corpus_dataframes(self: ListCorpus):
	""" Check the corpus files exist and attach each required table as a lazily scanned parquet frame. """

	# stop at the first expected file that is absent from the corpus directory
	missing = next((name for name in self.expected_files_ if not os.path.isfile(os.path.join(self.corpus_path, name))), None)
	if missing is not None:
		raise FileNotFoundError(f"Expected file '{missing}' not found in corpus path '{self.corpus_path}'")

	# each required table becomes an attribute named after the table
	for table in self.required_tables_:
		setattr(self, table, pl.scan_parquet(f'{self.corpus_path}/{table}.parquet'))

In [None]:
#| export
@patch
def load(self: ListCorpus, 
		 corpus_path: str # path to load corpus
		 ):
	""" Read a list corpus directory from disk into this object and return it. """

	logger.memory_usage('init', init=True)

	started = time.time()

	if not os.path.isdir(corpus_path):
		raise FileNotFoundError(f"Path '{corpus_path}' is not a directory")

	# every expected file has to be present before anything is read
	for expected_file in self.expected_files_:
		if not os.path.isfile(os.path.join(corpus_path, expected_file)):
			raise FileNotFoundError(f"Path '{corpus_path}' does not contain all expected files: {self.expected_files_}")

	self.corpus_path = corpus_path

	with open(f'{self.corpus_path}/listcorpus.json', 'rb') as metadata_file:
		raw_metadata = metadata_file.read()
	corpus_metadata = msgspec.json.decode(raw_metadata, type=CorpusMetadata)

	# copy each decoded metadata field onto the instance under the same name
	for field in corpus_metadata.__slots__:
		setattr(self, field, getattr(corpus_metadata, field))

	self._init_corpus_dataframes()

	elapsed = time.time() - started
	logger.info(f'Load time: {elapsed:.3f} seconds')

	return self

In [None]:
#| export
@patch
def info(self: ListCorpus, 
		 formatted:bool = True # return formatted output
		 ) -> pl.DataFrame: # table with one row per attribute, in columns 'Attribute' and 'Value'
	""" Return information about the list corpus. 

	Values are returned as strings; integers are rendered with thousands separators.
	When `formatted` is True the attribute names are made human-readable
	(underscores replaced with spaces, title case).
	"""
	
	attributes = ['name', 'description', 'date_created', 'conc_version', 'corpus_path', 'document_count', 'token_count', 'word_token_count', 'unique_tokens', 'unique_word_tokens']

	result = []
	for attr in attributes:
		value = getattr(self, attr)
		# bool is a subclass of int, so exclude it explicitly from the numeric formatting
		if isinstance(value, int) and not isinstance(value, bool):
			result.append(f'{value:,}')
		else:
			result.append(str(value))
			
	if formatted:
		attributes = [attr.replace('_', ' ').title() for attr in attributes]

	return pl.DataFrame({'Attribute': attributes, 'Value': result})


In [None]:
#| export
@patch
def report(self: ListCorpus, 
			) -> Result: # returns Result object with corpus summary information
	""" Wrap the list corpus information table in a Result object. """
	summary_table = self.info()
	return Result('summary', summary_table, 'List Corpus Summary', '', {}, [])

In [None]:
#| export
@patch
def summary(self: ListCorpus, 
			include_memory_usage:bool = False # accepted for compatibility; currently has no effect on the output
			):
	""" Print information about the list corpus in a formatted table. """
	# info() only takes `formatted`; passing include_memory_usage positionally would
	# silently switch formatting off by default, so info() is called with its defaults here
	result = Result('summary', self.info(), 'List Corpus Summary', '', {}, [])
	result.display()

In [None]:
#| exporti
@patch
def __str__(self: ListCorpus):
	""" String form of the list corpus information table. """
	info_table = self.info()
	return str(info_table)



In [None]:
#| hide
# build a list corpus from the toy corpus, then check its metadata and the files written to disk
set_logger_state('verbose')
listcorpus = ListCorpus().build_from_corpus(source_corpus_path = f'{save_path}toy.corpus', save_path = save_path)
listcorpus.summary()
assert listcorpus.name == 'Toy Corpus'
assert listcorpus.token_count == 38
assert os.path.isdir(listcorpus.corpus_path)
assert os.path.isfile(f'{listcorpus.corpus_path}/README.md')
assert os.path.isfile(f'{listcorpus.corpus_path}/vocab.parquet')
assert os.path.isfile(f'{listcorpus.corpus_path}/listcorpus.json')
assert isinstance(listcorpus.vocab, pl.LazyFrame)
set_logger_state('quiet')

2025-07-10 20:37:32 - INFO - memory_usage - init, memory usage: 164.9765625 MB
2025-07-10 20:37:32 - INFO - load - Load time: 0.002 seconds
2025-07-10 20:37:32 - INFO - build_from_corpus - Built ListCorpus toy.listcorpus 0.03 seconds.


List Corpus Summary,List Corpus Summary
Attribute,Value
name,Toy Corpus
description,Toy corpus is a very small dataset for testing and library development.
date_created,2025-07-09 09:21:39
conc_version,0.1.6
corpus_path,/home/geoff/data/conc-test-corpora/toy.listcorpus
document_count,6
token_count,38
word_token_count,32
unique_tokens,15
unique_word_tokens,14


In [None]:
#| hide
# load the list corpus from disk and check its metadata and vocab table
set_logger_state('verbose')
listcorpus = ListCorpus().load(corpus_path = f'{save_path}toy.listcorpus')
assert listcorpus.name == 'Toy Corpus'
assert listcorpus.token_count == 38
assert isinstance(listcorpus.vocab, pl.LazyFrame)
set_logger_state('quiet')

2025-07-10 20:37:32 - INFO - memory_usage - init, memory usage: 192.93359375 MB
2025-07-10 20:37:32 - INFO - load - Load time: 0.001 seconds


## Information on working with the list corpus format

To create a list corpus you first need to create a standard corpus using `Conc.corpus`. See the [recipes](https://geoffford.nz/conc/tutorials/recipes.html) for examples. 

Note: if you intend to use the list corpus as a reference corpus for keyness analysis, it will probably be helpful to add `standardize_word_token_punctuation_characters` to the build method when building the source corpus. This will ensure that word tokens with punctuation (e.g. n't) use the same apostrophe character and allow Conc to handle these differences when calculating keyness.

Once the source corpus has been created, you create a list corpus by passing in the path to the corpus directory ... 

In [None]:
listcorpus = ListCorpus().build_from_corpus(source_corpus_path = f'{save_path}garden-party.corpus', save_path = save_path)

In [None]:
#| hide
from pathlib import Path

List corpus will copy some of the data from the corpus, and add document frequency information for each token. Conc uses the .listcorpus suffix on directories to differentiate standard corpora from list corpora. The directory for the list corpus will contain corpus information in listcorpus.json, the frequency information in the vocab.parquet file, and a human-readable README.md to aid sharing the data. 

In [None]:
#| echo: false
def print_directory_tree(path, prefix="", restrict_to=None):
	"""Print the contents of `path` as an indented tree.

	path: directory to list (str or Path)
	prefix: text printed before each entry; used to indent nested levels
	restrict_to: if given, only entries at this level whose name contains this
		substring are printed; the filter is not applied to nested directories
	"""
	path = Path(path)
	# sort by name so the listing is stable; iterdir() yields entries in arbitrary order
	contents = sorted(path.iterdir(), key=lambda child: child.name)
	# filter before choosing connectors so the last *printed* entry gets the closing '└── '
	if restrict_to is not None:
		contents = [child for child in contents if restrict_to in child.name]
	for position, child in enumerate(contents, start=1):
		is_last = position == len(contents)
		print(prefix + ('└── ' if is_last else '├── ') + child.name)
		if child.is_dir():
			# restrict_to is deliberately not passed down: it only filters the top level
			print_directory_tree(child, prefix + ('    ' if is_last else '│   '))

# Example usage: show current directory
print_directory_tree(f'{save_path}', '', restrict_to='garden-party.listcorpus')


├── garden-party.listcorpus
│   ├── vocab.parquet
│   ├── README.md
│   └── listcorpus.json


You can access summary information with the same methods as the `Conc.corpus` class. 

For example ...

In [None]:
listcorpus.summary()

List Corpus Summary,List Corpus Summary
Attribute,Value
name,Garden Party Corpus
description,A corpus of short stories from The Garden Party: and Other Stories by Katherine Mansfield. Texts downloaded from Project Gutenberg https://gutenberg.org/ and are in the public domain. The text files contain the short story without the title. https://github.com/ucdh/scraping-garden-party
date_created,2025-07-09 11:15:56
conc_version,0.1.6
corpus_path,/home/geoff/data/conc-test-corpora/garden-party.listcorpus
document_count,15
token_count,74664
word_token_count,59514
unique_tokens,5410
unique_word_tokens,5392


In [None]:
#| hide
# listcorpus = ListCorpus().build_from_corpus(source_corpus_path = f'{save_path}brown.corpus', save_path = save_path)
# listcorpus = ListCorpus().build_from_corpus(source_corpus_path = f'{save_path}bnc.corpus', save_path = save_path)
# listcorpus = ListCorpus().build_from_corpus(source_corpus_path = f'{save_path}baby-bnc.corpus', save_path = save_path)

This preview of the vocab table shows the available columns in case you want to access the data directly. The [anatomy](https://geoffford.nz/conc/explanations/anatomy.html) page has information on the columns from the standard Conc corpus format that are relevant to working with a list corpus.

In [None]:
display(listcorpus.vocab.head(1000).collect().sample(10))

rank,tokens_sort_order,token_id,token,frequency_lower,frequency_orth,is_punct,is_space,document_frequency_lower,document_frequency_orth
509,4863,1482,"""sky""",17.0,17,False,False,8.0,8
284,3999,4150,"""pink""",32.0,32,False,False,10.0,10
399,3719,3175,"""On""",,22,False,False,,8
450,2504,1071,"""held""",19.0,19,False,False,9.0,9
87,5523,691,"""this""",138.0,112,False,False,14.0,14
491,6171,1788,"""women""",20.0,18,False,False,10.0,9
256,6110,5588,"""why""",99.0,37,False,False,12.0,10
720,4974,3520,"""somebody""",13.0,11,False,False,5.0,5
398,2922,2620,"""Kember""",,22,False,False,,1
472,2016,1213,"""followed""",18.0,18,False,False,8.0,8


In [None]:
#| export
@patch
def get_token_count_text(self: ListCorpus, 
					exclude_punctuation:bool = False # exclude punctuation tokens from the count
					) -> tuple[int, str, str]: # token count with adjustments based on exclusions, token descriptor, total descriptor
	""" Return the relevant token count together with the descriptor text to use in output. """

	# punctuation excluded: report the word-token total only
	if exclude_punctuation:
		return self.word_token_count, 'word tokens', 'Total word tokens'

	# default: the total covers both word and punctuation tokens
	return self.token_count, 'word and punctuation tokens', 'Total word and punctuation tokens'

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()