# core

> Helper functions and classes for Conc.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import re
import os
import logging
from great_tables import GT
import polars as pl
import msgspec
import spacy
from memory_profiler import _get_memory

In [None]:
#| export
# Not referenced elsewhere in this file; presumably the default number of rows per page in result displays — confirm against callers.
PAGE_SIZE = 20
# Text form of the end-of-file marker token. NOTE(review): the leading space looks deliberate (presumably so it cannot collide with a real token) — confirm.
EOF_TOKEN_STR = ' conc-end-of-file-token'
# Placeholder text for an invalid token; presumably returned when a token id cannot be resolved — confirm against callers.
ERR_TOKEN_STR = 'ERROR: not a token'

In [None]:
#| export
# Project links and citation text. Not referenced elsewhere in this file; presumably surfaced by other conc modules — confirm.
DOCUMENTATION_URL = 'https://geoffford.nz/conc'
REPOSITORY_URL = 'https://github.com/polsci/conc'
PYPI_URL = '' # NOTE(review): empty string — presumably a placeholder until the PyPI URL is filled in; confirm
CITATION_STR = '''If you use Conc in your work, please cite it as follows:''' # NOTE(review): ends with a colon and no citation follows — presumably appended elsewhere; confirm

In [None]:
#| hide
# Paths used by the hidden test/demo cells of this notebook.
# os.path.expanduser('~') is used instead of os.environ.get("HOME") because the latter
# returns None when HOME is unset (e.g. on Windows), which produced the literal path 'None/data/'.
source_path = f'{os.path.expanduser("~")}/data/'
# Test corpora are kept in a sub-directory of source_path.
save_path = f'{source_path}conc-test-corpora/'

In [None]:
#| hide
# Widen polars output for this notebook: set_tbl_width_chars and set_fmt_str_lengths are both set to 300.
# NOTE(review): `polars_conf` is overwritten by the second call and is not used again in this file.
polars_conf = pl.Config.set_tbl_width_chars(300)
polars_conf = pl.Config.set_fmt_str_lengths(300)

## Logging

In [None]:
#| exporti
class ConcLogger(logging.Logger):
	""" Logger for the conc module: attaches its own handlers when created and can log memory usage. """

	def __init__(self, name, level=logging.WARNING, log_file=None):
		super().__init__(name, level)
		self._setup_handler(log_file)
		# Reading from the previous memory_usage() call; None until the first call
		self.last_memory_usage = None

	def _setup_handler(self, log_file = None):
		""" Attach a console handler, plus a file handler when `log_file` is given; both share one format. """
		shared_formatter = logging.Formatter(
			fmt='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
			datefmt='%Y-%m-%d %H:%M:%S',
		)

		def attach(handler):
			handler.setFormatter(shared_formatter)
			self.addHandler(handler)

		attach(logging.StreamHandler())
		if log_file is None:
			return
		attach(logging.FileHandler(log_file))

	def set_state(self, state:str # 'quiet' or 'verbose'
				  ):
		""" Set the level to WARNING ('quiet') or DEBUG ('verbose'); any other value raises ValueError. """
		known_states = (('quiet', logging.WARNING), ('verbose', logging.DEBUG))
		for state_name, state_level in known_states:
			if state == state_name:
				self.setLevel(state_level)
				return
		raise ValueError(f"Invalid state: {state}")

	def memory_usage(self, message = '', init=False):
		""" Log `message` with current memory usage at INFO level, adding the change since the last call unless `init` resets it. """
		if init:
			self.last_memory_usage = None
		previous = self.last_memory_usage
		current = _get_memory(-1, 'psutil', include_children=True)
		memory_message = f', memory usage: {current} MB'
		if previous is not None:
			memory_message += f', difference: {current - previous} MB'
		self.info(f"{message}{memory_message}")
		self.last_memory_usage = current


In [None]:
#| export
# Register ConcLogger as the logger class before creating the module logger, so that
# `logger` below is a ConcLogger and exposes set_state() / memory_usage().
# NOTE(review): logging.setLoggerClass applies process-wide, so loggers created afterwards by
# other code will presumably also be ConcLogger instances with their own console handler — confirm this is intended.
logging.setLoggerClass(ConcLogger)

logger = logging.getLogger(__name__)


In [None]:
#| export
def set_logger_state(
		state:str # 'quiet' or 'verbose'
		):
	""" Switch the conc logger between 'quiet' (warnings and above) and 'verbose' (debug and above); other values raise ValueError. """
	logger.set_state(state)

## spaCy

In [None]:
#| hide
# Quick reference: print the name and id of each spaCy attribute that can be output for a doc (availability depends on the model and pipe settings)
for attr_name, attr_id in spacy.attrs.IDS.items():
	# Skip the empty-name entry and the FLAG* entries
	if not attr_name or attr_name.startswith('FLAG'):
		continue
	print(f'{attr_name}: {attr_id}')

IS_ALPHA: 1
IS_ASCII: 2
IS_DIGIT: 3
IS_LOWER: 4
IS_PUNCT: 5
IS_SPACE: 6
IS_TITLE: 7
IS_UPPER: 8
LIKE_URL: 9
LIKE_NUM: 10
LIKE_EMAIL: 11
IS_STOP: 12
IS_OOV_DEPRECATED: 13
IS_BRACKET: 14
IS_QUOTE: 15
IS_LEFT_PUNCT: 16
IS_RIGHT_PUNCT: 17
IS_CURRENCY: 18
ID: 64
ORTH: 65
LOWER: 66
NORM: 67
SHAPE: 68
PREFIX: 69
SUFFIX: 70
LENGTH: 71
LEMMA: 73
POS: 74
TAG: 75
DEP: 76
ENT_IOB: 77
ENT_TYPE: 78
ENT_ID: 454
ENT_KB_ID: 452
HEAD: 79
SENT_START: 80
SPACY: 81
LANG: 83
MORPH: 453
IDX: 455


In [None]:
#| export
def spacy_attribute_name(index:int # spaCy attribute id, i.e. a value from spacy.attrs.IDS (e.g. 65)
						 ) -> str: # attribute name for the id (e.g. 'ORTH')
	""" Get the spaCy attribute name for an attribute id. Raises ValueError if the id is not in spacy.attrs.IDS. """

	# Single pass over the name -> id mapping; the first name with a matching id is returned.
	for name, attr_id in spacy.attrs.IDS.items():
		if attr_id == index:
			return name
	raise ValueError(f'{index!r} is not a spaCy attribute id')

## Corpus metadata schema

In [None]:
#| export
class CorpusMetadata(msgspec.Struct): 
    """ JSON validation schema for corpus metadata """
    # Every field is required: none declares a default value.
    # Field meanings below are inferred from names only — confirm against the code that writes the metadata.

    # Descriptive fields (strings)
    name: str
    description: str
    slug: str
    conc_version: str
    # Size counts (integers)
    document_count: int
    token_count: int
    word_token_count: int
    punct_token_count: int
    space_token_count: int
    unique_tokens: int
    unique_word_tokens: int
    # NOTE(review): stored as a string; the date format is not visible here — confirm
    date_created: str
    # Token ids and spaCy model details — presumably needed to interpret the stored token ids; confirm
    EOF_TOKEN: int
    SPACY_EOF_TOKEN: int
    SPACY_MODEL: str
    SPACY_MODEL_VERSION: str
    # Lists of integers — presumably the token ids classed as punctuation / whitespace; confirm
    punct_tokens: list[int]
    space_tokens: list[int]



In [None]:
#| echo: true
# Show the JSON schema entry generated for each CorpusMetadata field
corpus_metadata_schema = msgspec.json.schema(CorpusMetadata)
properties = corpus_metadata_schema['$defs']['CorpusMetadata']['properties']
display(properties)

{'name': {'type': 'string'},
 'description': {'type': 'string'},
 'slug': {'type': 'string'},
 'conc_version': {'type': 'string'},
 'document_count': {'type': 'integer'},
 'token_count': {'type': 'integer'},
 'word_token_count': {'type': 'integer'},
 'punct_token_count': {'type': 'integer'},
 'space_token_count': {'type': 'integer'},
 'unique_tokens': {'type': 'integer'},
 'unique_word_tokens': {'type': 'integer'},
 'date_created': {'type': 'string'},
 'EOF_TOKEN': {'type': 'integer'},
 'SPACY_EOF_TOKEN': {'type': 'integer'},
 'SPACY_MODEL': {'type': 'string'},
 'SPACY_MODEL_VERSION': {'type': 'string'},
 'punct_tokens': {'type': 'array', 'items': {'type': 'integer'}},
 'space_tokens': {'type': 'array', 'items': {'type': 'integer'}}}

## Get word lists

In [None]:
#| export
def get_stop_words(save_path:str, # directory to save stop words to, file name will be created based on spaCy model name
				   spacy_model:str = 'en_core_web_sm' # model to get stop words for
					) -> set[str]: # set of stop words for the model
	""" Get stop words from spaCy and cache to disk """

	cache_file = os.path.join(save_path, f'{spacy_model}_stop_words.txt')

	# Cache hit: the file holds one stop word per line, so spaCy is not loaded at all
	if os.path.exists(cache_file):
		with open(cache_file, 'r', encoding='utf-8') as f:
			return set(f.read().splitlines())

	# Cache miss: load the model only to read its language defaults, then release it
	nlp = spacy.load(spacy_model)
	# Copy so callers cannot mutate the set shared by spaCy's language defaults
	stop_words = set(nlp.Defaults.stop_words)
	del nlp

	# exist_ok avoids the race between checking for the directory and creating it
	os.makedirs(save_path, exist_ok=True)

	# Sorted so the cache file content is deterministic between runs
	with open(cache_file, 'w', encoding='utf-8') as f:
		f.writelines(f'{word}\n' for word in sorted(stop_words))

	return stop_words

In [None]:
# Example: print the stop words for en_core_web_sm (get_stop_words reads them from / caches them under save_path)
print(get_stop_words(save_path = save_path, spacy_model='en_core_web_sm'))

{'but', 'down', 'more', 'whom', 'there', 'becoming', 'hereupon', 'hers', 'same', '’s', 'name', 'does', 'anyway', 'did', 'being', 'do', 'throughout', 'that', 'whoever', 'after', 'could', 'how', 'against', 'around', 'however', 'here', 'except', 'any', 'side', 'their', 'into', 'since', 'everywhere', 'thus', 're', 'what', 'thereupon', 'everyone', 'might', 'due', 'other', 'out', "'d", 'nowhere', 'whose', 'if', 'elsewhere', 'beforehand', 'although', 'in', 'latter', 'must', 'front', 'to', 'than', 'are', 'once', 'one', 'cannot', 'i', 'amongst', 'itself', 'these', 'a', 'am', 'put', 'about', 'who', 'quite', 'ourselves', 'everything', 'across', 'sometime', 'her', 'get', 'well', 'moreover', 'below', 'my', 'above', '’d', 'something', 'such', 'also', '’m', 'on', 'every', 'mostly', 'per', 'never', 'anyhow', 'whatever', 'is', 'whence', 'full', 'next', 'nothing', 'regarding', 'only', 'always', 'seems', 'alone', 'take', 'together', "'m", '‘m', 'you', 'hundred', 'beside', 'still', 'have', 'twelve', 'twen

## Access these functions from conc.corpora

Up to version 0.1.1, `conc.core` included helper functions to list, download and build corpora. These have been moved to the `conc.corpora` module. Calling them via `conc.core` still works, but triggers a warning noting the deprecation and the new location of the functions. From Conc version 1.0.0, these functions will only be accessible via `conc.corpora`.

In [None]:
#| export
def list_corpora(
		path: str # path to load corpus
		) -> pl.DataFrame: # Dataframe with path, corpus, corpus name, document count, token count
	""" (Deprecated - call via conc.corpora) Scan a directory for available corpora """
	
	# Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
	logger.warning(DeprecationWarning("Calling list_corpora via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import list_corpora' and call as before."))

	# Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
	from conc.corpora import list_corpora as _list_corpora
	return _list_corpora(path=path)


In [None]:
#| export
def create_toy_corpus_sources(source_path:str # path to location of sources for building corpora
							 ):
	""" (Deprecated - call via conc.corpora) Create txt files and csv to test build of toy corpus. """

	# Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
	logger.warning(DeprecationWarning("Calling create_toy_corpus_sources via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import create_toy_corpus_sources' and call as before."))

	# Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
	from conc.corpora import create_toy_corpus_sources as _create_toy_corpus_sources
	return _create_toy_corpus_sources(source_path=source_path)

In [None]:
#| export
def show_toy_corpus(
        csv_path:str # path to location of csv for building corpora
        ) -> GT: 
    """ (Deprecated - call via conc.corpora) Show toy corpus in a table. """

    # Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
    logger.warning(DeprecationWarning("Calling show_toy_corpus via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import show_toy_corpus' and call as before."))
    
    # Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
    from conc.corpora import show_toy_corpus as _show_toy_corpus
    return _show_toy_corpus(csv_path=csv_path)

In [None]:
#| export
def get_nltk_corpus_sources(source_path:str # path to location of sources for building corpora
							 ):
	""" (Deprecated - call via conc.corpora) Get NLTK corpora as sources for development or testing Conc functionality. """

	# Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
	logger.warning(DeprecationWarning("Calling get_nltk_corpus_sources via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import get_nltk_corpus_sources' and call as before."))

	# Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
	from conc.corpora import get_nltk_corpus_sources as _get_nltk_corpus_sources
	return _get_nltk_corpus_sources(source_path=source_path)

In [None]:
#| export
def get_garden_party(source_path: str #path to location of sources for building corpora
					):
	""" (Deprecated - call via conc.corpora) Get corpus of The Garden Party by Katherine Mansfield for development of Conc and testing Conc functionality. """

	# Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
	logger.warning(DeprecationWarning("Calling get_garden_party via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import get_garden_party' and call as before."))

	# Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
	from conc.corpora import get_garden_party as _get_garden_party
	return _get_garden_party(source_path=source_path)

In [None]:
#| export
def get_large_dataset(source_path: str #path to location of sources for building corpora
                    ):
    """ (Deprecated - call via conc.corpora) Get 1m rows of https://huggingface.co/datasets/Eugleo/us-congressional-speeches-subset for testing. """

    # Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
    logger.warning(DeprecationWarning("Calling get_large_dataset via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import get_large_dataset' and call as before."))

    # Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
    from conc.corpora import get_large_dataset as _get_large_dataset
    return _get_large_dataset(source_path=source_path)

In [None]:
#| export
def create_large_dataset_sizes(source_path: str, #path to location of sources for building corpora
						sizes: list | None = None # list of sizes for test data-sets, defaults to [10000, 100000, 200000, 500000]
						):
	""" (Deprecated - call via conc.corpora) Create datasets of different sizes from data source retrieved by get_large_dataset for testing. """
	
	# Build the default per call: a list literal in the signature would be one object shared by every call
	if sizes is None:
		sizes = [10000, 100000, 200000, 500000]

	# Logged through the conc logger (not warnings.warn) so the notice is shown at the default WARNING level
	logger.warning(DeprecationWarning("Calling create_large_dataset_sizes via conc.core is deprecated and will be removed by v1.0.0, instead import with 'from conc.corpora import create_large_dataset_sizes' and call as before."))

	# Imported at call time rather than at module level — presumably because conc.corpora imports conc.core (TODO confirm)
	from conc.corpora import create_large_dataset_sizes as _create_large_dataset_sizes
	return _create_large_dataset_sizes(source_path=source_path, sizes = sizes)


In [None]:
#| hide
# Export this notebook's `#| export` cells to the package modules
import nbdev

nbdev.nbdev_export()