# conc

> An interface to create Conc reports for corpus linguistic analysis of frequency, concordances, ngrams, keyness, and collocation.

In [None]:
#| default_exp conc

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
from fastcore.basics import patch

In [None]:
#| export
from conc.result import Result
from conc.core import PAGE_SIZE, logger
from conc.corpus import Corpus
from conc.frequency import Frequency
from conc.ngrams import Ngrams
from conc.concordance import Concordance
from conc.keyness import Keyness
from conc.collocates import Collocates

In [None]:
#| hide
from conc.core import set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with expanduser rather than reading HOME directly:
# os.environ.get("HOME") returns None when HOME is unset, which would produce
# paths starting with the literal text "None/".
home_path = os.path.expanduser('~')
source_path = f'{home_path}/data/'
save_path = f'{home_path}/data/conc-test-corpora/'

# paths to the test corpora stored under save_path
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
#| export
class Conc:
	"""Single entry point for Conc reports on one corpus: frequency, ngrams, concordances, keyness, and collocates."""

	def __init__(self,
				corpus # Corpus instance
				):
		# keep the corpus, then build one report object per analysis type
		self.corpus = corpus
		self.frequency_ = Frequency(self.corpus)
		self.ngrams_ = Ngrams(self.corpus)
		self.concordance_ = Concordance(self.corpus)
		# keyness needs a second (reference) corpus, so it starts unset;
		# set_reference_corpus creates the Keyness report
		self.keyness_ = None
		self.collocates_ = Collocates(self.corpus)

In [None]:
# load (or build) a corpus
reuters = Corpus('reuters').load(path_to_reuters_corpus)

In [None]:
# get a summary of the corpus
reuters.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Reuters Corpus
Description,"Reuters corpus (Reuters-21578, Distribution 1.0). ""The copyright for the text of newswire articles and Reuters annotations in the Reuters-21578 collection resides with Reuters Ltd. Reuters Ltd. and Carnegie Group, Inc. have agreed to allow the free distribution of this data *for research purposes only*. If you publish results based on this data set, please acknowledge its use, refer to the data set by the name (Reuters-21578, Distribution 1.0), and inform your readers of the current location of the data set."" https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/."
Date Created,2025-06-09 12:44:27
Conc Version,0.0.1
Corpus Path,/home/geoff/data/conc-test-corpora/reuters.corpus
Document Count,10788
Token Count,1552919
Word Token Count,1398782
Unique Tokens,49901
Unique Word Tokens,49860


In [None]:
# create a Conc report instance for the corpus
conc = Conc(reuters)

In [None]:
#| export
@patch
def frequencies(self: Conc,
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_token_id:bool=False, # show token_id in output
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]|None=None, # exclude specific tokens from frequency report, can be used to remove stopwords, None (default) is treated as an empty list
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]|None=None, # restrict frequency report to return frequencies for a list of specific tokens, None (default) is treated as an empty list
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True # exclude punctuation tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens by delegating to the corpus Frequency report. """
	# The list parameters default to None instead of [] so that no single list
	# object is shared between calls; the delegate still receives a list.
	if exclude_tokens is None:
		exclude_tokens = []
	if restrict_tokens is None:
		restrict_tokens = []
	return self.frequency_.frequencies(case_sensitive=case_sensitive,
										normalize_by=normalize_by,
										page_size=page_size,
										page_current=page_current,
										show_token_id=show_token_id,
										show_document_frequency=show_document_frequency,
										exclude_tokens=exclude_tokens,
										exclude_tokens_text=exclude_tokens_text,
										restrict_tokens=restrict_tokens,
										restrict_tokens_text=restrict_tokens_text,
										exclude_punctuation=exclude_punctuation)

In [None]:
#| hide
# "said" has a frequency of 25,379 in the frequency report below

In [None]:
conc.frequencies(normalize_by=10000).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,69263,495.17
2,of,36779,262.94
3,to,36328,259.71
4,in,29252,209.12
5,and,25645,183.34
6,said,25379,181.44
7,a,24844,177.61
8,mln,18621,133.12
9,vs,14332,102.46
10,for,13720,98.09


In [None]:
#| export
@patch
def ngrams(self: Conc,
		   token_str: str, # token string to get ngrams for
		   ngram_length:int = 2, # length of ngram
		   ngram_token_position: str = 'LEFT', # specify if token sequence is on LEFT or RIGHT (support for ngrams with token in middle of sequence is in-development)
		   normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
		   page_size:int = PAGE_SIZE, # number of results to display per results page
		   page_current:int = 1, # current page of results
		   show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
		   exclude_punctuation:bool=True, # do not return ngrams with punctuation tokens
		   use_cache:bool = True # retrieve the results from cache if available
		   ) -> Result: # return a Result object with ngram data
	""" Report frequencies of ngrams that contain a token string. """
	# gather the report options once, then hand everything to the Ngrams report
	report_options = dict(ngram_length=ngram_length,
						  ngram_token_position=ngram_token_position,
						  normalize_by=normalize_by,
						  page_size=page_size,
						  page_current=page_current,
						  show_all_columns=show_all_columns,
						  exclude_punctuation=exclude_punctuation,
						  use_cache=use_cache)
	return self.ngrams_.ngrams(token_str, **report_options)

In [None]:
#| hide
# "said" has a frequency of 25,379 in the frequency report above, which matches the count of 25,379 for bigrams from:
# conc.ngrams(token_str = 'said', ngram_length = 2, ngram_token_position = 'LEFT', exclude_punctuation = False, page_size = 5).display()
# with trigrams the count changes to 25,355 - this is due to EOF removal (see the DEBUG lines in the log output)
set_logger_state('verbose')
conc.ngrams(token_str = 'said', ngram_length = 3, ngram_token_position = 'RIGHT', exclude_punctuation = True, page_size = 5).display()
set_logger_state('quiet')
# the ngram "the company said" is reported with a frequency of 1,173

2025-06-10 10:54:21 - INFO - _init_token_arrays - Created tokens_array in 0.015 seconds
2025-06-10 10:54:21 - INFO - _init_token_arrays - Created tokens_lookup in 0.006 seconds
2025-06-10 10:54:21 - INFO - _init_token_arrays - Created tokens_sort_order in 0.006 seconds
2025-06-10 10:54:21 - INFO - tokenize - Tokenization time: 0.02980 seconds
2025-06-10 10:54:21 - INFO - get_token_positions - Token indexing (25379) time: 0.01357 seconds
2025-06-10 10:54:21 - INFO - ngrams - Generating ngrams results
2025-06-10 10:54:21 - INFO - get_tokens_in_context - Context tokens collected: (3, 25379)
2025-06-10 10:54:21 - INFO - get_tokens_in_context - Context retrieved in 0.06 seconds.
2025-06-10 10:54:21 - DEBUG - _get_ngrams - Ngrams shape prior to EOF removal (3, 25379)
2025-06-10 10:54:21 - DEBUG - _get_ngrams - Ngrams shape after EOF removal (3, 25355)
2025-06-10 10:54:21 - INFO - _get_ngrams - Ngrams (12707) retrieval time: 0.06519 seconds
2025-06-10 10:54:21 - INFO - ngrams - Ngrams report 

"Ngrams for ""said""","Ngrams for ""said""","Ngrams for ""said""","Ngrams for ""said"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,the department said,194,1.39
3,the sources said,165,1.18
4,of england said,122,0.87
5,the spokesman said,116,0.83
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Ngram length: 3, Token position: right","Ngram length: 3, Token position: right","Ngram length: 3, Token position: right","Ngram length: 3, Token position: right"
Ngrams containing punctuation tokens excluded,Ngrams containing punctuation tokens excluded,Ngrams containing punctuation tokens excluded,Ngrams containing punctuation tokens excluded
"Normalized Frequency is per 10,000 tokens","Normalized Frequency is per 10,000 tokens","Normalized Frequency is per 10,000 tokens","Normalized Frequency is per 10,000 tokens"
"Total unique ngrams: 4,698","Total unique ngrams: 4,698","Total unique ngrams: 4,698","Total unique ngrams: 4,698"


In [None]:
conc.ngrams(token_str = 'said', ngram_length = 3, ngram_token_position = 'RIGHT', exclude_punctuation = True).display()

"Ngrams for ""said""","Ngrams for ""said""","Ngrams for ""said""","Ngrams for ""said"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,the department said,194,1.39
3,the sources said,165,1.18
4,of england said,122,0.87
5,the spokesman said,116,0.83
6,the bank said,114,0.81
7,agriculture department said,106,0.76
8,trade sources said,95,0.68
9,company also said,93,0.66
10,the report said,93,0.66


In [None]:
#| export
@patch
def ngram_frequencies(self: Conc,
                ngram_length:int=2, # length of ngram
                case_sensitive:bool=False, # frequencies for tokens lowercased or with case preserved
                normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
                page_size:int=PAGE_SIZE, # number of rows to return
                page_current:int=1, # current page
                exclude_punctuation:bool=True # exclude ngrams containing punctuation tokens
                ) -> Result: # return a Result object with the frequency table
    """ Report the most frequent ngrams of a given length. """
    # every option is forwarded unchanged to the Ngrams report
    report_options = dict(ngram_length=ngram_length,
                          case_sensitive=case_sensitive,
                          normalize_by=normalize_by,
                          page_size=page_size,
                          page_current=page_current,
                          exclude_punctuation=exclude_punctuation)
    return self.ngrams_.ngram_frequencies(**report_options)

In [None]:
#| hide
# "the company said" = 1,173 - matches above count
conc.ngram_frequencies(ngram_length = 3, case_sensitive = False, exclude_punctuation = True, page_size = 10).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,mln dlrs in,795,5.68
3,cts vs loss,665,4.75
4,said it has,636,4.55
5,mln avg shrs,620,4.43
6,pct of the,608,4.35
7,the united states,603,4.31
8,qtr net shr,574,4.10
9,dlrs a share,546,3.90
10,inc said it,523,3.74


In [None]:
conc.ngram_frequencies(ngram_length = 3, case_sensitive = False, exclude_punctuation = True, page_current = 1).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,mln dlrs in,795,5.68
3,cts vs loss,665,4.75
4,said it has,636,4.55
5,mln avg shrs,620,4.43
6,pct of the,608,4.35
7,the united states,603,4.31
8,qtr net shr,574,4.10
9,dlrs a share,546,3.90
10,inc said it,523,3.74


In [None]:
#| export
@patch
def concordance(self: Conc,
				token_str: str, # token string to get concordance for
				context_length:int = 5, # number of words to show on left and right of token string
				order:str='1R2R3R', # order of sort columns
				page_size:int=PAGE_SIZE, # number of results to display per results page
				page_current:int=1, # current page of results
				show_all_columns:bool = False, # df with all columns or just essentials
				use_cache:bool = True # retrieve the results from cache if available
				) -> Result: # concordance report results
	""" Report concordance lines for a token string. """
	# every option is forwarded unchanged to the Concordance report
	report_options = dict(context_length=context_length,
						  order=order,
						  page_size=page_size,
						  page_current=page_current,
						  show_all_columns=show_all_columns,
						  use_cache=use_cache)
	return self.concordance_.concordance(token_str, **report_options)

In [None]:
#| hide
# Total Concordance Lines for "said" is correct - 25379
# "the company said" 1173

In [None]:
conc.concordance('the company said', context_length = 5, order='1R2R3R').display()

"Concordance for ""the company said""","Concordance for ""the company said""","Concordance for ""the company said""","Concordance for ""the company said"""
"Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R"
Document Id,Left,Node,Right
2744,through a tender offer .,The company said,""" The negotiations would determine"
10501,1.25 dlrs a share .,The company said,""" this could bring earnings"
8353,of gold per ton .,The company said,& lt;Manitoba Mineral Resources Ltd
2186,". In a statement ,",the company said,", "" The SEC action"
8898,Co > of Japan .,The company said,", "" The discussions have"
6379,"In a brief statement ,",the company said,", "" We are studying"
6221,"special cost escrow accounts ,",the company said,", adding , that there"
4264,"close in near future ,",the company said,", adding it is prepared"
6319,"taxes . In addition ,",the company said,", Georgia Power 's contracts"
4664,the conversion of debentures .,The company said,", however , it expects"


In [None]:
#| export
@patch
def concordance_plot(self: Conc,
					 token_str: str, # token string to plot the concordance for
					 page_size: int = 10 # page size forwarded to the plot (presumably items shown per page - confirm)
					 ):
	""" Display a concordance plot for a token string. """
	# the Concordance report builds the plot; its return value is passed back as is
	plot_output = self.concordance_.concordance_plot(token_str, page_size=page_size)
	return plot_output

In [None]:
#| hide
conc.concordance_plot('the company said', page_size=10)

IntSlider(value=1, description='Page', layout=Layout(margin='10px 0 10px 0', width='600px'), max=92, min=1)

FigureWidget({
    'data': [{'hoverinfo': 'text',
              'hovertext': '1.53 billion in 1985 . The company said it had been',
              'line': {'color': 'black', 'width': 2},
              'mode': 'lines',
              'showlegend': False,
              'type': 'scatter',
              'uid': '5eff8aa5-dfe1-43a1-9db0-239d040d6ba8',
              'x': [30.42071197411003, 30.42071197411003],
              'xaxis': 'x',
              'y': [0, 1],
              'yaxis': 'y'},
             {'hoverinfo': 'text',
              'hovertext': 'stg profit in 1985 . The company said in a statement',
              'line': {'color': 'black', 'width': 2},
              'mode': 'lines',
              'showlegend': False,
              'type': 'scatter',
              'uid': 'b85a67be-749a-4740-b76f-e088cda8ddea',
              'x': [33.86243386243386, 33.86243386243386],
              'xaxis': 'x2',
              'y': [0, 1],
              'yaxis': 'y2'},
             {'hoverinfo': 'text',

HTML(value="<div style='text-align: left; font-size: 12px; color: black; margin-left: 80px;margin-bottom:10px;…

In [None]:
#| export
@patch
def set_reference_corpus(self: Conc,
                    corpus: Corpus  # Reference corpus
                    ) -> None:
    """ Attach a reference corpus so that keyness reports can be produced. """
    # pair this instance's corpus (target) with the supplied reference corpus
    keyness_report = Keyness(self.corpus, corpus)
    self.keyness_ = keyness_report

In [None]:
# load a corpus as a reference corpus
brown = Corpus('brown').load(path_to_brown_corpus)

# set corpus as reference corpus
conc.set_reference_corpus(brown)

In [None]:
#| export
@patch
def keywords(self: Conc,
				effect_size_measure:str = 'log_ratio', # effect size measure to use, currently only 'log_ratio' is supported
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by effect size measure, results can also be ordered by: frequency, frequency_reference, document_frequency, document_frequency_reference, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				min_document_frequency: int = 0, # minimum document frequency in target for token to be included in the report
				min_document_frequency_reference: int = 0, # minimum document frequency in reference for token to be included in the report
				min_frequency: int = 0, # minimum frequency in target for token to be included in the report
				min_frequency_reference: int = 0, # minimum frequency in reference for token to be included in the report
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]|None=None, # exclude specific tokens from report results, None (default) is treated as an empty list
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]|None=None, # restrict report to return results for a list of specific tokens, None (default) is treated as an empty list
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True # exclude punctuation tokens
				) -> Result: # return a Result object with the keywords table
	""" Get keywords for the corpus relative to the reference corpus.

	Raises ValueError if no reference corpus has been set with set_reference_corpus.
	"""
	# keyness_ stays None until set_reference_corpus has been called
	if self.keyness_ is None:
		raise ValueError("Reference corpus is not set. Use 'set_reference_corpus' to set the reference corpus.")
	# The list parameters default to None instead of [] so that no single list
	# object is shared between calls; the delegate still receives a list.
	if exclude_tokens is None:
		exclude_tokens = []
	if restrict_tokens is None:
		restrict_tokens = []
	return self.keyness_.keywords(effect_size_measure=effect_size_measure,
									statistical_significance_measure=statistical_significance_measure,
									order=order,
									order_descending=order_descending,
									statistical_significance_cut=statistical_significance_cut,
									apply_bonferroni=apply_bonferroni,
									min_document_frequency=min_document_frequency,
									min_document_frequency_reference=min_document_frequency_reference,
									min_frequency=min_frequency,
									min_frequency_reference=min_frequency_reference,
									case_sensitive=case_sensitive,
									normalize_by=normalize_by,
									page_size=page_size,
									page_current=page_current,
									show_document_frequency=show_document_frequency,
									exclude_tokens=exclude_tokens,
									exclude_tokens_text=exclude_tokens_text,
									restrict_tokens=restrict_tokens,
									restrict_tokens_text=restrict_tokens_text,
									exclude_punctuation=exclude_punctuation)

In [None]:
#| hide
# keyness "said" frequency 25,379

In [None]:
conc.keywords(statistical_significance_cut = 0.0001, min_document_frequency_reference = 5).display()

Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords
"Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus"
Rank,Token,Frequency,Frequency Reference,Normalized Frequency,Normalized Frequency Reference,Relative Risk,Log Ratio,Log Likelihood
1,net,6988,31,49.96,0.32,157.95,7.30,7078.84
2,dividend,1041,6,7.44,0.06,121.57,6.93,1042.37
3,exports,1214,10,8.68,0.10,85.07,6.41,1191.05
4,4th,840,8,6.01,0.08,73.57,6.20,815.81
5,securities,839,8,6.00,0.08,73.49,6.20,814.76
6,currency,818,8,5.85,0.08,71.65,6.16,792.86
7,subsidiary,630,7,4.50,0.07,63.06,5.98,604.46
8,billion,5828,65,41.66,0.66,62.83,5.97,5589.95
9,pact,428,5,3.06,0.05,59.98,5.91,408.89
10,profit,2960,36,21.16,0.37,57.61,5.85,2817.73


In [None]:
#| export
@patch
def collocates(self: Conc,
				token_str:str, # Token to search for
				effect_size_measure:str = 'logdice', # statistical measure to use for collocation calculation: logdice, mutual_information
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by the effect size measure, results can also be ordered by: collocate_frequency, frequency, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				context_length:int|tuple[int, int]=5, # Window size per side in tokens - if an int (e.g. 5) context lengths on left and right will be the same, for independent control of left and right context length pass a tuple (context_length_left, context_length_right) (e.g. (0, 5))
				min_collocate_frequency:int=5, # Minimum count of collocates
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				exclude_punctuation:bool=True # exclude punctuation tokens
				) -> Result: # return a Result object with the collocates table
	""" Report collocates of a token string. """
	# every option is forwarded unchanged to the Collocates report
	report_options = dict(effect_size_measure=effect_size_measure,
						  statistical_significance_measure=statistical_significance_measure,
						  order=order,
						  order_descending=order_descending,
						  statistical_significance_cut=statistical_significance_cut,
						  apply_bonferroni=apply_bonferroni,
						  context_length=context_length,
						  min_collocate_frequency=min_collocate_frequency,
						  page_size=page_size,
						  page_current=page_current,
						  exclude_punctuation=exclude_punctuation)
	return self.collocates_.collocates(token_str, **report_options)

In [None]:
#| hide
# collocates: "said" has a frequency of 25,379
# with context_length = (0, 1) and exclude_punctuation = False, the collocate "said" of "the company" has a collocate frequency of 1,173

In [None]:
conc.collocates('the company', context_length = (0, 1), exclude_punctuation = False).display()

"Collocates of ""the company""","Collocates of ""the company""","Collocates of ""the company""","Collocates of ""the company""","Collocates of ""the company""","Collocates of ""the company"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Token,Collocate Frequency,Frequency,Logdice,Log Likelihood
1,said,1173,25379,10.40,5149.40
2,'s,518,9627,10.38,2429.14
3,also,107,2532,9.28,450.99
4,reported,51,775,8.74,259.63
5,has,69,4874,8.14,151.05
6,had,47,2975,7.98,111.88
7,earned,22,159,7.78,145.70
8,would,43,4688,7.49,63.27
9,is,59,7673,7.48,70.93
10,will,49,5951,7.47,63.94


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()