# conc

> An interface to create Conc reports for corpus linguistic analysis of frequency, concordances, ngrams, keyness, and collocation.

In [None]:
#| default_exp conc

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
from fastcore.basics import patch

In [None]:
#| export
from conc.result import Result
from conc.core import PAGE_SIZE
from conc.corpus import Corpus
from conc.frequency import Frequency
from conc.ngrams import Ngrams
from conc.concordance import Concordance
from conc.keyness import Keyness
from conc.collocates import Collocates

In [None]:
#| export
class Conc:
	"""Single entry point for Conc reports on one corpus: frequency, ngrams, concordance, keyness, and collocates."""

	def __init__(self,
				corpus # Corpus instance
				):
		# the corpus every report on this instance is run against
		self.corpus = corpus

		# report helpers that only need the target corpus are built up front
		self.frequency_ = Frequency(corpus)
		self.ngrams_ = Ngrams(corpus)
		self.concordance_ = Concordance(corpus)
		self.collocates_ = Collocates(corpus)

		# keyness also needs a reference corpus, so it stays None until set_reference_corpus is called
		self.keyness_ = None

In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with os.path.expanduser rather than os.environ.get("HOME"):
# the latter returns None when HOME is unset (e.g. on Windows), giving paths like "None/data/".
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{os.path.expanduser("~")}/data/conc-test-corpora/'

# locations of the saved test corpora
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
# load a corpus (here the Reuters test corpus) from its saved .corpus path
reuters = Corpus('reuters').load(path_to_reuters_corpus)

In [None]:
# create a Conc report instance for the corpus; the example reports below are run through it
report = Conc(reuters)

In [None]:
#| export
@patch
def frequencies(self: Conc,
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_token_id:bool=False, # show token_id in output
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]|None=None, # exclude specific tokens from frequency report, can be used to remove stopwords; None is treated as an empty list
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]|None=None, # restrict frequency report to return frequencies for a list of specific tokens; None is treated as an empty list
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens.

	Forwards every option unchanged to `self.frequency_.frequencies`. """
	# Build fresh lists per call: a `[]` default is created once at definition time and
	# would be shared (and handed to the report object) across every call.
	exclude_tokens = [] if exclude_tokens is None else exclude_tokens
	restrict_tokens = [] if restrict_tokens is None else restrict_tokens
	return self.frequency_.frequencies(case_sensitive=case_sensitive,
										normalize_by=normalize_by,
										page_size=page_size,
										page_current=page_current,
										show_token_id=show_token_id,
										show_document_frequency=show_document_frequency,
										exclude_tokens=exclude_tokens,
										exclude_tokens_text=exclude_tokens_text,
										restrict_tokens=restrict_tokens,
										restrict_tokens_text=restrict_tokens_text,
										exclude_punctuation=exclude_punctuation,
										exclude_spaces=exclude_spaces)

In [None]:
# display the first page of the token frequency report, with frequencies normalized by 10,000
report.frequencies(normalize_by=10000).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,69263,495.17
2,of,36779,262.94
3,to,36328,259.71
4,in,29252,209.12
5,and,25645,183.34
6,said,25379,181.44
7,a,24844,177.61
8,mln,18621,133.12
9,vs,14332,102.46
10,for,13720,98.09


In [None]:
#| export
@patch
def ngrams(self: Conc,
			token_str: str, # token string to get ngrams for
			ngram_length:int = 2, # length of ngram
			ngram_token_position: str = 'LEFT', # specify if token sequence is on LEFT or RIGHT (support for ngrams with token in middle of sequence is in-development)
			normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
			page_size:int = PAGE_SIZE, # number of results to display per results page
			page_current:int = 1, # current page of results
			show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
			exclude_punctuation:bool=True, # do not return ngrams with punctuation tokens
			exclude_spaces:bool=True, # better thought of as ignore space tokens, unlikely that you will need this as anything other than True
			use_cache:bool = True # retrieve the results from cache if available
			) -> Result: # return a Result object with ngram data
	""" Report frequencies of ngrams that contain a token string. """
	# gather the options once, then hand them to the Ngrams report unchanged
	report_options = dict(
		ngram_length=ngram_length,
		ngram_token_position=ngram_token_position,
		normalize_by=normalize_by,
		page_size=page_size,
		page_current=page_current,
		show_all_columns=show_all_columns,
		exclude_punctuation=exclude_punctuation,
		exclude_spaces=exclude_spaces,
		use_cache=use_cache,
	)
	return self.ngrams_.ngrams(token_str, **report_options)

In [None]:
# display 2-token ngrams in which "i" is on the LEFT of the ngram
report.ngrams(token_str = 'i', ngram_length = 2, ngram_token_position = 'LEFT').display()

"Ngrams for ""i""","Ngrams for ""i""","Ngrams for ""i""","Ngrams for ""i"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,i think,148,1.06
2,i do,119,0.85
3,i am,65,0.46
4,i would,58,0.41
5,i 'm,45,0.32
6,i have,23,0.16
7,i believe,20,0.14
8,i can,16,0.11
9,i hope,14,0.10
10,i expect,11,0.08


In [None]:
#| export
@patch
def concordance(self: Conc,
				token_str: str, # token string to get concordance for
				context_length:int = 5, # number of words to show on left and right of token string
				order:str='1R2R3R', # order of sort columns
				page_size:int=PAGE_SIZE, # number of results to display per results page
				page_current:int=1, # current page of results
				show_all_columns:bool = False, # df with all columns or just essentials
				use_cache:bool = True # retrieve the results from cache if available
				) -> Result: # concordance report results
	""" Report concordance lines for a token string. """
	# straight pass-through to the Concordance report held by this instance
	concordance_result = self.concordance_.concordance(
		token_str,
		context_length=context_length,
		order=order,
		page_size=page_size,
		page_current=page_current,
		show_all_columns=show_all_columns,
		use_cache=use_cache,
	)
	return concordance_result

In [None]:
# display concordance lines for "cause" with 5 context tokens on each side, using sort order '1R2R3R'
report.concordance('cause', context_length = 5, order='1R2R3R').display()

"Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause"""
"Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R"
Document Id,Left,Node,Right
1802,the jump in inflation will,cause,a falloff in the
4127,non - dilutive and will,cause,a slight increase in
5393,a further devalued dollar may,cause,economic depressions in some
5160,"1987 "" are expected to",cause,even the aggressive side
5717,- term decisions which might,cause,"markets to panic ,"
9574,Saddam Hussein was the main,cause,of tension in the
9628,Saddam Hussein was the main,cause,of tension in the
6565,Lanka 's plantations and may,cause,output to drop below
7550,problems in Argentina will likely,cause,those prices to rise
9209,patterns to any single,cause,", El Nino 's role"


In [None]:
#| export
@patch
def set_reference_corpus(self: Conc, 
                    corpus: Corpus  # Reference corpus
                    ) -> None:
    """ Set the reference corpus used for keyness analysis, replacing any previously set one. """
    # keyness pairs this instance's corpus (the target) with the supplied reference corpus
    target_corpus = self.corpus
    self.keyness_ = Keyness(target_corpus, corpus)

In [None]:
# load a corpus as a reference corpus
brown = Corpus('brown').load(path_to_brown_corpus)

# set corpus as reference corpus
# (required before calling report.keywords(), which raises ValueError when no reference corpus is set)
report.set_reference_corpus(brown)

In [None]:
#| export
@patch
def keywords(self: Conc,
				effect_size_measure:str = 'log_ratio', # effect size measure to use, currently only 'log_ratio' is supported
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by effect size measure, results can also be ordered by: frequency, frequency_reference, document_frequency, document_frequency_reference, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				min_document_frequency: int = 0, # minimum document frequency in target for token to be included in the report
				min_document_frequency_reference: int = 0, # minimum document frequency in reference for token to be included in the report
				min_frequency: int = 0, # minimum frequency in target for token to be included in the report
				min_frequency_reference: int = 0, # minimum frequency in reference for token to be included in the report
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]|None=None, # exclude specific tokens from report results; None is treated as an empty list
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]|None=None, # restrict report to return results for a list of specific tokens; None is treated as an empty list
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the keywords table
	""" Get keywords for the corpus.

	Requires a reference corpus: raises ValueError if `set_reference_corpus` has not been called.
	All options are forwarded unchanged to `self.keyness_.keywords`. """
	if self.keyness_ is None:
		raise ValueError("Reference corpus is not set. Use 'set_reference_corpus' to set the reference corpus.")
	# Build fresh lists per call: a `[]` default is created once at definition time and
	# would be shared (and handed to the report object) across every call.
	exclude_tokens = [] if exclude_tokens is None else exclude_tokens
	restrict_tokens = [] if restrict_tokens is None else restrict_tokens
	return self.keyness_.keywords(effect_size_measure=effect_size_measure,
									statistical_significance_measure=statistical_significance_measure,
									order=order,
									order_descending=order_descending,
									statistical_significance_cut=statistical_significance_cut,
									apply_bonferroni=apply_bonferroni,
									min_document_frequency=min_document_frequency,
									min_document_frequency_reference=min_document_frequency_reference,
									min_frequency=min_frequency,
									min_frequency_reference=min_frequency_reference,
									case_sensitive=case_sensitive,
									normalize_by=normalize_by,
									page_size=page_size,
									page_current=page_current,
									show_document_frequency=show_document_frequency,
									exclude_tokens=exclude_tokens,
									exclude_tokens_text=exclude_tokens_text,
									restrict_tokens=restrict_tokens,
									restrict_tokens_text=restrict_tokens_text,
									exclude_punctuation=exclude_punctuation,
									exclude_spaces=exclude_spaces)

In [None]:
# display keywords against the reference corpus, using a 0.001 p-value cut-off and
# requiring a document frequency of at least 5 in the reference corpus
report.keywords(statistical_significance_cut = 0.001, min_document_frequency_reference=5).display()

Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords
"Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus"
Rank,Token,Frequency,Frequency Reference,Normalized Frequency,Normalized Frequency Reference,Relative Risk,Log Ratio,Log Likelihood
1,net,6988,31,49.96,0.32,157.95,7.30,7078.84
2,dividend,1041,6,7.44,0.06,121.57,6.93,1042.37
3,exports,1214,10,8.68,0.10,85.07,6.41,1191.05
4,4th,840,8,6.01,0.08,73.57,6.20,815.81
5,securities,839,8,6.00,0.08,73.49,6.20,814.76
6,currency,818,8,5.85,0.08,71.65,6.16,792.86
7,subsidiary,630,7,4.50,0.07,63.06,5.98,604.46
8,billion,5828,65,41.66,0.66,62.83,5.97,5589.95
9,pact,428,5,3.06,0.05,59.98,5.91,408.89
10,profit,2960,36,21.16,0.37,57.61,5.85,2817.73


In [None]:
#| export
@patch
def collocates(self: Conc,
				token_str:str, # Token to search for
				effect_size_measure:str = 'logdice', # statistical measure to use for collocation calculation: logdice, mutual_information
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by collocation_measure, results can also be ordered by: collocate_frequency, frequency, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				context_length:int|None=5, # Window size per side in tokens - use this for setting context lengths on left and right to same value
				context_left:int|None=None, # If context_left or context_right > 0 sets context lengths independently
				context_right:int|None=None, # see context_left
				min_collocate_frequency:int=5, # Minimum count of collocates
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the collocates report
	""" Report collocates for a given token string. """
	# gather the options once, then hand them to the Collocates report unchanged
	report_options = dict(
		effect_size_measure=effect_size_measure,
		statistical_significance_measure=statistical_significance_measure,
		order=order,
		order_descending=order_descending,
		statistical_significance_cut=statistical_significance_cut,
		apply_bonferroni=apply_bonferroni,
		context_length=context_length,
		context_left=context_left,
		context_right=context_right,
		min_collocate_frequency=min_collocate_frequency,
		page_size=page_size,
		page_current=page_current,
		exclude_punctuation=exclude_punctuation,
		exclude_spaces=exclude_spaces,
	)
	return self.collocates_.collocates(token_str, **report_options)

In [None]:
# display collocates of "cause" using the defaults (logdice, 5-token window per side, minimum collocate frequency of 5)
report.collocates('cause').display()

"Collocates of ""cause""","Collocates of ""cause""","Collocates of ""cause""","Collocates of ""cause""","Collocates of ""cause""","Collocates of ""cause"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Token,Collocate Frequency,Frequency,Logdice,Log Likelihood
1,concern,12,200,10.20,76.41
2,problems,8,263,9.36,40.07
3,damage,5,143,9.20,26.42
4,rises,5,228,8.82,21.85
5,could,21,1471,8.74,74.57
6,main,5,290,8.59,19.54
7,inflation,6,476,8.33,19.91
8,drop,5,403,8.25,16.44
9,increase,9,1171,7.82,21.79
10,any,7,925,7.76,16.76


In [None]:
#| hide
# run nbdev's export step (writes the cells marked `#| export` to the module named by `#| default_exp`)
import nbdev; nbdev.nbdev_export()