# keyness

> Functionality for keyness analysis.

In [None]:
#| default_exp keyness

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
from scipy.stats import chi2

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.frequency import Frequency
from conc.core import logger, PAGE_SIZE

In [None]:
#| hide
# allow up to 20 columns when polars tables are displayed in this notebook
polars_conf = pl.Config.set_tbl_cols(20)

In [None]:
#| hide
import os

In [None]:
#| hide
# root data directory under $HOME (source_path) and the test corpora directory beneath it (save_path)
source_path = '{}/data/'.format(os.environ.get("HOME"))
save_path = source_path + 'conc-test-corpora/'

# paths of the individual corpora used in this notebook
path_to_toy_corpus = save_path + 'toy.corpus'
path_to_brown_corpus = save_path + 'brown.corpus'
path_to_reuters_corpus = save_path + 'reuters.corpus'
path_to_gutenberg_corpus = save_path + 'gutenberg.corpus'
path_to_gardenparty_corpus = save_path + 'garden-party.corpus'
path_to_congress_corpus = save_path + 'us-congressional-speeches-subset-100k.corpus'

In [None]:
#| export
class Keyness:
	""" Pairs a target corpus with a reference corpus for keyness analysis reporting. """
	def __init__(self,
			  corpus:Corpus, # Corpus instance
			  reference_corpus:Corpus # Corpus for comparison
			  ):
		# both corpora are stored as given
		self.corpus, self.reference_corpus = corpus, reference_corpus


In [None]:
#| export
@patch
def keywords(self: Keyness,
				effect_size_measure:str = 'log_ratio', # effect size measure to use, currently only 'log_ratio' is supported
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by effect size measure, results can also be ordered by: frequency, frequency_reference, document_frequency, document_frequency_reference, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				min_document_frequency: int = 0, # minimum document frequency in target for token to be included in the report
				min_document_frequency_reference: int = 0, # minimum document frequency in reference for token to be included in the report
				min_frequency: int = 0, # minimum frequency in target for token to be included in the report
				min_frequency_reference: int = 0, # minimum frequency in reference for token to be included in the report
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved 
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]=[], # exclude specific tokens from report results
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]=[], # restrict report to return results for a list of specific tokens
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Get keywords for the corpus.

	Frequencies of the target corpus are left-joined with frequencies of the reference
	corpus on token, so only tokens present in the target corpus are reported. An effect
	size (relative risk and log ratio) and a log-likelihood (G2) statistic are calculated
	per token, results are filtered, sorted and paged, and returned as a Result.

	Raises ValueError if normalize_by is not an int, if an unsupported measure or order
	value is passed, or if ordering by a document frequency column is requested while
	show_document_frequency is False.
	"""

	# type() rather than isinstance() so bool values are rejected too
	if type(normalize_by) is not int:
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	if effect_size_measure not in ['log_ratio']:
		raise ValueError('Currently only log_ratio is supported as an effect size measure.')
	
	if statistical_significance_measure not in ['log_likelihood']:
		raise ValueError('Currently only log_likelihood is supported as a statistical significance measure.')

	if order not in [None, effect_size_measure, 'frequency', 'frequency_reference', 'document_frequency', 'document_frequency_reference', statistical_significance_measure]:
		raise ValueError(f'The order parameter must be None (default) or one of: {effect_size_measure}, frequency, frequency_reference, document_frequency, document_frequency_reference, {statistical_significance_measure}.')
	
	if not show_document_frequency and order in ['document_frequency', 'document_frequency_reference']:
		raise ValueError('The show_document_frequency parameter must be set True if you want to order by document_frequency or document_frequency_reference.')

	start_time = time.time()

	debug = False # development switch: when True every intermediate column is returned

	freq_target = Frequency(self.corpus)
	freq_reference = Frequency(self.reference_corpus)

	columns = ['rank', 'token', 'frequency', 'frequency_reference', 'document_frequency', 'document_frequency_reference', 'normalized_frequency', 'normalized_frequency_reference']

	target_count_tokens, tokens_descriptor, total_descriptor = self.corpus.get_token_count_text(exclude_punctuation=exclude_punctuation, exclude_spaces=exclude_spaces)
	reference_count_tokens, _, _ = self.reference_corpus.get_token_count_text(exclude_punctuation=exclude_punctuation, exclude_spaces=exclude_spaces)

	formatted_data = []
	formatted_data.append(f'Report based on {tokens_descriptor}')

	# exclude_tokens / restrict_tokens are passed on to Frequency.frequencies below,
	# so only the report notes are added here (there is no frame to filter yet)
	if exclude_tokens:
		if exclude_tokens_text == '':
			# number of distinct tokens in the exclusion list
			formatted_data.append(f'Tokens excluded from report: {len(set(exclude_tokens))}')
		else:
			formatted_data.append(f'{exclude_tokens_text}')
	if restrict_tokens and restrict_tokens_text != '':
		formatted_data.append(f'{restrict_tokens_text}')

	# small non-zero normalized frequencies (equivalent to 0.05 of an occurrence) used in the
	# effect size calculation when a token has no frequency, avoids division by zero / log of zero
	target_min_freq = (0.05 * normalize_by) / target_count_tokens
	reference_min_freq = (0.05 * normalize_by) / reference_count_tokens
	
	target_df = freq_target.frequencies(case_sensitive=case_sensitive,
										normalize_by=normalize_by,
										page_size=0,
										page_current=1,
										show_token_id=False,
										show_document_frequency=True, # applying to final report not import
										exclude_tokens=exclude_tokens,
										exclude_tokens_text=exclude_tokens_text,
										restrict_tokens=restrict_tokens,
										restrict_tokens_text=restrict_tokens_text,
										exclude_punctuation=exclude_punctuation,
										exclude_spaces=exclude_spaces).to_frame()

	reference_df = freq_reference.frequencies(case_sensitive=case_sensitive,
										normalize_by=normalize_by,
										page_size=0,
										page_current=1,
										show_token_id=False,
										show_document_frequency=True, # applying to final report not import
										exclude_tokens=exclude_tokens,
										exclude_tokens_text=exclude_tokens_text,
										restrict_tokens=restrict_tokens,
										restrict_tokens_text=restrict_tokens_text,
										exclude_punctuation=exclude_punctuation,
										exclude_spaces=exclude_spaces).to_frame()

	# left join: tokens missing from the reference corpus get nulls in the *_reference columns
	keyness_df = target_df.join(reference_df, on='token', how='left', suffix = '_reference').drop('rank', 'rank_reference')

	# fill only the reference count columns with 0, normalized_frequency_reference is deliberately
	# left null here so the effect size calculation can substitute the smoothing value instead of 0
	keyness_df = keyness_df.with_columns(
		pl.col('frequency_reference').fill_null(0),
		pl.col('document_frequency_reference').fill_null(0),
		# corpus sizes are needed by the log likelihood calculation
		pl.lit(target_count_tokens).alias('token_count'),
		pl.lit(reference_count_tokens).alias('token_count_reference'),
	)

	if effect_size_measure in ['log_ratio']:
		columns.extend(['relative_risk', 'log_ratio'])
	
		# copying for calculation - will retain original values for display
		keyness_df = keyness_df.with_columns(
			pl.col('normalized_frequency').fill_null(target_min_freq).alias('calc_normalized_frequency'),
			pl.col('normalized_frequency_reference').fill_null(reference_min_freq).alias('calc_normalized_frequency_reference'),
		)

		keyness_df = keyness_df.with_columns(
			(pl.col('calc_normalized_frequency')/pl.col('calc_normalized_frequency_reference')).alias('relative_risk'),
			(pl.col('calc_normalized_frequency').log(2) - pl.col('calc_normalized_frequency_reference').log(2)).alias('log_ratio'),
		)

	# displayed normalized frequency is 0 for tokens that do not occur in the reference corpus
	keyness_df = keyness_df.with_columns(pl.col('normalized_frequency_reference').fill_null(0))

	if statistical_significance_measure in ['log_likelihood']:
		columns.extend(['log_likelihood'])

		# calculating using approach here: https://ucrel.lancs.ac.uk/llwizard.html
		# a = frequency in target , b = frequency in reference 
		# c = total tokens in target , d = total tokens in reference 
		# E1 = c*(a+b) / (c+d) 
		# E2 = d*(a+b) / (c+d)
		# G2 = 2*((a*ln (a/E1)) + (b*ln (b/E2))) 
		
		# E1 and E2
		keyness_df = keyness_df.with_columns(
			((pl.col('token_count') * (pl.col('frequency') + pl.col('frequency_reference'))) / (pl.col('token_count') + pl.col('token_count_reference'))).alias('expected_frequency'),
			((pl.col('token_count_reference') * (pl.col('frequency') + pl.col('frequency_reference'))) / (pl.col('token_count') + pl.col('token_count_reference'))).alias('expected_frequency_reference'),
		)

		# components of G2 as term1 and term 2 - (a*ln (a/E1)) (b*ln (b/E2))
		# a term is set to 0 when its observed frequency is 0, avoiding log of zero
		keyness_df = keyness_df.with_columns([
			pl.when(pl.col('frequency') > 0)
			.then(pl.col('frequency') * (pl.col('frequency') / pl.col('expected_frequency')).log())
			.otherwise(0)
			.alias('term1'),
			pl.when(pl.col('frequency_reference') > 0)
			.then(pl.col('frequency_reference') * (pl.col('frequency_reference') / pl.col('expected_frequency_reference')).log())
			.otherwise(0)
			.alias('term2') # 0 if no reference frequency
		])

		#G2
		keyness_df = keyness_df.with_columns(
			(2 * (pl.col('term1') + pl.col('term2'))).alias('log_likelihood')
		)

	# filtering - must be done before bonferroni or similar correction ...
	filtering_descriptors = []
	if min_frequency > 0:
		keyness_df = keyness_df.filter(pl.col('frequency') >= min_frequency)
		filtering_descriptors.append(f'minimum frequency in target corpus ({min_frequency:,.0f})')
	if min_frequency_reference > 0:
		keyness_df = keyness_df.filter(pl.col('frequency_reference') >= min_frequency_reference)
		filtering_descriptors.append(f'minimum frequency in reference corpus ({min_frequency_reference:,.0f})')
	if min_document_frequency > 0:
		keyness_df = keyness_df.filter(pl.col('document_frequency') >= min_document_frequency)
		filtering_descriptors.append(f'minimum document frequency in target corpus ({min_document_frequency:,.0f})')
	if min_document_frequency_reference > 0:
		keyness_df = keyness_df.filter(pl.col('document_frequency_reference') >= min_document_frequency_reference)
		filtering_descriptors.append(f'minimum document frequency in reference corpus ({min_document_frequency_reference:,.0f})')
	
	if len(filtering_descriptors) > 0:
		formatted_data.append(f'Filtered tokens by {(", ".join(filtering_descriptors))}')

	# number of tokens remaining, this is the number of tests for the Bonferroni correction
	unique_tokens = keyness_df.select(pl.len()).item()

	if statistical_significance_cut is not None and statistical_significance_cut > 0:
		p = statistical_significance_cut
		# bonferroni correction
		if apply_bonferroni:
			p_value_descriptor = f'Keywords filtered based on p-value {p} with Bonferroni correction (based on {unique_tokens} tests)'
			if unique_tokens > 0: # nothing to correct (and nothing to divide by) if every token was filtered out
				p = p / unique_tokens # adjust by criteria
		else:
			p_value_descriptor = f'Keywords filtered based on p-value: {p}'
		# critical value of the chi-squared distribution (1 degree of freedom) for the p-value
		cut = chi2.ppf(1 - p, df=1)		
		keyness_df = keyness_df.filter(pl.col(statistical_significance_measure) > cut)
		formatted_data.append(p_value_descriptor)
		unique_tokens = keyness_df.select(pl.len()).item()
	
	if order is None:
		order = effect_size_measure
	keyness_df = keyness_df.sort(order, descending=order_descending)
	# page_size of 0 means return all rows, slicing with a length of 0 would return none
	if page_size != 0:
		keyness_df = keyness_df.slice((page_current-1)*page_size, page_size)

	if not show_document_frequency:
		keyness_df = keyness_df.drop('document_frequency', 'document_frequency_reference')
		columns.remove('document_frequency')
		columns.remove('document_frequency_reference')

	# rank continues across pages, it is 1 for the first row when page_size is 0
	rank_offset = (page_current-1) * page_size + 1
	keyness_df = keyness_df.with_row_index(name='rank', offset=rank_offset)

	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	formatted_data.append(f'{total_descriptor} in target corpus: {target_count_tokens:,.0f}')
	formatted_data.append(f'{total_descriptor} in reference corpus: {reference_count_tokens:,.0f}')

	formatted_data.append(f'Keywords: {unique_tokens:,.0f}')
	if page_size != 0 and unique_tokens > page_size:
		# ceiling division so an exact multiple of page_size does not report an extra empty page
		total_pages = (unique_tokens + page_size - 1) // page_size
		formatted_data.append(f'Showing {page_size} rows')
		formatted_data.append(f'Page {page_current} of {total_pages}')

	logger.info(f'Keywords report time: {(time.time() - start_time):.5f} seconds')

	if debug:
		columns = keyness_df.columns

	return Result(type = 'keywords', df=keyness_df.select(columns), title='Keywords', description=f'Target corpus: {self.corpus.name}, Reference corpus: {self.reference_corpus.name}', summary_data={}, formatted_data=formatted_data)


In [None]:
# load the target corpus (the corpus keywords will be reported for) from its saved path
gardenparty = Corpus().load(path_to_gardenparty_corpus)

In [None]:
# load the reference corpus (the corpus the target is compared against) from its saved path
brown = Corpus().load(path_to_brown_corpus)

In [None]:
# instantiate the Keyness class with the target corpus and the reference corpus
keyness = Keyness(corpus = gardenparty, reference_corpus = brown)

In [None]:
# build the keywords report and display it: document frequencies are shown, tokens need a
# document frequency of at least 5 in the reference corpus, and a Bonferroni-corrected
# p-value cut-off of 0.0001 is applied
keyness.keywords(
	show_document_frequency = True,
	min_document_frequency_reference = 5,
	statistical_significance_cut = 0.0001,
	apply_bonferroni = True,
	order_descending = True,
	page_current = 1,
).display()

Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords
"Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus"
Rank,Token,Frequency,Frequency Reference,Document Frequency,Document Frequency Reference,Normalized Frequency,Normalized Frequency Reference,Relative Risk,Log Ratio,Log Likelihood
1,laura,86,14,2,6,13.58,0.14,95.10,6.57,402.74
2,jug,30,6,2,5,4.74,0.06,77.41,6.27,136.44
3,grandma,73,15,2,5,11.53,0.15,75.34,6.24,330.64
4,meadows,33,7,1,5,5.21,0.07,72.98,6.19,148.73
5,con,27,7,1,5,4.26,0.07,59.71,5.90,117.62
6,bye,25,7,9,7,3.95,0.07,55.29,5.79,107.37
7,velvet,14,5,6,5,2.21,0.05,43.35,5.44,57.19
8,shone,13,5,7,5,2.05,0.05,40.25,5.33,52.21
9,queer,15,6,5,6,2.37,0.06,38.70,5.27,59.69
10,gloves,17,7,7,5,2.69,0.07,37.60,5.23,67.18


In [None]:
#| hide
# run nbdev's export step (presumably writes the `#| export` cells to the module named by `default_exp` — confirm against nbdev docs)
import nbdev; nbdev.nbdev_export()