# collocates

> Functionality for collocation analysis.

In [None]:
#| default_exp collocates

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
from scipy.stats import chi2

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| hide
from conc.core import set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# Notebook-only setup (this cell is hidden and not exported to the module).
# suppress=True makes numpy print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
# NOTE(review): presumably silences conc's logger output while the notebook runs - confirm in conc.core.
set_logger_state('quiet')

In [None]:
#| hide
# Locations used by the notebook examples, all rooted at the HOME environment variable.
# If HOME is unset, os.environ.get returns None and every path starts with the literal text 'None'.
source_path = f'{os.environ.get("HOME")}/data/'
save_path = f'{os.environ.get("HOME")}/data/conc-test-corpora/'

# Paths to individual corpora under save_path; only the Reuters path is used in the cells visible here.
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'
path_to_gutenberg_corpus = f'{save_path}gutenberg.corpus'
path_to_gardenparty_corpus = f'{save_path}garden-party.corpus'

In [None]:
#| hide
# Load the Reuters corpus from path_to_reuters_corpus; it is used by the collocation example further down.
reuters = Corpus().load(path_to_reuters_corpus)

In [None]:
#| export
class Collocates:
	""" Collocation analysis reports for a single corpus. """

	def __init__(
		self,
		corpus:Corpus, # Corpus instance
	):
		# The corpus is only stored here; reporting methods read it from self.corpus.
		self.corpus = corpus


In [None]:
#| export
@patch
def collocates(self:Collocates, 
				token_str:str, # Token to search for
				effect_size_measure:str = 'logdice', # statistical measure to use for collocation calculation: logdice, mutual_information
				statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported
				order:str|None = None, # default of None orders by collocation measure, results can also be ordered by: collocate_frequency, frequency, log_likelihood
				order_descending:bool = True, # order is descending or ascending
				statistical_significance_cut: float|None = None, # statistical significance p-value to filter results, e.g. 0.05 or 0.01 or 0.001 - ignored if None or 0
				apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
				context_length:int|tuple[int, int]=5, # Window size per side in tokens - if an int (e.g. 5) context lengths on left and right will be the same, for independent control of left and right context length pass a tuple (context_left, context_right) (e.g. (0, 5)) 
				min_collocate_frequency:int=5, # Minimum count of collocates
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				exclude_punctuation:bool=True # exclude punctuation tokens
				) -> Result:
	""" Report collocates for a given token string. 

	Counts the tokens found in a window around every occurrence of the node (`token_str`), 
	scores each collocate with the chosen effect size measure and with log-likelihood, 
	optionally filters by a p-value cut-off, then sorts and pages the result.

	Returns a `Result` of type 'collocates'. If the node is not found a warning is logged 
	and a `Result` with an empty DataFrame is returned.

	Raises `ValueError` if `effect_size_measure`, `statistical_significance_measure`, `order` 
	or `context_length` are not valid.
	"""

	if effect_size_measure not in ['logdice', 'mutual_information']:
		raise ValueError('Collocation measure must be one of "logdice" or "mutual_information".')
	
	if statistical_significance_measure not in ['log_likelihood']:
		raise ValueError('Statistical significance measure must be "log_likelihood".')

	if order not in [None, effect_size_measure, 'collocate_frequency', 'frequency', statistical_significance_measure]:
		raise ValueError(f'The order parameter must be None (default) or one of: {effect_size_measure}, collocate_frequency, frequency, {statistical_significance_measure}')

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	index_column = 'lower_index'
	frequency_column = 'frequency_lower'
	columns = ['rank', 'token', 'collocate_frequency', 'frequency']

	start_time = time.time()
	debug = False # development switch: when True every intermediate column is returned

	sequence_len = len(token_sequence[0])
	token_positions = self.corpus.get_token_positions(token_sequence, index_id)

	if token_positions is None or token_positions[0].shape[0] == 0:
		logger.warning(f'Token "{token_str}" not found in the corpus.')
		return Result(type='collocates', df=pl.DataFrame(), title=f'No matches for "{token_str}"', description=f'{self.corpus.name}', summary_data={}, formatted_data=[])

	# number of occurrences of the node in the corpus
	node_occurrences = token_positions[0].shape[0]

	count_tokens, tokens_descriptor, total_descriptor = self.corpus.get_token_count_text(exclude_punctuation)

	formatted_data = []
	formatted_data.append(f'Report based on {tokens_descriptor}')

	# resolve context_length into separate left and right window sizes
	# bool is a subclass of int, it is rejected so True/False are not silently read as 1/0
	def _is_plain_int(value) -> bool:
		return isinstance(value, int) and not isinstance(value, bool)

	invalid_context_message = 'Context_length must be an int or a tuple of two ints (context_left, context_right).'
	if _is_plain_int(context_length):
		if context_length < 1:
			raise ValueError('Context length must be greater than 0')
		context_left = context_right = context_length
	elif isinstance(context_length, tuple):
		if len(context_length) != 2 or not all(_is_plain_int(value) for value in context_length):
			raise ValueError(invalid_context_message)
		context_left, context_right = context_length
		if context_left < 1 and context_right < 1:
			raise ValueError('If setting context lengths independently, at least one context length must be greater than 0')
		# one side may be 0 (no window on that side), but a negative window is meaningless
		if context_left < 0 or context_right < 0:
			raise ValueError('Context lengths must not be negative')
	else:
		# previously any other type fell through and failed later with an UnboundLocalError
		raise ValueError(invalid_context_message)

	formatted_data.append(f'Context tokens left: {context_left}, context tokens right: {context_right}')

	# getting context tokens
	left_tokens = self.corpus.get_tokens_in_context(token_positions=token_positions, index=index_column, context_length=context_left, position_offset=-1, position_offset_step = -1, exclude_punctuation=exclude_punctuation, convert_eof = True)
	right_tokens = self.corpus.get_tokens_in_context(token_positions=token_positions, index=index_column, context_length=context_right, position_offset=sequence_len, position_offset_step = 1, exclude_punctuation=exclude_punctuation, convert_eof = True)
	combined_tokens = np.concatenate([left_tokens.flatten(), right_tokens.flatten()])
	del left_tokens, right_tokens # release the window arrays before further allocations
	combined_tokens = combined_tokens[combined_tokens != 0] # removes punctuation and space placeholder
	# getting frequencies of collocates
	unique_token_ids, counts = np.unique(combined_tokens, return_counts=True)
	token_count_in_context_window = combined_tokens.shape[0]

	# tokens making up the node itself, at every position the node occurs
	node_tokens = self.corpus.get_tokens_in_context(token_positions=token_positions, index=index_column, context_length=sequence_len, position_offset=0,  position_offset_step = 1, exclude_punctuation=exclude_punctuation, convert_eof = True)
	unique_node_token_ids, node_counts = np.unique(node_tokens, return_counts=True)

	df = pl.DataFrame({
		'token_id': unique_token_ids,
		'collocate_frequency': counts
	})

	# for log likelihood calculation - need to have counts of node tokens to adjust collocate_frequency_outside_context
	node_frequency = np.zeros_like(unique_token_ids, dtype=np.int32)
	# collocates that are also part of the node get the node count, all others stay 0
	for i, token_id in enumerate(unique_node_token_ids):
		node_frequency[unique_token_ids == token_id] = node_counts[i]
	df = df.with_columns(
		pl.lit(node_frequency).alias('node_frequency')
	)

	# adding frequency of collocates in corpus
	df = df.join(self.corpus.vocab.collect().select(['token_id', 'token', frequency_column]), on='token_id', how='left', maintain_order='left')
	df = df.rename({frequency_column: 'frequency'})

	filtering_descriptors = []
	if min_collocate_frequency > 1: # applying min_frequency filter
		df = df.filter(pl.col('collocate_frequency') >= min_collocate_frequency)
		filtering_descriptors.append(f'minimum collocation frequency ({min_collocate_frequency})')
	if len(filtering_descriptors) > 0:
		formatted_data.append(f'Filtered tokens by {(", ".join(filtering_descriptors))}')

	if effect_size_measure == 'logdice':
		# logdice = 14 + log2((2 * collocate_frequency) / (node_frequency_in_corpus + collocate_frequency_in_corpus))
		df = df.with_columns(
			(pl.lit(14) + ((2 * pl.col('collocate_frequency')) / (pl.lit(node_occurrences) + pl.col('frequency'))).log(2))
			.alias('logdice')
		)
		columns.append('logdice')
	elif effect_size_measure == 'mutual_information':
		# mi = log2((corpus_token_count * collocate_frequency) / (node_frequency_in_corpus * collocate_frequency_in_corpus))
		df = df.with_columns(
			(pl.lit(count_tokens) * pl.col('collocate_frequency') / (pl.lit(node_occurrences) * pl.col('frequency'))).log(2)
			.alias('mutual_information')
		)
		columns.append('mutual_information')

	if statistical_significance_measure == 'log_likelihood':
		# based on calculation for keyness: https://ucrel.lancs.ac.uk/llwizard.html
		# a = collocate frequency in context, is collocate_frequency
		# b = collocate frequency outside context ...
		# reminder: for individual tokens constituting the node, collocate_frequency_outside_context should exclude frequency of tokens in node
		# clamped at 0 so the subtraction can never yield a negative count
		df = df.with_columns(
			pl.max_horizontal([(pl.col('frequency') - pl.col('collocate_frequency') - pl.col('node_frequency')), pl.lit(0)]).alias('collocate_frequency_outside_context')
		)

		# c = total tokens in context windows, is token_count_in_context_window calculated above
		# d = total tokens outside context windows (node tokens themselves are excluded as well)
		total_tokens_outside_context_window = count_tokens - token_count_in_context_window - (node_occurrences * sequence_len)

		# E1 = c*(a+b) / (c+d) 
		# E2 = d*(a+b) / (c+d)
		# E1 and E2
		df = df.with_columns(
			((pl.lit(token_count_in_context_window) * (pl.col('collocate_frequency') + pl.col('collocate_frequency_outside_context'))) / (pl.lit(token_count_in_context_window) + pl.lit(total_tokens_outside_context_window))).alias('expected_frequency1'),
			((pl.lit(total_tokens_outside_context_window) * (pl.col('collocate_frequency') + pl.col('collocate_frequency_outside_context'))) / (pl.lit(token_count_in_context_window) + pl.lit(total_tokens_outside_context_window))).alias('expected_frequency2'), 
		)

		# G2 = 2*((a*ln (a/E1)) + (b*ln (b/E2))) 
		# components of G2 as term1 and term 2 - (a*ln (a/E1)) (b*ln (b/E2))
		# a term is set to 0 when its observed count is 0, avoiding ln(0)
		df = df.with_columns([
			pl.when(pl.col('collocate_frequency') > 0)
			.then(pl.col('collocate_frequency') * (pl.col('collocate_frequency') / pl.col('expected_frequency1')).log())
			.otherwise(0)
			.alias('term1'),
			pl.when(pl.col('collocate_frequency_outside_context') > 0)
			.then(pl.col('collocate_frequency_outside_context') * (pl.col('collocate_frequency_outside_context') / pl.col('expected_frequency2')).log())
			.otherwise(0)
			.alias('term2') # 0 if no reference frequency
		])

		# G2
		df = df.with_columns(
			(2 * (pl.col('term1') + pl.col('term2'))).alias('log_likelihood')
		)
		columns.append('log_likelihood')

	unique_collocates = df.select(pl.len()).item()

	if statistical_significance_cut is not None and statistical_significance_cut > 0:
		p = statistical_significance_cut
		# NOTE(review): the descriptors below say "Keywords" although this is a collocation report - 
		# wording kept unchanged as callers may rely on it, consider changing to "Collocates"
		# bonferroni correction
		if apply_bonferroni:
			p_value_descriptor = f'Keywords filtered based on p-value {p} with Bonferroni correction (based on {unique_collocates} tests)'
			# with no rows left after frequency filtering there is nothing to correct for (and dividing by 0 would fail)
			if unique_collocates > 0:
				p = p / unique_collocates # adjust by criteria
		else:
			p_value_descriptor = f'Keywords filtered based on p-value: {p}'
		# critical value of the chi-squared distribution (1 degree of freedom) for the p-value
		cut = chi2.ppf(1 - p, df=1)		
		df = df.filter(pl.col('log_likelihood') > cut)
		formatted_data.append(p_value_descriptor)
		unique_collocates = df.select(pl.len()).item()

	formatted_data.append(f'Unique collocates: {unique_collocates:,.0f}')

	# prepare report information and paging ...
	if order is None:
		order = effect_size_measure
	df = df.sort(order, descending = order_descending).with_row_index('rank', offset=1)

	if page_size > 0:
		start = (page_current - 1) * page_size
		df = df.slice(start, page_size)

	# paging summary only applies when paging is active (page_size > 0), matching the slice above
	if page_size > 0 and unique_collocates > page_size:
		# ceiling division: an exact multiple of page_size must not report an extra empty page
		total_pages = (unique_collocates + page_size - 1) // page_size
		formatted_data.append(f'Showing {page_size} rows')
		formatted_data.append(f'Page {page_current} of {total_pages}')

	#formatted_data.append(f'{total_descriptor}: {count_tokens:,.0f}')
	if debug:
		columns = df.columns

	logger.info(f"Collocates calculated in {time.time() - start_time:.2f} seconds.")

	return Result(type = 'collocates', df = df.select(columns), title=f'Collocates of "{token_str}"', description=f'{self.corpus.name}', summary_data={}, formatted_data=formatted_data)
	

In [None]:
# Collocates reporter bound to the Reuters corpus loaded earlier in this notebook.
collocates = Collocates(reuters)

In [None]:
# Timed example: logdice collocates of each word within a 5-token window on each side, keeping collocates
# seen at least 5 times and filtering with a Bonferroni-corrected p-value cut-off of 0.0001; shows page 1.
for word in ["economy"]: # brown used 'i went in', 'any of us',  for testing "economy"
    %time collocates.collocates(word, order = None, order_descending = True, statistical_significance_cut = 0.0001, apply_bonferroni=True, effect_size_measure='logdice', context_length = 5, min_collocate_frequency = 5, page_current = 1).display()

"Collocates of ""economy""","Collocates of ""economy""","Collocates of ""economy""","Collocates of ""economy""","Collocates of ""economy""","Collocates of ""economy"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Token,Collocate Frequency,Frequency,Logdice,Log Likelihood
1,stimulate,29,85,10.39,206.37
2,boost,20,222,9.60,84.59
3,japanese,35,944,9.52,88.82
4,domestic,27,700,9.39,70.45
5,german,23,537,9.35,64.41
6,world,35,1173,9.32,75.37
7,grew,12,103,9.09,57.00
8,sluggish,10,44,8.94,61.75
9,economy,18,621,8.89,195.51
10,measures,13,288,8.87,37.66


CPU times: user 82.1 ms, sys: 165 ms, total: 247 ms
Wall time: 159 ms


In [None]:
#| hide
# Run nbdev's export step from inside the notebook (writes the cells marked `#| export` to the library module).
import nbdev; nbdev.nbdev_export()