# Collocates

> Functionality for collocation analysis.

In [None]:
#| default_exp collocates

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
import math

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# suppress=True makes numpy print small floats in fixed-point instead of scientific notation
np.set_printoptions(suppress=True)
# hands 'verbose' to conc.core's logger state setter for this notebook session
set_logger_state('verbose')

In [None]:
#| hide
# Locations of the prebuilt test corpora, all under <home>/data/.
# os.path.expanduser('~') is used instead of os.environ.get("HOME"): the latter returns None
# when HOME is unset, which the f-string would render as a literal "None/data/" path.
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'
path_to_gutenberg_corpus = f'{save_path}gutenberg.corpus'
path_to_gardenparty_corpus = f'{save_path}garden-party.corpus'
path_to_congress_corpus = f'{save_path}us-congressional-speeches-subset-100k.corpus'

In [None]:
#| hide
# Load the prebuilt test corpora from the path_to_*_corpus locations.
# Of these, only `reuters` is referenced by the demo cells visible in this notebook.
reuters = Corpus().load(path_to_reuters_corpus)
brown = Corpus().load(path_to_brown_corpus)
gardenparty = Corpus().load(path_to_gardenparty_corpus)
gutenberg = Corpus().load(path_to_gutenberg_corpus)
congress = Corpus().load(path_to_congress_corpus)

2025-06-06 14:04:12 - INFO - memory_usage - init, memory usage: 1775.4140625 MB
2025-06-06 14:04:13 - INFO - load - Load time: 0.207 seconds
2025-06-06 14:04:13 - INFO - memory_usage - init, memory usage: 1775.4140625 MB
2025-06-06 14:04:13 - INFO - load - Load time: 0.207 seconds
2025-06-06 14:04:13 - INFO - memory_usage - init, memory usage: 1775.4140625 MB
2025-06-06 14:04:13 - INFO - load - Load time: 0.211 seconds
2025-06-06 14:04:13 - INFO - memory_usage - init, memory usage: 1775.4140625 MB
2025-06-06 14:04:13 - INFO - load - Load time: 0.316 seconds
2025-06-06 14:04:13 - INFO - memory_usage - init, memory usage: 1773.4140625 MB
2025-06-06 14:04:14 - INFO - load - Load time: 0.209 seconds


In [None]:
#| export
class Collocates:
	""" Reporting class for collocation analysis over a single corpus. """

	def __init__(self,
			  corpus:Corpus # Corpus instance
			  ):
		# the only state held: every report method reads tokens and vocab through this reference
		self.corpus = corpus


In [None]:
#| exporti
@patch
def _shift_zeroes_to_end(self:Collocates,
						arr:np.ndarray # Numpy array of collocate frequencies to process
						):
	""" Move 0 value positions for punctuation and space removal """
	result = np.empty_like(arr)
	for col in range(arr.shape[1]):
		col_data = arr[:, col]
		mask = col_data != 0
		result[:mask.sum(), col] = col_data[mask]
		result[mask.sum():, col] = 0
	return result

In [None]:
#| exporti
@patch
def _zero_after_value(self:Collocates,
					  arr:np.ndarray, # Numpy array of collocate frequencies to process
					  target: int # Target value to find in the array (e.g., an end-of-file token or a specific collocate frequency)
					  ):
	""" Set values from first occurence of target value to 0 in each column (for processing tokens outside text using eof token) """
	arr = arr.copy()  
	for col in range(arr.shape[1]):
		col_data = arr[:, col]
		idx = np.where(col_data == target)[0]
		if idx.size > 0:
			first_idx = idx[0]
			arr[first_idx:, col] = 0
	return arr

TODO (learning journal / notebook): pick up "collocation dominoes".

In [None]:
#| exporti
@patch
def _get_collocates_in_context(self:Collocates,
							   token_positions:np.ndarray, # Numpy array of token positions in the corpus
							   index:str, # Index to use - lower_index, orth_index
							   context_length:int = 5, # Number of context words to consider on each side of the token
							   position_offset:int = 1 # offset to start retrieving context words - -1 for left, positive for right (may be adjusted by sequence_len)
							   ) -> Result:
	""" Get collocates in context for a given token index, operates one side at a time. """

	start_time = time.time()

	if context_length < 1:
		# return empty result
		return np.zeros((0, 0), dtype=np.int32)

	if position_offset < 0:
		position_offset_step = -1
	else:
		position_offset_step = 1
	
	collected = False
	context_tokens_arr = []
	while collected == False:
		new_positions = np.array(token_positions[0] + position_offset, dtype = token_positions[0].dtype)
		context_tokens_arr.append(self.corpus.get_tokens_by_index(index)[new_positions])
		position_offset += position_offset_step
		if len(context_tokens_arr) >= context_length: # cleaning spaces and punctuation and check if need more iterations
			context_tokens = np.array(context_tokens_arr, dtype = token_positions[0].dtype)
			context_tokens = np.where(np.isin(context_tokens, self.corpus.punct_tokens + self.corpus.space_tokens), 0, context_tokens)
			counts = np.count_nonzero(context_tokens, axis=0)
			if np.min(counts) < context_length:
				pass
			else:
				collected = True

	context_tokens = self._shift_zeroes_to_end(context_tokens)
	context_tokens = context_tokens[:context_length, :]

	if self.corpus.EOF_TOKEN in context_tokens:
		context_tokens = self._zero_after_value(context_tokens, self.corpus.EOF_TOKEN)

	logger.info(f"Collocates retrieved in {time.time() - start_time:.2f} seconds.")

	return context_tokens

In [None]:
#| export
@patch
def collocates(self:Collocates, 
				token_str:str, # Token to search for
				collocation_measure:str = 'logdice', # statistical measure to use for collocation calculation: logdice, mutual_information
				context_length:int|None=5, # Window size per side in tokens - use this for setting context lengths on left and right to same value
				context_left:int|None=None, # If context_left or context_right > 0 sets context lengths independently
				context_right:int|None=None, # see context_left
				min_collocate_frequency:int=5, # Minimum count of collocates
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result:
	""" Report collocates for a given token string.

	Collects the tokens in a window around every occurrence of the node, counts them, joins the
	corpus frequency of each collocate and computes the chosen measure plus log-likelihood.
	Rows are ranked by the chosen measure (descending) before paging is applied.
	Raises ValueError if `collocation_measure` is not 'logdice' or 'mutual_information'.
	"""

	# validate before doing any corpus work
	if collocation_measure not in ['logdice', 'mutual_information']:
		raise ValueError('Collocation measure must be one of "logdice" or "mutual_information"')

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	index_column = 'lower_index'
	frequency_column = 'frequency_lower'
	columns = ['rank', 'token', 'collocate_frequency', 'frequency']

	start_time = time.time()

	sequence_len = len(token_sequence[0])
	token_positions = self.corpus.get_token_positions(token_sequence, index_id)
	node_frequency = token_positions[0].shape[0] # number of occurrences of the node

	# NOTE(review): exclude_punctuation / exclude_spaces only choose the corpus token total used
	# by the statistics below; the context retrieval calls further down take no such flags.
	count_tokens = self.corpus.token_count
	tokens_descriptor = 'all tokens'
	total_descriptor = 'Total tokens'
	if exclude_punctuation and exclude_spaces:
		count_tokens = self.corpus.word_token_count
		tokens_descriptor = 'word tokens'
		total_descriptor = 'Total word tokens'
	elif exclude_punctuation:
		space_tokens_count = self.corpus.spaces.select(pl.len()).collect(engine='streaming').item()
		count_tokens = self.corpus.word_token_count + space_tokens_count
		tokens_descriptor = 'word and space tokens'
		total_descriptor = 'Total word and space tokens'
	elif exclude_spaces:
		punct_tokens_count = self.corpus.puncts.select(pl.len()).collect(engine='streaming').item()
		count_tokens = self.corpus.word_token_count + punct_tokens_count
		tokens_descriptor = 'word and punctuation tokens'
		total_descriptor = 'Total word and punctuation tokens'

	formatted_data = []
	formatted_data.append(f'Report based on {tokens_descriptor}')

	# if any of context_length, context_left, context_right are None - set them to 0
	if context_length is None:
		context_length = 0
	if context_left is None:
		context_left = 0
	if context_right is None:
		context_right = 0

	if context_left == 0 and context_right == 0:
		# no per-side setting: use the symmetric window
		context_left = context_length
		context_right = context_length
	elif (context_left > 0 or context_right > 0) and context_length > 0:
		logger.warning('Context length is ignored if either context_left or context_right is set to a value greater than 0. To remove this warning, set context_length to None or 0.')

	formatted_data.append(f'Context tokens left: {context_left}, context tokens right: {context_right}')

	# getting context tokens - left starts at the token before the node, right at the token after the node sequence
	left_tokens = self._get_collocates_in_context(token_positions=token_positions, index=index_column, context_length=context_left, position_offset=-1)
	right_tokens = self._get_collocates_in_context(token_positions=token_positions, index=index_column, context_length=context_right, position_offset=sequence_len)
	combined_tokens = np.concatenate([left_tokens.flatten(), right_tokens.flatten()])
	combined_tokens = combined_tokens[combined_tokens != 0] # removes punctuation and space placeholder
	token_count_in_context_window = combined_tokens.shape[0]

	# getting frequencies of collocates
	unique_token_ids, counts = np.unique(combined_tokens, return_counts=True)
	#unique_token_ids = unique_token_ids.astype(np.int32)

	df = pl.DataFrame({
		'token_id': unique_token_ids,
		'collocate_frequency': counts
	})

	# adding frequency of collocates in corpus
	df = df.join(self.corpus.vocab.collect().select(['token_id', 'token', frequency_column]), on='token_id', how='left', maintain_order='left')
	df = df.rename({frequency_column: 'frequency'})

	filtering_descriptors = []
	if min_collocate_frequency > 1: # applying min_frequency filter
		df = df.filter(pl.col('collocate_frequency') >= min_collocate_frequency)
		filtering_descriptors.append(f'minimum collocation frequency ({min_collocate_frequency})')
	if len(filtering_descriptors) > 0:
		formatted_data.append(f'Filtered tokens by {(", ".join(filtering_descriptors))}')

	if collocation_measure == 'logdice':
		# logdice = 14 + log2(2 * collocate_frequency / (node_frequency + frequency))
		df = df.with_columns(
			(pl.lit(14) + ((2 * pl.col('collocate_frequency')) / (pl.lit(node_frequency) + pl.col('frequency'))).log(2))
			.alias('logdice')
		)
		columns.append('logdice')
	elif collocation_measure == 'mutual_information':
		# mi = log2(count_tokens * collocate_frequency / (node_frequency * frequency))
		df = df.with_columns(
			(pl.lit(count_tokens) * pl.col('collocate_frequency') / (pl.lit(node_frequency) * pl.col('frequency'))).log(2)
			.alias('mutual_information')
		)
		columns.append('mutual_information')

	# based on calculation for keyness: https://ucrel.lancs.ac.uk/llwizard.html
	# a = collocate frequency in context, is collocate_frequency
	# b = collocate frequency outside context ...
	# TODO - add when syntax to handle case for individual tokens constituting the node, adjust collocate_frequency_outside_context by token_positions[0].shape[0]
	df = df.with_columns(
		(pl.col('frequency') - pl.col('collocate_frequency')).alias('collocate_frequency_outside_context')
	)

	# c = total tokens in context windows, is token_count_in_context_window calculated above
	# d = total tokens outside context windows (the node tokens themselves are excluded as well)
	total_tokens_outside_context_window = count_tokens - token_count_in_context_window - (node_frequency * sequence_len)

	# E1 = c*(a+b) / (c+d) 
	# E2 = d*(a+b) / (c+d)
	a_plus_b = pl.col('collocate_frequency') + pl.col('collocate_frequency_outside_context')
	c_plus_d = pl.lit(token_count_in_context_window) + pl.lit(total_tokens_outside_context_window)
	df = df.with_columns(
		((pl.lit(token_count_in_context_window) * a_plus_b) / c_plus_d).alias('expected_frequency1'),
		((pl.lit(total_tokens_outside_context_window) * a_plus_b) / c_plus_d).alias('expected_frequency2'), 
	)

	# G2 = 2*((a*ln (a/E1)) + (b*ln (b/E2))) 
	# components of G2 as term1 and term 2 - (a*ln (a/E1)) (b*ln (b/E2))
	# each term is taken as 0 when its observed frequency is not positive
	df = df.with_columns([
		pl.when(pl.col('collocate_frequency') > 0)
		.then(pl.col('collocate_frequency') * (pl.col('collocate_frequency') / pl.col('expected_frequency1')).log())
		.otherwise(0)
		.alias('term1'),
		pl.when(pl.col('collocate_frequency_outside_context') > 0)
		.then(pl.col('collocate_frequency_outside_context') * (pl.col('collocate_frequency_outside_context') / pl.col('expected_frequency2')).log())
		.otherwise(0)
		.alias('term2') # 0 if no reference frequency
	])

	# G2
	df = df.with_columns(
		(2 * (pl.col('term1') + pl.col('term2'))).alias('log_likelihood')
	)
	columns.append('log_likelihood')

	# prepare report information and paging ...
	df = df.sort(collocation_measure, descending = True).with_row_index('rank', offset=1)
	unique_collocates = df.select(pl.len()).item()
	formatted_data.append(f'Unique collocates: {unique_collocates:,.0f}')

	if page_size > 0:
		start = (page_current - 1) * page_size
		df = df.slice(start, page_size)

		if unique_collocates > page_size:
			# ceiling division: an exact multiple of page_size must not report an extra, empty page
			total_pages = (unique_collocates + page_size - 1) // page_size
			formatted_data.append(f'Showing {page_size} rows')
			formatted_data.append(f'Page {page_current} of {total_pages}')

	#formatted_data.append(f'{total_descriptor}: {count_tokens:,.0f}')

	logger.info(f"Collocates calculated in {time.time() - start_time:.2f} seconds.")

	return Result(type = 'collocates', df = df.select(columns), title=f'Collocates of "{token_str}"', description=f'{self.corpus.name}', summary_data={}, formatted_data=formatted_data)
	

In [None]:
# reporting object bound to the Reuters test corpus (the variable shares its name with the `collocates` method)
collocates = Collocates(reuters)

In [None]:
# Demo run: %time is an IPython magic, so this cell only works inside a notebook/IPython session
for word in ["fish"]: # brown used 'i went in', 'any of us',  for testing "economy"
    %time collocates.collocates(word, collocation_measure='logdice', context_length = 5, min_collocate_frequency = 5).display()

2025-06-06 15:29:19 - INFO - tokenize - Tokenization time: 0.00011 seconds
2025-06-06 15:29:19 - INFO - get_token_positions - Token indexing (31) time: 0.00165 seconds
2025-06-06 15:29:19 - INFO - _get_collocates_in_context - Collocates retrieved in 0.00 seconds.
2025-06-06 15:29:19 - INFO - _get_collocates_in_context - Collocates retrieved in 0.00 seconds.
2025-06-06 15:29:19 - INFO - collocates - Collocates calculated in 0.02 seconds.


[(np.int64(33020),)]


"Collocates of ""fish""","Collocates of ""fish""","Collocates of ""fish""","Collocates of ""fish""","Collocates of ""fish""","Collocates of ""fish"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Token,Collocate Frequency,Frequency,Logdice,Log Likelihood
1,meal,7,68,11.18,73.11
2,production,5,1468,6.77,18.23
3,and,17,25645,4.44,15.22
4,said,14,25379,4.17,9.23
5,the,19,69263,3.17,1.01
6,of,10,36779,3.15,0.49
7,to,9,36328,3.02,0.16
8,in,7,29252,2.97,0.07
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Context tokens left: 5, context tokens right: 5","Context tokens left: 5, context tokens right: 5","Context tokens left: 5, context tokens right: 5","Context tokens left: 5, context tokens right: 5","Context tokens left: 5, context tokens right: 5","Context tokens left: 5, context tokens right: 5"


CPU times: user 56.1 ms, sys: 3.95 ms, total: 60.1 ms
Wall time: 25.5 ms


In [None]:
#| hide
## Collocation methods
# 
# # @patch
# def collocates(self: Corpus, word, nice_word, constrain_to = False, context_length = 5, limit = 50, cutoff=5, stat = 'ld', output = False):
#     elements = {}
#     nodes = []
#     edges = []

#     coll_token_sequence, coll_index_id = tokenize_string(corpus_name, word)
#     token_id = coll_token_sequence[0][0]
    
#     #print(token_id, nice_word)
    
#     if constrain_to == False:
#         nodes.append((token_id, {'label': nice_word, 'size': 1}))
    
#     #print(coll_token_sequence, coll_index_id)
#     coll_token_index = profile_get_token_index(corpus_name, coll_token_sequence, coll_index_id) # get_token_positions
#     #print(coll_token_index)
#     positional_columns, concordance = profile_get_concordance(corpus_name, coll_token_sequence, coll_token_index, context_words = context_length, index_id = LOWER)
#     #print(concordance)
#     collocates = []
#     for row in concordance:
#         if eof_token in row:
#             indexes = np.where(np.array(row) == eof_token)[0]
#             #print(indexes)
#             slice_min = -1
#             slice_max = context_length * 2 + 1
#             for i in indexes:
#                 if i < context_length and i > slice_min:
#                     slice_min = i
#                 elif i > context_length and i < slice_max:
#                     #print('***')
#                     slice_max = i
#             slice_min += 1
#             #slice_max -= 1
#             #print(slice_min,slice_max)
#             #print(row)
#             #print(row[slice_min:slice_max])
#             collocates.append(row[slice_min:slice_max])
#         else:
#             collocates.append(row)

#     node_frequency = len(collocates)
            
#     if len(collocates) < 1:
#         print('no collocates')
#     else:
#         #print(collocates)
#         collocates = np.concatenate(collocates)
#         collocates = np.unique(collocates, axis=0, return_counts=True)
#         #collocates = collocates[collocates[:,1].argsort()]
        
#         #print(collocates)
#         logdices = []

#         for row in range(len(collocates[0])):
#             collocate = collocates[0][row]
#             collocate_count = collocates[1][row]
#             if constrain_to == False:
#                 pass
#             else:
#                 if loaded_corpora[corpus_name]['vocab'][collocate] in constrain_to:
#                     pass
#                     #print(corpus['vocab'][collocate],' in constrain_to', constrain_to)
#                 else:
#                     continue
#             if collocate in coll_token_sequence:
#                 #print('match china')
#                 pass
#             elif collocate_count > 1:
#                 #14 + log2D=14+log2(2fxy/(fx+fy))

#                 if re.search(_RE_PUNCT,loaded_corpora[corpus_name]['vocab'][collocate]) is None: # FIX - HAVE OPION TO REMOVE PUNC
#                     if collocate_count >= cutoff:
#                         logdice = 14 + math.log2((2 * collocate_count) / (node_frequency + loaded_corpora[corpus_name]['frequency_lookup'][collocate]))
#                         mi = math.log2((loaded_corpora[corpus_name]['token_count'] * collocate_count) / (node_frequency * loaded_corpora[corpus_name]['frequency_lookup'][collocate]))
#                         logdices.append([logdice, mi, collocate, loaded_corpora[corpus_name]['vocab'][collocate], collocate_count, loaded_corpora[corpus_name]['frequency_lookup'][collocate]])
#                         #if logdice > 8:
#                         #print(collocate, node_frequency, loaded_corpora[corpus_name]['vocab'][collocate], collocates[1][row],logdice, mi)
#                         matches = np.where(collocates == collocate)[0]
#                         collocate_count = len(matches)
#         #        if (collocate_count > 5):
    
#     top_collocates = []
#     if stat == 'mi':
#         sorted_collocates = sorted(logdices, reverse=True, key=lambda x: x[1])[0:limit]
#         for row in sorted_collocates:
#             nodes.append((row[2], {'label': row[3], 'size': 1}))
#             edges.append((token_id, row[2], {'weight': 1}))
#             #print(row)
#             top_collocates.append(row[3])
#     else:
#         sorted_collocates = sorted(logdices, reverse=True, key=lambda x: x[0])[0:limit]
#         for row in sorted_collocates:
#             nodes.append((row[2], {'label': row[3], 'size': 1}))
#             edges.append((token_id, row[2], {'weight': row[0]}))
#             top_collocates.append(row[3])
    
#     if constrain_to == False:
#         for top_collocate in top_collocates:
#             top_collocate_elements, top_collocate_sorted, df = profile_prepare_collocates(corpus_name, top_collocate, top_collocate, constrain_to = top_collocates, context_length=context_length, limit = limit, stat=stat)
#             for edge in top_collocate_elements['edges']:
#                 edges.append(edge) 
    
    
#     #display(sorted(logdices, reverse=True, key=lambda x: x[0])[0:limit])
    
#     elements['nodes'] = nodes
#     elements['edges'] = edges
#     df = pd.DataFrame(sorted_collocates, columns = ['logdice', 'mi', 'collocate_token_id', 'collocate', 'collocate_count', 'collocate_token_frequency'])
#     # if output != False:
#     #     if output == 'file':
#     #         with open(output_dir + corpus_name + '/collocates-' + stat + '-' + nice_word + '.html', 'w', encoding='utf8') as f:
#     #             f.write(df.to_html(classes='table table-stripped'))
#     return elements, sorted_collocates, df


In [None]:
#| hide
# run nbdev's export step so the cells marked `#| export` / `#| exporti` are written to the library module
import nbdev; nbdev.nbdev_export()