# concordance

> Functionality for concordance analysis.

In [None]:
#| default_exp concordance

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import time
import numpy as np
import polars as pl
import math
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, EOF_TOKEN_STR


In [None]:
#| export
class Concordance:
	""" Concordance reporting for a corpus. """

	def __init__(
		self,
		corpus: Corpus, # Corpus instance the concordance is built from
	):
		# the corpus is kept on the instance; methods access it as self.corpus
		self.corpus = corpus


In [None]:
#| exporti
@patch
def _get_concordance_sort(
	self: Concordance,
	token_index: list[np.ndarray], # token index to get sort columns for (only the first array is read)
	sort_columns: list, # sort column offsets to use (only the first offset is read)
) -> tuple[np.ndarray, np.ndarray]: # token ids for first sort column and corresponding sort order
	""" Retrieve the token ids and sort order of the first sort column for a concordance. """

	timer_start = time.time()

	# shift every position in the token index by the first sort offset
	first_offset = sort_columns[0]
	positions = np.array(token_index[0] + first_offset)

	# look up the token ids at the shifted positions and convert them to sort-order values
	column_token_ids = self.corpus.orth_index[positions]
	column_sort_order = self.corpus.token_ids_to_sort_order(column_token_ids)

	logger.info(f'Concordance sort column ({column_token_ids.shape[0]}) retrieval time: {(time.time() - timer_start):.5f} seconds')
	return column_token_ids, column_sort_order


In [None]:
#| hide
# path to the saved Brown corpus file loaded by the examples below
path_to_corpus_file = '../test-corpora/saved/brown.corpus'

In [None]:
# load the saved Brown corpus from path_to_corpus_file
brown = Corpus('brown').load(path_to_corpus_file)

In [None]:
# instantiate the Concordance class with the loaded Brown corpus
report_brown = Concordance(brown)

In [None]:
#| hide
# check _get_concordance_sort on the matches for 'dog', sorting on the offsets 1, 2 and 3
token_str = 'dog'
brown_token_sequence, brown_index_id = brown.tokenize(token_str, simple_indexing=True)
brown_token_index = brown.get_token_index(brown_token_sequence, brown_index_id)

right_offsets = [1, 2, 3]
sort_column_ids, sort_column_order = report_brown._get_concordance_sort(
	brown_token_index,
	right_offsets,
)

# show the first four token ids, their token strings and their sort-order values
preview_count = 4
print(sort_column_ids[:preview_count])
print(brown.token_ids_to_tokens(sort_column_ids)[:preview_count])
print(sort_column_order[:preview_count])


[29064 38309 33838 15829]
['license' '.' 'owners' 'catchers']
[29512    40 36157  9356]


In [None]:
#| export
@patch
def concordance(
	self: Concordance,
	token_str: str, # token string to get concordance for
	context_words: int = 5, # number of words to show on left and right of token string
	order: str = '1R2R3R', # order of sort columns: '1L2L3L', '3L2L1L', '2L1L1R', '1L1R2R' or '1R2R3R' (any other value is treated as '1R2R3R')
	page_size: int = PAGE_SIZE, # number of results to display per results page
	page_current: int = 0, # current page of results (zero-based)
	show_all_columns: bool = False, # df with all columns or just essentials
	use_cache: bool = True, # retrieve the results from cache if available
) -> Result: # concordance report results
	""" Report concordance for a token string.

	Only the first sort column is computed for every match; it is cached on the corpus,
	keyed by token sequence and order. The remaining sort columns and the context tokens
	are retrieved just for the rows needed to build the requested page.
	"""

	# TODO: improve ordering so not fixed options e.g. include 3R1R2R
	# TODO: add in ordering by metadata columns or doc
	# TODO: look at retrieval of document_ids - use token2doc_index
	# TODO: avoid any duplication related to retrieval of concordance vectors
	# IDEA: potentially get sort columns until small enough result

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	sequence_len = len(token_sequence[0])
	# offsets relative to the first keyword token: negative = left context, 0..sequence_len-1 = keyword, rest = right context
	concordance_range = range(-1 * context_words, context_words + sequence_len)
	positional_columns = [str(x) for x in concordance_range]

	cache_id = tuple(['concordance'] + list(token_sequence) + [order])

	if use_cache and cache_id in self.corpus.results_cache:
		logger.info('Using cached concordance results')
		# The cached positional columns (entry 0) are deliberately ignored: they depend on
		# context_words, which is not part of the cache key, so they may not match this call.
		cached = self.corpus.results_cache[cache_id]
		concordance_df, total_count, total_docs, sort_columns = cached[1], cached[2], cached[3], cached[4]
	else:
		logger.info('Processing concordance results')
		token_index = self.corpus.get_token_index(token_sequence, index_id)

		if len(token_index[0]) == 0:
			logger.info('No tokens found')
			# NOTE(review): this is a tuple rather than a Result; kept as-is for backward compatibility
			return None, {}, []

		# offset of the first token to the right of the keyword (1R)
		first_right = sequence_len
		sort_columns_by_order = {
			'1L2L3L': [-1, -2, -3],
			'3L2L1L': [-3, -2, -1],
			'2L1L1R': [-2, -1, first_right],
			'1L1R2R': [-1, first_right, first_right + 1],
		}
		# anything else is treated as 1R2R3R
		sort_columns = sort_columns_by_order.get(order, [first_right, first_right + 1, first_right + 2])

		# getting first sort column here
		sort_column_ids, sort_column_order = self._get_concordance_sort(token_index, sort_columns)

		concordance_df = pl.DataFrame([
			pl.Series(name='index', values=token_index[0]),
			pl.Series(name='sort0', values=sort_column_order),
			pl.Series(name=str(sort_columns[0]), values=sort_column_ids),
		])
		concordance_df = concordance_df.sort('sort0')
		concordance_df = concordance_df.with_row_index('row')

		total_count = len(concordance_df)
		total_docs = len(np.unique(self.corpus.token2doc_index[np.array(token_index[0])]))

		# entry 0 is kept so the cache layout is unchanged
		self.corpus.results_cache[cache_id] = [positional_columns, concordance_df, total_count, total_docs, sort_columns]

	# working out relevant slice to populate
	resultset_start = page_size * page_current
	resultset_end = min(resultset_start + page_size, len(concordance_df) - 1)

	# Widen the slice to whole sort0 groups, so that sorting the slice on the remaining
	# sort columns gives the same order as sorting the full result would.
	start_order = concordance_df['sort0'][resultset_start]
	end_order = concordance_df['sort0'][resultset_end]
	start_order_pos = concordance_df.filter(pl.col("sort0") == start_order).head(1)['row'].item()
	end_order_pos = concordance_df.filter(pl.col("sort0") == end_order).tail(1)['row'].item()

	# populating a smaller chunk of the concordance report - as only need to retrieve/sort a subset
	concordance_result_df = concordance_df.slice(start_order_pos, end_order_pos - start_order_pos + 1)

	results_start_time = time.time()
	orth_index = self.corpus.orth_index
	seq = concordance_result_df['index'].to_numpy()
	concordance_columns = []
	for pos in concordance_range:
		tokens = orth_index[np.array(seq + pos)]
		concordance_columns.append(pl.Series(name=str(pos), values=tokens))
		if pos in sort_columns:
			sort_rank = sort_columns.index(pos)
			if sort_rank != 0: # sort0 is already in the frame
				concordance_columns.append(pl.Series(name=f'sort{sort_rank}', values=self.corpus.token_ids_to_sort_order(tokens)))
	# Sort offsets outside the displayed context (small context_words) still need their
	# sort column, otherwise the multi-column sort below has nothing to sort on.
	for sort_rank, pos in enumerate(sort_columns):
		if sort_rank != 0 and pos not in concordance_range:
			tokens = orth_index[np.array(seq + pos)]
			concordance_columns.append(pl.Series(name=f'sort{sort_rank}', values=self.corpus.token_ids_to_sort_order(tokens)))
	logger.info(f'Concordance results ({len(seq)}) retrieval time: {(time.time() - results_start_time):.5f} seconds')

	concordance_result_df = concordance_result_df.with_columns(concordance_columns)
	offsets_arr = np.array(self.corpus.offsets, dtype=np.uint64)
	# document id = index of the last document offset that is <= the token position
	document_ids = np.searchsorted(offsets_arr, concordance_result_df['index'], side='right') - 1
	concordance_result_df = concordance_result_df.with_columns(pl.Series(name="document_id", values=document_ids))
	concordance_result_df = concordance_result_df.sort(['sort0', 'sort1', 'sort2'])

	# Slicing this further to get only the required page of results.
	# The slice begins at overall row start_order_pos, so the page begins
	# (resultset_start - start_order_pos) rows into it. This is never negative; a negative
	# offset would be read by polars as counting from the end.
	concordance_view_df = concordance_result_df.slice(resultset_start - start_order_pos, page_size)
	view_count = len(concordance_view_df)

	# populating with left, keyword, right strings
	left_columns = []
	keyword_columns = []
	right_columns = []
	for pos in concordance_range:
		tokens = self.corpus.token_ids_to_tokens(concordance_view_df[str(pos)])
		if pos < 0:
			left_columns.append(tokens)
		elif pos == 0 or pos < sequence_len:
			keyword_columns.append(tokens)
		else:
			right_columns.append(tokens)

	def _joined_rows(columns: list) -> list:
		# one space-joined string per row; empty strings when there are no columns (e.g. context_words=0)
		if not columns:
			return [''] * view_count
		return [' '.join(row) for row in np.array(columns).T]

	# drop anything beyond an end-of-file token: keep text after it on the left, before it on the right
	concordance_left = [text.split(EOF_TOKEN_STR)[-1] for text in _joined_rows(left_columns)]
	concordance_keyword = _joined_rows(keyword_columns)
	concordance_right = [text.split(EOF_TOKEN_STR)[0] for text in _joined_rows(right_columns)]

	concordance_view_df = concordance_view_df.with_columns(
		pl.Series(name="left", values=concordance_left),
		pl.Series(name="keyword", values=concordance_keyword),
		pl.Series(name="right", values=concordance_right),
	)

	total_pages = math.ceil(total_count / page_size)
	summary_data = {'total_count': total_count, 'total_docs': total_docs, 'page': page_current, 'total_pages': total_pages}
	# report the rows actually on this page (the last page can be shorter than page_size)
	formatted_data = [f'Total Concordance Rows: {total_count}', f'Total Documents: {total_docs}', f'Showing {view_count} rows', f'Page {page_current+1} of {total_pages}']

	if not show_all_columns:
		concordance_view_df = concordance_view_df[['document_id', 'left', 'keyword', 'right']]

	logger.info(f'Concordance report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'concordance', df=concordance_view_df, title=f'Concordance for "{token_str}"', description=f'Context tokens: {context_words}, Order: {order}', summary_data=summary_data, formatted_data=formatted_data)


In [None]:
# example: concordance for 'cause' with 10 context tokens, using the 1R2R3R order
report_brown.concordance('cause', context_words = 10, order='1R2R3R').display()


"Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause"""
"Context tokens: 10, Order: 1R2R3R","Context tokens: 10, Order: 1R2R3R","Context tokens: 10, Order: 1R2R3R","Context tokens: 10, Order: 1R2R3R"
Document Id,Left,Keyword,Right
86,abstract principle connected with it -- such as ` `,cause,"'' . all practical purposes , the West stands disunited"
195,to stand or fall only by the merits of my,cause,'' . seven recognized that independence was but the first
99,professionals '' but agitators for some kind of ` `,cause,"'' or ` ` reform '' , and this was"
206,he really wants is to find ` ` a sacred,cause,'' to which he can honestly devote himself . restless
262,` of '' that lost ` ` and '' dying,cause,", ` ` and in the '' ` ` sprung"
199,things happening in the earth and sky with no discernable,cause,", and these they attribute to the will of God"
287,"sign the request , because of illness or other good",cause,", another person who stands in close personal or business"
252,not be at any fault for money for prosecuting the,cause,", for himself will procure it and lay it down"
75,short views -- only up to lunchtime '' . the,cause,", his mood in the fifties rarely rises above the"
239,"Mando , pleading her",cause,", must have said that Dr. Brown was the most"


In [None]:
#| hide
# run nbdev's export step (presumably writes the exported cells to the library module)
import nbdev; nbdev.nbdev_export()