# concordance

> Functionality for concordance analysis.

In [None]:
#| default_exp concordance

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
import numpy as np
import polars as pl
import math
from fastcore.basics import patch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, EOF_TOKEN_STR, ERR_TOKEN_STR

In [None]:
#| hide
from conc.core import set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory once. os.path.expanduser falls back to the platform's
# user lookup when HOME is unset, whereas os.environ.get("HOME") would return None
# and the f-strings would then produce paths beginning with the literal text "None".
home_path = os.path.expanduser('~')
source_path = f'{home_path}/data/'
save_path = f'{home_path}/data/conc-test-corpora/'

# Corpora used by the hidden checks and examples in this notebook.
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
#| export
class Concordance:
	""" Concordance reporting for a corpus. """

	def __init__(
		self,
		corpus: Corpus, # Corpus instance
	):
		# the corpus that every report run through this object queries
		self.corpus = corpus


In [None]:
#| exporti
@patch
def _get_concordance_sort(self:Concordance, 
						 token_positions: list[np.ndarray], # token index to get sort columns for
						 sort_columns: list # sort columns to use
						 ) -> tuple[np.ndarray, np.ndarray]: # token ids for first sort column and corresponding sort order
	""" Get the first sort column for a concordance.

	Only `token_positions[0]` and `sort_columns[0]` are read: each match position is shifted by the
	first sort offset and the token id found there is returned together with its sort order. """

	start_time = time.time()
	index = 'orth_index'
	# np.asarray makes the offset apply element-wise even if the positions arrive as a plain list
	seq = np.asarray(token_positions[0]) + sort_columns[0]
	sort_column_ids = self.corpus.get_tokens_by_index(index)[seq]
	sort_column_order = self.corpus.token_ids_to_sort_order(sort_column_ids)
	logger.info(f'Concordance sort column ({sort_column_ids.shape[0]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return sort_column_ids, sort_column_order


In [None]:
#| hide
# Load the toy corpus from disk and wrap it in a Concordance for the hidden checks in this notebook.
toy = Corpus().load(path_to_toy_corpus)
report_toy = Concordance(toy)

In [None]:
# load the Brown corpus from disk
brown = Corpus('brown').load(path_to_brown_corpus)

In [None]:
# instantiate the Concordance class for the loaded corpus
report_brown = Concordance(brown)

In [None]:
#| hide
# Look up every position of the query in the Brown corpus.
token_str = 'dog'
token_sequence, index_id = brown.tokenize(token_str, simple_indexing=True)
token_positions = brown.get_token_positions(token_sequence, index_id)

# Only the first sort offset (1, i.e. the token right after each match) is used by the helper.
sort_column_ids, sort_column_order = report_brown._get_concordance_sort(token_positions, [1, 2, 3])
# Show the first few token ids, the tokens they map to, and their sort order.
print(sort_column_ids[:4])
print(brown.token_ids_to_tokens(sort_column_ids)[:4])
print(sort_column_order[:4])


[29064 38309 33838 15829]
['license' '.' 'owners' 'catchers']
[29512    41 36156  9357]


In [None]:
#| export
@patch
def concordance(self: Concordance, 
				token_str: str, # token string to get concordance for 
				context_length:int = 5, # number of words to show on left and right of token string
				order:str='1R2R3R', # order of sort columns - one of 1L2L3L, 3L2L1L, 2L1L1R, 1L1R2R, 1R2R3R, LEFT, RIGHT
				page_size:int=PAGE_SIZE, # number of results to display per results page
				page_current:int=1, # current page of results
				show_all_columns:bool = False, # df with all columns or just essentials
				use_cache:bool = True # retrieve the results from cache if available (currently ignored)
				) -> Result: # concordance report results
	""" Report concordance for a token string.

	Lines are sorted on three context positions selected by `order` and one page of up to
	`page_size` lines is returned. Raises `ValueError` for an unknown `order`, a negative
	`context_length`, a `page_size` or `page_current` below 1, or a page past the last page. """

	# Only the first sort column is retrieved for every match. The remaining sort columns and the
	# context tokens are retrieved just for the rows that can end up on the requested page.
	# IDEA: potentially get sort columns until small enough result

	if order not in ['1L2L3L', '3L2L1L', '2L1L1R', '1L1R2R', '1R2R3R', 'LEFT', 'RIGHT']:
		raise ValueError(f'Invalid order: order must be one of: 1L2L3L, 3L2L1L, 2L1L1R, 1L1R2R, 1R2R3R, LEFT, RIGHT')
	if context_length < 0:
		raise ValueError('Invalid context_length: context_length must be 0 or greater')
	if page_size < 1 or page_current < 1:
		raise ValueError('Invalid page: page_size and page_current must be 1 or greater')

	# LEFT and RIGHT are aliases
	if order == 'LEFT':
		order = '1L2L3L'
	elif order == 'RIGHT':
		order = '1R2R3R'

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	sequence_len = len(token_sequence[0])
	# offsets relative to the first token of the match: left context, the match itself, right context
	concordance_range = range(-1 * context_length, context_length + sequence_len)
	positional_columns = [str(x) for x in concordance_range]

	index = 'orth_index'

	use_cache = False # forcing off for now

	cache_id = tuple(['concordance'] + list(token_sequence) + [order])

	if use_cache and cache_id in self.corpus.results_cache:
		logger.info('Using cached concordance results')
		# slot 0 holds the positional columns of the call that filled the cache; they depend on that
		# call's context_length, so the ones computed above for this call are kept instead
		_, concordance_df, total_count, total_docs, sort_columns = self.corpus.results_cache[cache_id]
	else:
		logger.info('Processing concordance results')
		token_positions = self.corpus.get_token_positions(token_sequence, index_id)

		if len(token_positions[0]) == 0:
			logger.info('No tokens found')
			return Result(type = 'concordance', df=pl.DataFrame(), title=f'Concordance for "{token_str}"', description='No matches', summary_data={}, formatted_data=[])

		# the first token to the right of the match sits at offset sequence_len
		first_right = sequence_len
		if order == '1L2L3L':
			sort_columns = [-1,-2,-3]
		elif order == '3L2L1L':
			sort_columns = [-3,-2,-1]
		elif order == '2L1L1R':
			sort_columns = [-2,-1,first_right]
		elif order == '1L1R2R':
			sort_columns = [-1,first_right,first_right + 1]
		else:
			# i.e. 1R2R3R
			sort_columns = [first_right,first_right + 1,first_right + 2]

		# getting first sort column here
		sort_column_ids, sort_column_order = self._get_concordance_sort(token_positions, sort_columns)
		
		concordance_df = pl.DataFrame([pl.Series(name='index', values=token_positions[0]), pl.Series(name='sort0', values=sort_column_order), pl.Series(name=str(sort_columns[0]), values=sort_column_ids)])
		concordance_df = concordance_df.sort('sort0')
		concordance_df = concordance_df.with_row_index('row')

		total_count = len(concordance_df)
		total_docs = len(np.unique(self.corpus.get_tokens_by_index('token2doc_index')[np.array(token_positions[0])])) # REFACTORED - was using old self.corpus.token2doc_index

		self.corpus.results_cache[cache_id] = [positional_columns, concordance_df, total_count, total_docs, sort_columns]

	total_pages = math.ceil(total_count/page_size)
	if page_current > total_pages:
		raise ValueError(f'Invalid page_current: page {page_current} requested but only {total_pages} pages of results are available')

	# working out relevant slice to populate 
	resultset_start = page_size*(page_current-1)
	# inclusive row number of the last line on the page
	resultset_end = min(resultset_start + page_size, len(concordance_df)) - 1
	
	# Rows that tie on sort0 are still unordered among themselves, so widen the slice to cover the
	# whole tie group at each end of the page before applying the secondary sort keys.
	start_order = concordance_df['sort0'][resultset_start]
	end_order = concordance_df['sort0'][resultset_end]
	start_order_pos = concordance_df.filter(pl.col("sort0") == start_order).head(1)['row'].item()
	end_order_pos = concordance_df.filter(pl.col("sort0") == end_order).tail(1)['row'].item()
	
	# populating a smaller chunk of the concordance report - as only need to retrieve/sort a subset
	concordance_result_df = concordance_df.slice(start_order_pos, end_order_pos - start_order_pos + 1)

	results_start_time = time.time()
	orth_tokens = self.corpus.get_tokens_by_index(index)
	concordance_columns = []
	seq = concordance_result_df['index'].to_numpy()
	for pos in concordance_range:
		tokens = orth_tokens[seq + pos]
		concordance_columns.append(pl.Series(name=str(pos), values=tokens))
		if pos in sort_columns:
			column_name = 'sort'+str(sort_columns.index(pos))
			if column_name != 'sort0':
				concordance_columns.append(pl.Series(name=column_name, values=self.corpus.token_ids_to_sort_order(tokens)))
	# With a short context some sort positions are not displayed; their sort keys are still needed
	# for the three-column sort below, so retrieve them separately.
	for sort_rank, pos in enumerate(sort_columns):
		if sort_rank > 0 and pos not in concordance_range:
			tokens = orth_tokens[seq + pos]
			concordance_columns.append(pl.Series(name='sort'+str(sort_rank), values=self.corpus.token_ids_to_sort_order(tokens)))
	logger.info(f'Concordance results ({len(concordance_columns[0])}) retrieval time: {(time.time() - results_start_time):.5f} seconds')

	concordance_result_df = concordance_result_df.with_columns(concordance_columns)
	document_ids = self.corpus.get_tokens_by_index('token2doc_index')[np.array(concordance_result_df['index'])] # REFACTORED to remove offsets functionality
	concordance_result_df = concordance_result_df.with_columns(pl.Series(name="document_id", values=document_ids))
	concordance_result_df = concordance_result_df.sort(['sort0','sort1','sort2'])
		
	# slicing this further to get only the required page of results and then populating with left, keyword, right strings
	# (the page starts resultset_start - start_order_pos rows into the widened slice; a negative
	# offset would make polars count from the end of the frame)
	concordance_view_df = concordance_result_df.slice(resultset_start - start_order_pos, page_size)
	view_len = len(concordance_view_df)

	concordance_left = []
	concordance_right = []
	concordance_keyword = []

	for pos in concordance_range:
		column_tokens = self.corpus.token_ids_to_tokens(concordance_view_df[str(pos)].to_numpy())
		if pos < 0:
			concordance_left.append(column_tokens)
		elif pos < sequence_len:
			concordance_keyword.append(column_tokens)
		else:
			concordance_right.append(column_tokens)

	# Context is cut at the nearest end-of-file marker so a line never shows text from a neighbouring document.
	# With context_length=0 there are no context columns, so fall back to one empty string per line.
	if concordance_left:
		concordance_left = [(' '.join(column)).split(EOF_TOKEN_STR)[-1] for column in np.array(concordance_left).T]
	else:
		concordance_left = [''] * view_len
	concordance_keyword = [' '.join(column) for column in np.array(concordance_keyword).T]
	if concordance_right:
		concordance_right = [(' '.join(column)).split(EOF_TOKEN_STR)[0] for column in np.array(concordance_right).T]
	else:
		concordance_right = [''] * view_len

	concordance_view_df = concordance_view_df.with_columns(pl.Series(name='left', values=concordance_left), pl.Series(name='node', values=concordance_keyword), pl.Series(name='right', values=concordance_right))

	summary_data = {'total_count': total_count, 'total_docs': total_docs, 'page': page_current, 'total_pages': total_pages}
	# report the number of lines actually on this page (the last page can be shorter than page_size)
	formatted_data = [f'Total Concordance Lines: {total_count}', f'Total Documents: {total_docs}', f'Showing {view_len} lines', f'Page {page_current} of {total_pages}']

	if not show_all_columns:
		concordance_view_df = concordance_view_df[['document_id', 'left', 'node', 'right']]
	
	logger.info(f'Concordance report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'concordance', df=concordance_view_df, title=f'Concordance for "{token_str}"', description=f'{self.corpus.name}, Context tokens: {context_length}, Order: {order}', summary_data=summary_data, formatted_data=formatted_data)


In [None]:
#| hide
# no result: a token string that does not occur in the corpus should give a report with an empty DataFrame
assert report_toy.concordance('dsahjhdsjhdsa', context_length=5).df.select(pl.len()).item() == 0

In [None]:
#| hide
# Concordance for a single-token query on the toy corpus, using the default 1R2R3R order.
report_toy.concordance('the', context_length=5).display()

"Concordance for ""the""","Concordance for ""the""","Concordance for ""the""","Concordance for ""the"""
"Toy Corpus, Context tokens: 5, Order: 1R2R3R","Toy Corpus, Context tokens: 5, Order: 1R2R3R","Toy Corpus, Context tokens: 5, Order: 1R2R3R","Toy Corpus, Context tokens: 5, Order: 1R2R3R"
Document Id,Left,Node,Right
5,,The,cat is climbing a tree
3,,The,cat is meowing .
1,,The,cat sat on the mat
4,,The,dog is barking .
6,,The,dog is digging a hole
2,,The,dog sat on the mat
1,The cat sat on,the,mat .
2,The dog sat on,the,mat .
Total Concordance Lines: 8,Total Concordance Lines: 8,Total Concordance Lines: 8,Total Concordance Lines: 8
Total Documents: 6,Total Documents: 6,Total Documents: 6,Total Documents: 6


In [None]:
#| hide
# Load the Reuters corpus from disk and wrap it in a Concordance for the multi-token checks.
reuters = Corpus().load(path_to_reuters_corpus)
conc_reuters = Concordance(reuters)

In [None]:
#| hide
# Concordance for a three-token query, sorted on the three tokens to the right of the match.
conc_reuters.concordance('the company said', context_length = 5, order='1R2R3R').display()

"Concordance for ""the company said""","Concordance for ""the company said""","Concordance for ""the company said""","Concordance for ""the company said"""
"Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R","Reuters Corpus, Context tokens: 5, Order: 1R2R3R"
Document Id,Left,Node,Right
2744,through a tender offer .,The company said,""" The negotiations would determine"
10501,1.25 dlrs a share .,The company said,""" this could bring earnings"
8353,of gold per ton .,The company said,& lt;Manitoba Mineral Resources Ltd
2186,". In a statement ,",the company said,", "" The SEC action"
8898,Co > of Japan .,The company said,", "" The discussions have"
6379,"In a brief statement ,",the company said,", "" We are studying"
6221,"special cost escrow accounts ,",the company said,", adding , that there"
4264,"close in near future ,",the company said,", adding it is prepared"
6319,"taxes . In addition ,",the company said,", Georgia Power 's contracts"
4664,the conversion of debentures .,The company said,", however , it expects"


In [None]:
# concordance for a two-token query with ten context tokens either side, sorted on the tokens to the right
report_brown.concordance('good at', context_length = 10, order='1R2R3R').display()

"Concordance for ""good at""","Concordance for ""good at""","Concordance for ""good at""","Concordance for ""good at"""
"Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R"
Document Id,Left,Node,Right
484,"about twenty miles away , and he was also pretty",good at,"anything in the carpentry line . was a vivid ,"
263,"he says , ' as a storyteller and was precociously",good at,"description , dialogue , and most of the other staples"
479,and not a method of passing the day . was,good at,his job . probably was n't hard for him to
474,trying to flatter her vanity . You must have been,good at,history at school . did you go to school ''
82,"enough of unequal merit , but all of them pretty",good at,that . consisted of a new arrangement of ` `
474,Why not '' ? ? said . I 'm not,good at,that kind of thing '' . This afternoon let 's
Total Concordance Lines: 6,Total Concordance Lines: 6,Total Concordance Lines: 6,Total Concordance Lines: 6
Total Documents: 5,Total Documents: 5,Total Documents: 5,Total Documents: 5
Showing 6 lines,Showing 6 lines,Showing 6 lines,Showing 6 lines
Page 1 of 1,Page 1 of 1,Page 1 of 1,Page 1 of 1


In [None]:
#| hide
# congress = Corpus().load(f'{save_path}us-congressional-speeches-subset-100k.corpus')
# report_congress = Concordance(congress)
# congress._init_token_arrays()

In [None]:
#| hide
# %time report_congress.concordance('god', context_length = 5, order='1R2R3R').display()

In [None]:
#| hide
# congress.tokenize('government', simple_indexing=True)

In [None]:
#| export
@patch
def concordance_plot(self: Concordance,
					 token_str: str, # token string to plot concordance lines for
					 page_size: int = 10 # number of documents to show per page of the plot
					 ) -> None:
	"""Display concordance plot.

	Each document containing the token string gets a horizontal band, with one vertical line per
	match placed at the match's relative position in the document. Prints a message and returns
	when there are no matches. Raises `ValueError` if `page_size` is below 1."""

	if page_size < 1:
		raise ValueError('Invalid page_size: page_size must be 1 or greater')

	# Tokenize and get positions
	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)
	token_positions = self.corpus.get_token_positions(token_sequence, index_id)

	if len(token_positions[0]) == 0:
		print("No matches found.")
		return

	document_ids = self.corpus.get_tokens_by_index('token2doc_index')[token_positions[0]]
	unique_document_ids = np.unique(document_ids)
	num_docs = len(unique_document_ids)
	num_pages = math.ceil(num_docs / page_size)
	
	# a plain comma-separated family list (no trailing CSS semicolon)
	font_family = "'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif"
	plots_per_page = page_size
	per_subplot_height = 50
	band_gap = 0.05
	context_width = 5 # tokens shown either side of the match position in the hover text

	# Loop invariants: build these once rather than per document or per match.
	match_positions = np.array(token_positions[0])
	orth_index = self.corpus.get_tokens_by_index('orth_index')
	eof_token = self.corpus.EOF_TOKEN
	tokens_df = self.corpus.tokens.with_row_index('position')

	# NOTE(review): the figure below is created without any base traces and the frame traces use
	# absolute y values on the first x/y axes - confirm the first page renders as intended.
	frames = []
	for page in range(num_pages):
		start = page * plots_per_page
		end = min(start + plots_per_page, num_docs)
		page_document_ids = unique_document_ids[start:end]
		n_bands = len(page_document_ids)
		data = []
		annotations = []

		for idx, doc_id in enumerate(page_document_ids):
			# bands are stacked top-down, so the first document on the page gets the highest base
			y_base = (n_bands - idx - 1) * (per_subplot_height + band_gap)
			# every doc_id comes from document_ids, so there is always at least one match here
			doc_positions = match_positions[document_ids == doc_id]

			# Get min/max positions for normalization
			doc_range = tokens_df.filter(
				pl.col('token2doc_index') == doc_id
			).select(['position']).collect().to_numpy()
			position_min = doc_range.min()
			position_max = doc_range.max()

			# relative position of each match within its document, as a percentage
			norm_pos = [
				(pos - position_min) / (position_max - position_min) * 100 if position_max > position_min else 0
				for pos in doc_positions
			]
			# Example context for each line
			examples = []
			for pos in doc_positions:
				match_pos = int(pos)
				# clamp at 0: a negative slice start would be read as counting from the end of the array
				window_start = max(match_pos - context_width, 0)
				tokens_for_example = orth_index[window_start:match_pos + context_width + 1]
				node_offset = match_pos - window_start
				# Keep only tokens from the match's own document: cut after the nearest end-of-file
				# marker on the left and before the nearest one on the right.
				eof_offsets = np.where(tokens_for_example == eof_token)[0]
				eof_before = eof_offsets[eof_offsets < node_offset]
				eof_after = eof_offsets[eof_offsets > node_offset]
				trim_start = eof_before[-1] + 1 if len(eof_before) > 0 else 0
				trim_end = eof_after[0] if len(eof_after) > 0 else len(tokens_for_example)
				tokens_for_example = tokens_for_example[trim_start:trim_end]
				tokens_for_example = self.corpus.token_ids_to_tokens(tokens_for_example)
				examples.append(' '.join(tokens_for_example))

			# Add concordance lines as vertical traces
			for x0, example in zip(norm_pos, examples):
				data.append(
					go.Scatter(
						x=[x0, x0],
						y=[y_base, y_base + per_subplot_height],
						mode='lines',
						line=dict(color='black', width=2),
						showlegend=False,
						hoverinfo='text',
						hovertext=example
					),
				)
			# Doc label and line count as annotation
			lines_count = len(norm_pos)
			doc_label = f"Doc {doc_id}"
			lines_string = f"{lines_count} line" + ("s" if lines_count != 1 else "")
			annotations.append(
				dict(
					text=doc_label,
					x=-5, y=y_base + per_subplot_height * 0.65,
					xref="x", yref="y",
					showarrow=False,
					xanchor="right",
					yanchor="middle",
					font=dict(size=12, family=font_family)
				)
			)
			annotations.append(
				dict(
					text=lines_string,
					x=-5, y=y_base + per_subplot_height * 0.22,
					xref="x", yref="y",
					showarrow=False,
					xanchor="right",
					yanchor="middle",
					font=dict(size=11, color="gray", family=font_family)
				)
			)

		# one frame per page; the slider steps below select frames by this name
		frames.append(go.Frame(
			data=data,
			name=f"{page+1}",
			layout=go.Layout(
				annotations=annotations
			)
		))

	fig = make_subplots(
		rows=plots_per_page,
		cols=1,
		vertical_spacing=0.05,
		subplot_titles=[None] * plots_per_page,

	)

	# grey background band and hidden, fixed axes for every row
	for row in range(1, plots_per_page + 1):
		fig.add_shape(
			type="rect",
			x0=0, y0=0, x1=100, y1=1,
			line=dict(color="black", width=1),
			fillcolor="gray",
			opacity=0.2,  # more visible
			layer="below",
			row=row, col=1
		)

		fig.update_xaxes(range=[0, 100], visible=False, fixedrange=True, row=row, col=1)
		fig.update_yaxes(range=[0, 1], visible=False, fixedrange=True, row=row, col=1)

	fig.update_layout(
		height=per_subplot_height * plots_per_page,
		width=600,
		showlegend=False,
		title_text=f'Concordance Plot for "{token_str}"',
		title_x=0.5,
		title_y=0.95,
		margin=dict(t=50, b=150, l=80, r=20),
		font=dict(family=font_family, color="black", size=12),
		plot_bgcolor="white",  # ensure white background
		paper_bgcolor="white"
	)

	footer_text = f"{self.corpus.name}<br>Total Documents: {num_docs}<br>Total Concordance Lines: {len(token_positions[0])}"

	fig.update_layout(
		annotations=[
			dict(
				text=footer_text,
				xref="paper", yref="paper",
				x=0, y=-0.30,  # y < 0 puts it below the plot area; adjust as needed
				showarrow=False,
				xanchor="left",
				yanchor="top",
				align="left",
				font=dict(size=12, color="black", family=font_family)
			)
		]
	)

	fig.frames = frames

	# Slider for paging: each step animates straight to the frame of that page
	fig.update_layout(
		sliders=[{
			"active": 0,
			"currentvalue": {"prefix": "Page: "},
			"pad": {"t": 10, "b": 10},
			"steps": [
				{
					"method": "animate",
					"args": [[f.name], {"frame": {"duration": 0, "redraw": True}, "mode": "immediate"}],
					"label": str(i+1),
				}
				for i, f in enumerate(frames)
			]
		}]
	)

	# NOTE(review): staticPlot presumably turns off interactivity, which may also disable the hover
	# text and the page slider - confirm this is intended.
	fig.show(config={'staticPlot': True})  # Renders in notebook or browser	

	# def update(change):
	# 	page = change['new']
	# 	fig_widget.data = ()  # Remove all previous traces
	# 	fig_widget.layout.annotations = ()

	# 	start = (page - 1) * plots_per_page
	# 	end = min(start + plots_per_page, num_docs)
	# 	page_document_ids = unique_document_ids[start:end]

	# 	# Get min/max positions for normalization
	# 	doc_range = self.corpus.tokens.with_row_index('position').filter(
	# 		pl.col('token2doc_index').is_in(page_document_ids)
	# 	).group_by('token2doc_index').agg([
	# 		pl.col('position').min().alias('min'),
	# 		pl.col('position').max().alias('max')
	# 	]).collect()

	# 	# Prepare normalized positions for each doc on this page
	# 	documents = [[] for _ in range(len(page_document_ids))]
	# 	normalized_documents = [[] for _ in range(len(page_document_ids))]
	# 	examples = [[] for _ in range(len(page_document_ids))]
	# 	for i, doc_id in enumerate(document_ids):
	# 		if doc_id not in page_document_ids:
	# 			continue
	# 		doc_index = np.where(page_document_ids == doc_id)[0][0]
	# 		documents[doc_index].append(token_positions[0][i])
	# 		position_min, position_max = doc_range.filter(
	# 			pl.col('token2doc_index') == doc_id
	# 		).select(['min', 'max']).to_numpy()[0]
	# 		norm_pos = (token_positions[0][i] - position_min) / (position_max - position_min) * 100 if position_max > position_min else 0
	# 		normalized_documents[doc_index].append(norm_pos)
	# 		tokens_for_example = self.corpus.get_tokens_by_index('orth_index')[token_positions[0][i]-5:token_positions[0][i]+6]
			
	# 		if self.corpus.EOF_TOKEN in tokens_for_example:
	# 			positions = np.where(tokens_for_example == self.corpus.EOF_TOKEN)[0]
	# 			if len(positions) > 0 and positions[0] < 5:
	# 				tokens_for_example = tokens_for_example[positions[0] + 1:]
	# 			else:
	# 				positions = np.where(tokens_for_example == self.corpus.EOF_TOKEN)[0]
	# 				if len(positions) > 0 and positions[-1] > 5:
	# 					tokens_for_example = tokens_for_example[:positions[-1]]
	# 		tokens_for_example = self.corpus.token_ids_to_tokens(tokens_for_example)
	# 		examples[doc_index].append(' '.join(tokens_for_example))

	# 		# yref = "y" if i == 0 else f"y{i + 1} domain"

	# 	n_plots_this_page = end - start
	# 	if n_plots_this_page < plots_per_page or ('old' in change and change['old'] == num_pages):
	# 		fig_widget.layout.shapes = ()
	# 		for idx in range(n_plots_this_page):
	# 			row = idx + 1
	# 			fig_widget.add_shape(
	# 				type="rect",
	# 				x0=0, y0=0, x1=100, y1=1,
	# 				line=dict(color="black", width=1),
	# 				fillcolor="gray",
	# 				opacity=0.08,
	# 				layer="below",
	# 				row=row, col=1
	# 			)

	# 	# Add traces for each subplot
	# 	for idx, positions in enumerate(normalized_documents):
	# 		row = idx + 1
	# 		for pos, x0 in enumerate(positions):
	# 			fig_widget.add_trace(
	# 				go.Scatter(
	# 					x=[x0, x0],
	# 					y=[0, 1],
	# 					mode='lines',
	# 					line=dict(color='black', width=2),
	# 					showlegend=False,
	# 					hoverinfo='text',
	# 					hovertext=f"{examples[idx][pos]}" if examples[idx] else ""
	# 				),
	# 				row=row, col=1
	# 			)

	# 		yref = "y" if row == 1 else f"y{row} domain"

	# 		# doc id from document_ids
	# 		doc_id = page_document_ids[idx]
	# 		fig_widget.add_annotation(
	# 			text=f"Doc {doc_id}",
	# 			xref="paper", yref=yref,
	# 			x=-0.03, y=0.65,
	# 			showarrow=False,
	# 			xanchor="right",
	# 			yanchor="middle",
	# 			font=dict(size=12, family = font_family)
	# 		)

	# 		lines_count = len(normalized_documents[idx])
	# 		if lines_count == 0:
	# 			lines_string = "No lines"
	# 		elif lines_count == 1:
	# 			lines_string = "1 line"
	# 		else:
	# 			# pluralize 'line' based on count
	# 			# e.g. "5 lines"
	# 			lines_string = f"{lines_count} lines"

	# 		fig_widget.add_annotation(
	# 			text=f"{lines_string}",
	# 			xref="paper", yref=yref,
	# 			x=-0.03, y=0.22, 
	# 			showarrow=False,
	# 			xanchor="right",
	# 			yanchor="middle",
	# 			font=dict(size=11, color="gray", family = font_family)
	# 		)



	# footer = widgets.HTML(
	# 	value=f"<div style='text-align: left; font-size: 12px; color: black; margin-left: 80px;margin-bottom:10px;line-height:1.7;'>{self.corpus.name}<br>Total Documents: {num_docs}<br>Total Concordance Lines: {len(token_positions[0])}</div>"
	# )

	# display(page_slider)
	# display(fig_widget)
	# display(footer)






In [None]:
#| hide
# Static HTML/SVG mock-up of the concordance plot, filled with dummy data.
# `escape` is imported before the name `html` is rebound to the page string below.
from html import escape

token_str = 'the company said' # escaped before it is interpolated into the markup
total_documents = 103
corpus_name = 'A Very Test Corpus'
concordance_lines = 120
plots_per_page = 10
total_pages = math.ceil(total_documents / plots_per_page)
row_height = 60
start_first_row_at = 30
row_adjustment = row_height - start_first_row_at
default_font_size = 12
subplot_height = 40
plot_height = start_first_row_at + (row_height * plots_per_page) + subplot_height - row_height
plot_x = 160
label_x_right = plot_x - 10
footer_margin = 20

html = '''<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8"><title>Conc Plot</title>
<style>
.conc-plot-wrapper {
background: white;
width:1000px;
color: #000;
font-family: 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif;
line-height: 1.1;
margin: 20px 0 20px 0;
'''
html += f'font-size: {default_font_size}px;'
html += '''
}

.conc-plot-wrapper h2 {
color: #000;
font-family: 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif;
text-align:center;
font-size: 24px;
font-weight: 600;
line-height: 2;
}

.conc-concordance-plot {

}

.conc-concordance-plot-summary {
margin: 0 40px 10px 160px;
color: #000;
font-size: 12px;
font-family: 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif;

}

.conc-concordance-plot-controls {
margin: 0 40px 20px 40px;
}

.conc-concordance-plot-controls input[type="range"] {
-webkit-appearance: none;
width: 100%;
height: 15px;
background: #ccc;
border-radius: 5px;
outline: none;
opacity: 0.7;
transition: opacity .2s;
}

.conc-concordance-plot-controls label {
font-family: 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif;
color: #000;
}

.conc-concordance-plot rect {
fill: #ccc;
width: 800px;
height: 40px;
}

.conc-concordance-plot .label {
font-family: 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Helvetica Neue', 'Fira Sans', 'Droid Sans', Arial, sans-serif;
}

#conc-concordance-plot g {
  background: transparent;
}

#conc-concordance-plot line {
  cursor: pointer;
}

#conc-concordance-plot line.conc-concordanceplot-line:hover + text {
  opacity:1;
}

#conc-concordance-plot line.highlight {
stroke: red !important;
}

</style></head>
<body>
'''

html += '<div class="conc-plot-wrapper">'
html += f'<h2>Concordance Plot for "{escape(token_str)}"</h2>'
html += f'<svg class="conc-concordance-plot" id="conc-concordance-plot" width="1000" height="{plot_height}" xmlns="http://www.w3.org/2000/svg">'

# The concordance lines are collected separately and appended after all the bands,
# so they are drawn on top of the grey rectangles.
line_fragments = []

for i in range(1, plots_per_page + 1):

  line_text = '1 line'
  row_top = (i * row_height) - row_adjustment

  html += f'<rect x="{plot_x}" y="{row_top}" height="{subplot_height}" width="800" />'
  html += f'<text class="label" x="{label_x_right}" y="{row_top + default_font_size*1.4}" font-size="{default_font_size}" text-anchor="end">Doc {i}</text>'
  html += f'<text class="label" x="{label_x_right}" y="{row_top + default_font_size*1.4*2}" font-size="{default_font_size}" text-anchor="end">{line_text}</text>'

  # create 1 to 9 dummy X values between 0 and 100 (randint's upper bound is exclusive),
  # scaled by 8 to span the 800px wide band
  x_values = np.random.uniform(0, 100, np.random.randint(1, 10)) * 8
  # draw vertical black line for each X value at plot_x + x_value with height = subplot_height
  for x_value in x_values:
    line_x = plot_x + x_value
    # a wide transparent line first (a larger pointer target), then the visible line, then its hidden hover label
    line_fragments.append(f'<g><line class="conc-concordanceplot-line" x1="{line_x}" y1="{row_top}" x2="{line_x}" y2="{row_top + subplot_height}" style="stroke:black;stroke-width:5;pointer-events:all;opacity:0;" />')
    line_fragments.append(f'<line class="conc-concordanceplot-line" x1="{line_x}" y1="{row_top}" x2="{line_x}" y2="{row_top + subplot_height}" style="stroke:black;stroke-width:2;pointer-events:all;" />')
    line_fragments.append(f'<text x="{line_x + 10}" y="{row_top - 10}" font-size="14" fill="black" style="opacity:0;">Example concordance line for Doc {i} at position {x_value}</text></g>')

lines_html = ''.join(line_fragments)
html += lines_html

# The slider's max follows total_pages so it always agrees with the "Page N of M" label.
html += f'''
</svg>

<div class="conc-concordance-plot-summary">
{escape(corpus_name)}<br>
Total Documents: {total_documents}<br>
Total Concordance Lines: {concordance_lines}
</div>
<div class="conc-concordance-plot-controls"><label for="conc-concordance-plot-slider" id="conc-concordance-plot-slider-label">Page <span id="conc-concordance-plot-page-number">1</span> of {total_pages}</label>
<input type="range" min="1" max="{total_pages}" value="1" step="1" class="slider" id="conc-concordance-plot-slider"></div>
</div>'''

# Both the slider and the hover handlers are set up by initPlot, whichever way the script is
# reached: immediately when the document is already parsed, or on DOMContentLoaded otherwise.
html += '''
<script>

function initSlider() {
  const slider = document.getElementById('conc-concordance-plot-slider');
  slider.addEventListener('input', function() {
    const page = this.value;

    // Update the label to show the current page
    const page_number = document.getElementById('conc-concordance-plot-page-number');
    page_number.textContent = `${page}`;

        // Update the SVG elements to show the correct data for the selected page
  });
}

function initHighlight() {
  var bars = document.getElementsByClassName('conc-concordanceplot-line');
  for (var i = 0; i < bars.length; i++) {
      bars[i].addEventListener('mouseover', mouseOverEffect);
      bars[i].addEventListener('mouseout', mouseOutEffect);
  }
}

function initPlot() {
  initSlider();
  initHighlight();
}

if (document.readyState !== 'loading') {
  initPlot();
} else {
  document.addEventListener('DOMContentLoaded', initPlot);
}



function mouseOverEffect() {
  this.classList.add("highlight");
}
function mouseOutEffect() {
  this.classList.remove("highlight");
}



</script>
</body></html>
'''

# Render the mock-up inline in the notebook.
# NOTE(review): `display` is not imported here - presumably the global that IPython provides; confirm.
from IPython.display import HTML
display(HTML(html))

# save html as tmp.html (written to the current working directory) so it can be opened in a browser
with open('tmp.html', 'w', encoding='utf8') as f:
  f.write(html)
# Run the Plotly-based concordance_plot for the same query on the Reuters corpus.
conc_reuters.concordance_plot('the company said', page_size=10)

In [None]:
#| hide
# Run nbdev's export step (kept in a hidden cell at the end of the notebook).
import nbdev; nbdev.nbdev_export()