# ngrams

> Functionality for ngram analysis.

In [None]:
#| default_exp ngrams

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from spacy.attrs import ORTH, LOWER # remove? - add ENT_TYPE, ENT_IOB
import math
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, set_logger_state

In [None]:
#| export
class Ngrams:
	""" Reporting interface for n-gram analysis over a corpus. """

	def __init__(
		self,
		corpus: Corpus, # Corpus instance
	):
		# keep a reference to the corpus the reports are generated from
		self.corpus = corpus


In [None]:
#| exporti
@patch
def _get_ngrams(
	self: Ngrams,
	token_sequence: list[np.ndarray], # token sequence(s) to build ngrams around
	index_id: int, # index to search (i.e. ORTH, LOWER)
	token_positions: list[np.ndarray], # positions of token sequence, returned by get_token_positions
	ngram_length: int = 2, # length of ngram
	ngram_word_position: str = 'LEFT', # specify if token sequence is on LEFT, RIGHT, or MIDDLE of ngrams
) -> np.ndarray: # array of ngrams results
	""" Build the ngram token-id matrix for a token sequence.

	The result has one row per ngram slot and one column per match position.
	Columns in which a slot outside the query holds the corpus EOF token are
	removed before returning.
	"""

	timer_start = time.time()
	first_variant = token_sequence[0]
	match_starts = token_positions[0]
	n_query_tokens = len(first_variant)
	n_matches = len(match_starts)
	single_variant = len(token_sequence) == 1

	index_name = 'orth_index' if index_id == ORTH else 'lower_index'

	# offset of the first ngram slot relative to the start of each match;
	# any value other than RIGHT or MIDDLE is treated as LEFT
	if ngram_word_position == 'RIGHT':
		first_offset = n_query_tokens - ngram_length
	elif ngram_word_position == 'MIDDLE':
		first_offset = n_query_tokens - ngram_length + 1
	else:
		first_offset = 0
	offsets = range(first_offset, first_offset + ngram_length)

	def _row(offset):
		# a slot inside a single-variant query is a known constant, so no corpus lookup is needed
		if single_variant and 0 <= offset < n_query_tokens:
			return np.full(n_matches, first_variant[offset])
		return self.corpus.get_tokens_by_index(index_name)[match_starts + offset]

	rows = np.stack([_row(offset) for offset in offsets])

	# only slots outside the query are checked for EOF_TOKEN, to filter out ngrams crossing doc boundaries
	offset_values = np.array(offsets)
	outside_query = (offset_values < 0) | (offset_values >= n_query_tokens)
	_, eof_columns = np.where(rows[outside_query] == self.corpus.EOF_TOKEN)
	rows = np.delete(rows, eof_columns, axis=1)

	logger.info(f'Ngrams ({rows.shape[1]}) retrieval time: {(time.time() - timer_start):.5f} seconds')
	return rows


In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with expanduser: it returns $HOME when that is set,
# and still yields a real directory when HOME is missing (os.environ.get would
# silently produce the literal path 'None/data/').
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{os.path.expanduser("~")}/data/conc-test-corpora/'

# locations of the saved test corpora
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
#| hide
# load the toy corpus and create an Ngrams reporter for it
toy = Corpus().load(path_to_toy_corpus)
report_toy = Ngrams(toy)

In [None]:
#| hide
# tokenize 'dog', look up its positions in the toy corpus, then fetch
# length-2 ngrams with the query token in the LEFT (first) slot
token_str = 'dog'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
ngrams = report_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_word_position = 'LEFT')
# one row per ngram slot, one column per match
print(ngrams)

[[11 11 11]
 [12  2  2]]


In [None]:
# load the Brown corpus from its saved location
brown = Corpus().load(path_to_brown_corpus)

In [None]:
# instantiate the Ngrams class for the Brown corpus
report_brown = Ngrams(brown)

In [None]:
#| hide
# same check as for the toy corpus, run against the Brown corpus:
# length-2 ngrams with 'dog' in the LEFT (first) slot
token_str = 'dog'
token_sequence, index_id = brown.tokenize(token_str, simple_indexing=True)
token_positions = brown.get_token_positions(token_sequence, index_id)
ngrams = report_brown._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_word_position = 'LEFT')
# one row per ngram slot, one column per match
print(ngrams)

[[23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289 23289
  23289 23289 23289 23289 23289 23289 23289]
 [29064 38309 33838 15829   795 47534  9374 42833 36136 21550 17435 43533
  25158  8128 11957  5963  5280 14413 16089  8128 51356 51356 13607 15531
   1147  5280 27404  1916  5963 38309  5280 33838 20463 38309 49761 27963
  55470 55470 27831  9374  8128 38309 38309 38309  8128  9374  8128 30252
  35250 32807 49398 38309  7126 43244 38309  8128  8128 38309  9374  8128
  38309 22848 34812 21550 49732 27963  8128 13462 38382 38309 49732 42833
  55687 38309 15800  3883 38309 46196 14474]]


In [None]:
#| export
@patch
def ngrams(self: Ngrams, 
		   token_str: str, # token string to get ngrams for 
		   ngram_length:int = 2, # length of ngram
		   ngram_word_position:str = 'LEFT', # specify if token sequence is on LEFT, RIGHT, or MIDDLE of ngrams
		   page_size:int = PAGE_SIZE, # number of results to display per results page 
		   page_current:int = 1, # current page of results
		   show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
		   use_cache:bool = True # retrieve the results from cache if available
		   ) -> Result: # return a Result object with ngram data
	""" Report ngrams for a token string.

	The full ranked report (unique ngrams with frequencies) is built once and
	stored in the corpus results cache; only the requested page is converted
	from token ids to text.
	"""

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	# paging options are not part of the key: the whole report is cached and sliced per request
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_word_position])

	if use_cache and cache_id in self.corpus.results_cache:
		logger.info('Using cached ngrams results')
		cached = self.corpus.results_cache[cache_id]
		ngrams_report, total_unique, total_count = cached[0], cached[1], cached[2]
	else:
		token_positions = self.corpus.get_token_positions(token_sequence, index_id)
		
		if len(token_positions[0]) == 0:
			logger.info('No tokens found')
			# NOTE(review): this tuple does not match the declared Result return type;
			# kept as-is because existing callers may rely on it — confirm and align.
			return None, {}, []

		logger.info('Generating ngrams results')
		ngram_ids = self._get_ngrams(token_sequence, index_id, token_positions, ngram_length = ngram_length, ngram_word_position = ngram_word_position)
		# one column per ngram occurrence
		total_count = ngram_ids.shape[1]
		schema = [f'token_{i+1}' for i in range(ngram_length)]
		# pack each ngram's token ids into a struct so value_counts counts whole ngrams
		ngrams_report = pl.DataFrame(ngram_ids.T, schema=schema).to_struct(name = 'ngram_token_ids').value_counts(sort=True).rename({"count": "frequency"})
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		total_unique = len(ngrams_report)
		self.corpus.results_cache[cache_id] = (ngrams_report, total_unique, total_count)
	
	resultset_start = page_size*(page_current-1)

	# get specific chunk of report into polars based on resultset_start:
	ngrams_report_page = ngrams_report.slice(resultset_start, page_size).unnest('ngram_token_ids')
	token_strs = np.array([
		self.corpus.token_ids_to_tokens(ngrams_report_page[f'token_{i+1}'].to_numpy())
		for i in range(ngram_length)
	])
	# transpose so each entry holds the tokens of one ngram, then join them into display text
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text))
	total_pages = math.ceil(total_unique/page_size)
	# rows actually on this page: fewer than page_size on a short or final page
	rows_shown = len(ngrams_report_page)
	summary_data = {'ngram_length': ngram_length, 'ngram_word_position': ngram_word_position, 'total_unique': total_unique, 'total_count': total_count, 'page_current': page_current, 'total_pages': total_pages}
	formatted_data = [f'Total unique ngrams: {total_unique:,}', f'Total ngrams: {total_count:,}', f'Showing {rows_shown} rows', f'Page {page_current} of {total_pages}'] 

	if not show_all_columns:
		ngrams_report_page = ngrams_report_page[['rank', 'ngram', 'frequency']]
	
	logger.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'ngrams', df=ngrams_report_page, title=f'Ngrams for "{token_str}"', description=f'Ngram length: {ngram_length}, Token position: {ngram_word_position.lower()}', summary_data=summary_data, formatted_data=formatted_data)


In [None]:
# report length-2 ngrams with 'dog' in the left position and display the results
report_brown.ngrams('dog', ngram_length = 2, ngram_word_position = 'LEFT').display()

2025-06-04 13:56:24 - INFO - tokenize - Tokenization time: 0.00012 seconds
2025-06-04 13:56:24 - INFO - ngrams - Using cached ngrams results
2025-06-04 13:56:24 - INFO - ngrams - Ngrams report time: 0.00144 seconds


"Ngrams for ""dog""","Ngrams for ""dog""","Ngrams for ""dog"""
"Ngram length: 2, Token position: left","Ngram length: 2, Token position: left","Ngram length: 2, Token position: left"
Rank,Ngram,Frequency
1,dog .,13
2,"dog ,",9
3,dog in,4
4,dog world,3
5,dog owners,2
6,dog '',2
7,dog would,2
8,dog show,2
9,dog because,2
10,dog -,2


In [None]:
# report length-3 ngrams with 'the same' in the left position, 10 rows per page, and display the results
report_brown.ngrams('the same', ngram_length = 3, ngram_word_position = 'LEFT', page_size = 10).display()

2025-06-04 13:56:28 - INFO - tokenize - Tokenization time: 0.00012 seconds
2025-06-04 13:56:28 - INFO - ngrams - Using cached ngrams results
2025-06-04 13:56:28 - INFO - ngrams - Ngrams report time: 0.00123 seconds


"Ngrams for ""the same""","Ngrams for ""the same""","Ngrams for ""the same"""
"Ngram length: 3, Token position: left","Ngram length: 3, Token position: left","Ngram length: 3, Token position: left"
Rank,Ngram,Frequency
1,the same time,93
2,the same .,24
3,the same as,22
4,the same way,22
5,the same thing,19
6,"the same ,",11
7,the same manner,10
8,the same instant,7
9,the same period,6
10,the same amount,6


In [None]:
#| export
@patch
def ngram_frequencies(self: Ngrams, 
				ngram_length:int=2, # length of ngram
				case_insensitive:bool=True, # frequencies for tokens lowercased or with case preserved
				normalize_by:int=1000000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return
				page_current:int=1, # current page
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent ngrams.

	Ngrams are formed by shifting the chosen token index column, counted, and
	ranked by descending frequency. Ties are broken by token id so that paging
	is stable between calls. Raises ValueError if normalize_by is not an integer.
	"""
	
	# bool is an int subclass, so reject it explicitly to stay as strict as an exact type check
	if isinstance(normalize_by, bool) or not isinstance(normalize_by, int):
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')
	# NOTE(review): normalize_by is validated but not otherwise used in this method — confirm intent

	start_time = time.time()

	index = 'lower_index' if case_insensitive else 'orth_index'

	# ngrams containing any of these token ids are dropped (EOF_TOKEN is always excluded)
	excluded_tokens = [self.corpus.EOF_TOKEN]
	if exclude_punctuation:
		excluded_tokens += self.corpus.punct_tokens
	if exclude_spaces:
		excluded_tokens += self.corpus.space_tokens

	resultset_start = page_size*(page_current-1)

	token_columns = [f'token_{i+1}' for i in range(ngram_length)]

	# token_{i+1} holds the token i positions after the current row
	ngrams_report = self.corpus.tokens.with_columns([pl.col(index).shift(-i).alias(name) for i, name in enumerate(token_columns)])

	# filter before aggregating: the predicate only depends on the grouping keys,
	# so the surviving groups and their counts are the same as filtering afterwards
	for name in token_columns:
		ngrams_report = ngrams_report.filter(~pl.col(name).is_in(excluded_tokens))

	ngrams_report = ngrams_report.group_by(token_columns).agg(pl.len().alias("frequency"))

	# group_by output order is not guaranteed; token ids act as tie-breakers so equal
	# frequencies always come back in the same order and pages do not overlap
	ngrams_report = ngrams_report.sort(by=["frequency"] + token_columns, descending=[True] + [False] * ngram_length)

	ngrams_report_page = ngrams_report.slice(resultset_start, page_size).collect(engine = 'streaming')
	logger.info(f'collected report page: {(time.time() - start_time):.5f} seconds')
	token_strs = np.array([
		self.corpus.token_ids_to_tokens(ngrams_report_page.select(pl.col(name)).to_numpy().flatten())
		for name in token_columns
	])
	# transpose so each entry holds the tokens of one ngram, then join them into display text
	ngram_text = [' '.join(column) for column in token_strs.T]
	# rank continues across pages: the first row of page p is ranked (p-1)*page_size+1
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text)).with_row_index(name='rank', offset=resultset_start+1)

	ngrams_report_page = ngrams_report_page[['rank', 'ngram', 'frequency']]

	return Result(type = 'ngrams', df=ngrams_report_page, title='Ngram Frequencies', description=f'Ngram length: {ngram_length}', summary_data = {}, formatted_data = [])

In [None]:
# set the logger state to 'verbose'
set_logger_state('verbose')

In [None]:
# report the most frequent length-3 ngrams in the Brown corpus with case preserved
report_brown.ngram_frequencies(ngram_length = 3, case_insensitive = False).display()

2025-06-04 13:56:35 - INFO - ngram_frequencies - collected report page: 0.03307 seconds


Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Ngram length: 3,Ngram length: 3,Ngram length: 3
Rank,Ngram,Frequency
1,the United States,324
2,one of the,318
3,as well as,238
4,out of the,168
5,some of the,154
6,the fact that,150
7,the end of,148
8,part of the,140
9,to be a,130
10,of the United,129


In [None]:
#| hide
# load the US congressional speeches subset corpus and create a reporter for the timing checks below
congress = Corpus().load(f'{save_path}us-congressional-speeches-subset-100k.corpus')
report_congress = Ngrams(congress)
# presumably builds the token arrays up front so the timed calls below exclude that setup — confirm
congress._init_token_arrays()

2025-06-04 13:56:38 - INFO - memory_usage - init, memory usage: 2153.671875 MB
2025-06-04 13:56:38 - INFO - load - Load time: 0.231 seconds
2025-06-04 13:56:38 - INFO - _init_token_arrays - Created tokens_array in 0.040 seconds
2025-06-04 13:56:38 - INFO - _init_token_arrays - Created tokens_lookup in 0.030 seconds
2025-06-04 13:56:39 - INFO - _init_token_arrays - Created tokens_sort_order in 0.314 seconds


In [None]:
#| hide
# time the case-insensitive length-2 frequency report (first page) on the congressional corpus
%time report_congress.ngram_frequencies(ngram_length = 2, case_insensitive = True, page_current = 1).display()

2025-06-04 13:56:39 - INFO - ngram_frequencies - collected report page: 0.27940 seconds


Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Ngram length: 2,Ngram length: 2,Ngram length: 2
Rank,Ngram,Frequency
1,of the,227943
2,in the,114241
3,to the,92967
4,it is,51659
5,that the,51620
6,for the,46516
7,on the,43924
8,and the,43053
9,by the,40236
10,the senator,37269


CPU times: user 5.34 s, sys: 155 ms, total: 5.5 s
Wall time: 285 ms


In [None]:
#| hide
# time a length-2 ngram report with 'liberty' in the right position, 10 rows per page
%time report_congress.ngrams('liberty', ngram_length = 2, ngram_word_position = 'RIGHT', page_size = 10).display()

2025-06-04 13:56:40 - INFO - tokenize - Tokenization time: 0.00024 seconds
2025-06-04 13:56:40 - INFO - get_token_positions - Token indexing (1060) time: 0.08205 seconds
2025-06-04 13:56:40 - INFO - ngrams - Generating ngrams results
2025-06-04 13:56:40 - INFO - _get_ngrams - Ngrams (1060) retrieval time: 0.00048 seconds
2025-06-04 13:56:40 - INFO - ngrams - Ngrams report time: 0.08448 seconds


"Ngrams for ""liberty""","Ngrams for ""liberty""","Ngrams for ""liberty"""
"Ngram length: 2, Token position: right","Ngram length: 2, Token position: right","Ngram length: 2, Token position: right"
Rank,Ngram,Frequency
1,of liberty,227
2,the liberty,124
3,. liberty,99
4,at liberty,98
5,and liberty,62
6,for liberty,41
7,religious liberty,28
8,individual liberty,28
9,their liberty,18
10,radio liberty,15


CPU times: user 149 ms, sys: 37.1 ms, total: 186 ms
Wall time: 90.5 ms


In [None]:
#| hide
# run nbdev's export step
import nbdev; nbdev.nbdev_export()