# ngrams

> Functionality for ngram analysis.

In [None]:
#| default_exp ngrams

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from spacy.attrs import ORTH, LOWER # remove? - add ENT_TYPE, ENT_IOB
import math
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with expanduser rather than reading $HOME directly:
# os.environ.get("HOME") returns None when the variable is unset, which would
# silently produce paths such as 'None/data/'.
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

# locations of the pre-built test corpora loaded by the cells below
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
#| export
class Ngrams:
	""" Reporting interface for n-gram analysis of a corpus. """

	def __init__(
		self,
		corpus:Corpus, # Corpus instance the reports are computed from
	):
		# the report methods patched onto this class read everything through self.corpus
		self.corpus = corpus


In [None]:
#| hide
# load the toy test corpus and create the Ngrams instance used by the hidden checks below
toy = Corpus().load(path_to_toy_corpus)
ngrams_toy = Ngrams(toy)

2025-06-07 16:05:53 - INFO - memory_usage - init, memory usage: 1167.29296875 MB


2025-06-07 16:05:54 - INFO - load - Load time: 0.917 seconds


In [None]:
#| exporti
@patch
def _get_ngrams(self:Ngrams, 
			   token_sequence: list[np.ndarray], # token sequence to get index for 
			   index_id: int, # index to search (i.e. ORTH, LOWER)
			   token_positions: list[np.ndarray], # positions of token sequence, returned by get_token_positions 
			   ngram_length: int = 2, # length of ngram
			   ngram_token_position: str = 'LEFT', # specify if token sequence is on LEFT, RIGHT, or MIDDLE of ngrams (case-insensitive, any other value is treated as LEFT)
			   exclude_punctuation:bool=True, # exclude punctuation tokens (accepted but not applied by this method)
			   exclude_spaces:bool=True # exclude space tokens (accepted but not applied by this method)
			   ) -> np.ndarray: # array of ngrams results
	""" Get ngram data for a token sequence. 

	Returns a 2D array of token ids with one row per ngram slot and one column
	per retained match. A match is dropped when one of its context slots (a slot
	not covered by the query sequence itself) holds the corpus EOF token, or when
	its window would reach outside the corpus token array.

	Raises ValueError if `ngram_length` is less than 1.
	"""
	
	start_time = time.time()

	if ngram_length < 1:
		# an empty window cannot be stacked (np.stack raises ValueError); fail early with a clearer message
		raise ValueError('ngram_length must be at least 1')

	sequence_len = len(token_sequence[0])
	variants_len = len(token_sequence)
	match_positions = token_positions[0]
	token_index_len = len(match_positions)

	if index_id == ORTH:
		index = 'orth_index'
	else:
		index = 'lower_index'

	# offsets of every ngram slot, relative to the first token of the matched sequence
	position = str(ngram_token_position).upper()
	if position == 'RIGHT':
		ngram_range = range(-1 * ngram_length + sequence_len, sequence_len)
	elif position == 'MIDDLE':
		ngram_range = range(-1 * ngram_length + sequence_len + 1, sequence_len + 1)
	else:
		ngram_range = range(0, ngram_length)

	corpus_tokens = None # looked up lazily, at most once, instead of once per slot
	inside_corpus = np.ones(token_index_len, dtype=bool) # matches whose whole window lies within the token array
	ngrams = []
	
	for pos in ngram_range:
		if variants_len == 1 and pos > -1 and pos < sequence_len:
			# slot is covered by the query itself and there is a single variant: the token id is already known
			ngrams.append(np.full(token_index_len, token_sequence[0][pos]))
		else:
			if corpus_tokens is None:
				corpus_tokens = self.corpus.get_tokens_by_index(index)
			seq = match_positions + pos
			# a negative index would wrap around to the end of the array and an index past
			# the end would raise IndexError, so read a placeholder (index 0) for those
			# positions and remember to drop the affected matches below
			in_bounds = (seq >= 0) & (seq < len(corpus_tokens))
			inside_corpus &= in_bounds
			ngrams.append(corpus_tokens[np.where(in_bounds, seq, 0)])

	ngrams = np.stack(ngrams)

	# getting positions to search for EOF_TOKEN and filter out ngrams crossing doc boundaries
	positions = (np.array(ngram_range)[:, None] != np.arange(sequence_len)).all(axis=1)
	crosses_boundary = (ngrams[positions] == self.corpus.EOF_TOKEN).any(axis=0)
	ngrams = ngrams[:, inside_corpus & ~crosses_boundary]

	logger.info(f'Ngrams ({ngrams.shape[1]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return ngrams


In [None]:
#| hide
# query the toy corpus for 'dog' and print the raw token-id matrix of its 2-grams
# (first row: the query token, second row: the token that follows each match)
token_str = 'dog'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'LEFT')
print(ngrams)

2025-06-07 16:05:55 - INFO - _init_token_arrays - Created tokens_array in 0.002 seconds
2025-06-07 16:05:55 - INFO - _init_token_arrays - Created tokens_lookup in 0.000 seconds
2025-06-07 16:05:55 - INFO - _init_token_arrays - Created tokens_sort_order in 0.001 seconds
2025-06-07 16:05:55 - INFO - tokenize - Tokenization time: 0.00569 seconds
2025-06-07 16:05:55 - INFO - get_token_positions - Token indexing (3) time: 0.00168 seconds
2025-06-07 16:05:55 - INFO - _get_ngrams - Ngrams (3) retrieval time: 0.00042 seconds


[[11 11 11]
 [12  2  2]]


In [None]:
# load the Reuters corpus from its saved .corpus path
reuters = Corpus().load(path_to_reuters_corpus)

2025-06-07 16:05:55 - INFO - memory_usage - init, memory usage: 1167.421875 MB


2025-06-07 16:05:56 - INFO - load - Load time: 0.873 seconds


In [None]:
# instantiate the Ngrams class for the Reuters corpus
ngrams_reuters = Ngrams(reuters)

In [None]:
#| hide
# same check against the Reuters corpus: print the raw token-id matrix of the 2-grams starting with 'ocean'
# (first row: the query token, second row: the token that follows each match)
token_str = 'ocean'
token_sequence, index_id = reuters.tokenize(token_str, simple_indexing=True)
token_positions = reuters.get_token_positions(token_sequence, index_id)
ngrams = ngrams_reuters._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'LEFT')
print(ngrams)

2025-06-07 16:05:56 - INFO - _init_token_arrays - Created tokens_array in 0.024 seconds
2025-06-07 16:05:56 - INFO - _init_token_arrays - Created tokens_lookup in 0.033 seconds


2025-06-07 16:05:56 - INFO - _init_token_arrays - Created tokens_sort_order in 0.238 seconds
2025-06-07 16:05:56 - INFO - tokenize - Tokenization time: 0.30132 seconds
2025-06-07 16:05:56 - INFO - get_token_positions - Token indexing (29) time: 0.02131 seconds
2025-06-07 16:05:56 - INFO - _get_ngrams - Ngrams (29) retrieval time: 0.00022 seconds


[[48119 48119 48119 48119 48119 48119 48119 48119 48119 48119 48119 48119
  48119 48119 48119 48119 48119 48119 48119 48119 48119 48119 48119 48119
  48119 48119 48119 48119 48119]
 [13571 52710 46667 60529 46667 60529   936 71188 32924 72663 32924 72663
   9490 43372 26836 12053 53521 11720 13571 11720 11720 52211 13571 40736
  10743 67894 12439 65912 75228]]


In [None]:
#| export
@patch
def ngrams(self: Ngrams, 
		   token_str: str, # token string to get ngrams for 
		   ngram_length:int = 2, # length of ngram
		   ngram_token_position:str = 'LEFT', # specify if token sequence is on LEFT, RIGHT, or MIDDLE of ngrams (case-insensitive)
		   normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
		   page_size:int = PAGE_SIZE, # number of results to display per results page, 0 returns all results on a single page
		   page_current:int = 1, # current page of results
		   show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
		   exclude_punctuation:bool=True, # exclude punctuation tokens (passed to the corpus token count used for normalization)
		   exclude_spaces:bool=True, # exclude space tokens (passed to the corpus token count used for normalization)
		   use_cache:bool = True # retrieve the results from cache if available
		   ) -> Result: # return a Result object with ngram data
	""" Report ngrams for a token string. 

	The full frequency table for a query is stored in `self.corpus.results_cache`
	(keyed by token sequence, ngram length and token position); only the requested
	page is converted to token strings and returned.

	Raises ValueError if `normalize_by` is not an int or if `ngram_token_position`
	is not one of LEFT, RIGHT or MIDDLE.
	"""

	if type(normalize_by) != int:
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	# _get_ngrams treats any unrecognised position as LEFT, so a typo or a lower-case value
	# would silently produce a LEFT report labelled with the wrong position
	ngram_token_position = str(ngram_token_position).upper()
	if ngram_token_position not in ('LEFT', 'RIGHT', 'MIDDLE'):
		raise ValueError('ngram_token_position must be one of LEFT, RIGHT or MIDDLE')

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_token_position])

	if use_cache == True and cache_id in self.corpus.results_cache:
		logger.info('Using cached ngrams results')
		ngrams_report = self.corpus.results_cache[cache_id][0]
		total_unique = self.corpus.results_cache[cache_id][1]
		total_count = self.corpus.results_cache[cache_id][2]
	else:
		token_positions = self.corpus.get_token_positions(token_sequence, index_id)
		
		if len(token_positions[0]) == 0:
			logger.info('No tokens found')
			return Result(type = 'ngrams', df=pl.DataFrame(), title=f'Ngrams for "{token_str}"', description=f'No matches', summary_data={}, formatted_data=[])

		logger.info('Generating ngrams results')
		ngram_ids = self._get_ngrams(token_sequence, index_id, token_positions, ngram_length = ngram_length, ngram_token_position = ngram_token_position)
		total_count = ngram_ids.shape[1] # every retained occurrence
		schema = [f'token_{i+1}' for i in range(ngram_length)]
		# one struct per ngram occurrence, counted and sorted by descending frequency
		ngrams_report = pl.DataFrame(ngram_ids.T, schema=schema).to_struct(name = 'ngram_token_ids').value_counts(sort=True).rename({"count": "frequency"})
		# rank is assigned before paging so it continues across pages
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		total_unique = len(ngrams_report) # distinct ngrams, i.e. rows available for paging
		self.corpus.results_cache[cache_id] = (ngrams_report, total_unique, total_count)
	
	count_tokens, tokens_descriptor, total_descriptor = self.corpus.get_token_count_text(exclude_punctuation, exclude_spaces)

	# NOTE(review): page_size == 0 is treated as "no paging" (all rows on one page). The paging
	# guard below already special-cased 0, but the page count previously divided by it — confirm intent.
	resultset_start = page_size*(page_current-1)
	total_pages = math.ceil(total_unique/page_size) if page_size != 0 else 1

	# get specific chunk of report into polars based on resultset_start:
	ngrams_report_page = ngrams_report.slice(resultset_start, page_size if page_size != 0 else None).unnest('ngram_token_ids')
	ngrams_report_page = ngrams_report_page.with_columns(((pl.col("frequency") / pl.lit(count_tokens)) * normalize_by).alias('normalized_frequency'))

	# convert the token ids of the current page only into display strings
	token_strs = []
	for i in range(ngram_length):
		token_strs.append(self.corpus.token_ids_to_tokens(ngrams_report_page[f'token_{i+1}'].to_numpy()))
	token_strs = np.array(token_strs)
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text))

	summary_data = {'ngram_length': ngram_length, 'ngram_token_position': ngram_token_position, 'total_unique': total_unique, 'total_count': total_count, 'page_current': page_current, 'total_pages': total_pages}
	formatted_data = [f'Report based on {tokens_descriptor}', f'Ngram length: {ngram_length}, Token position: {ngram_token_position.lower()}']
	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')
	formatted_data.extend([f'Total unique ngrams: {total_unique:,}', f'Total ngrams: {total_count:,}'])

	# pages are built over unique ngrams, so paging info depends on total_unique (not on the
	# number of occurrences) and the row count is that of the page actually returned
	if page_size != 0 and total_unique > page_size:
		formatted_data.extend([f'Showing {len(ngrams_report_page)} rows', f'Page {page_current} of {total_pages}']) 

	if show_all_columns == False:
		ngrams_report_page = ngrams_report_page[['rank', 'ngram', 'frequency', 'normalized_frequency']]
	
	logger.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'ngrams', df=ngrams_report_page, title=f'Ngrams for "{token_str}"', description=f'{self.corpus.name}', summary_data=summary_data, formatted_data=formatted_data)


In [None]:
# report the 2-grams that start with "environmental" (query token on the LEFT) and display the results
ngrams_reuters.ngrams('environmental', ngram_length = 2, ngram_token_position = 'LEFT').display()

2025-06-07 16:05:57 - INFO - tokenize - Tokenization time: 0.00030 seconds
2025-06-07 16:05:57 - INFO - get_token_positions - Token indexing (34) time: 0.00464 seconds
2025-06-07 16:05:57 - INFO - ngrams - Generating ngrams results
2025-06-07 16:05:57 - INFO - _get_ngrams - Ngrams (34) retrieval time: 0.00024 seconds
2025-06-07 16:05:57 - INFO - ngrams - Ngrams report time: 0.01103 seconds


"Ngrams for ""environmental""","Ngrams for ""environmental""","Ngrams for ""environmental""","Ngrams for ""environmental"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,environmental systems,4,0.03
2,environmental,4,0.03
3,environmental protection,3,0.02
4,environmental damage,2,0.01
5,"environmental ,",2,0.01
6,environmental impact,2,0.01
7,environmental controls,1,0.01
8,environmental approval,1,0.01
9,environmental and,1,0.01
10,environmental sciences,1,0.01


In [None]:
# a multi-token query: report the 3-grams that start with "the highest", 10 results per page
ngrams_reuters.ngrams('the highest', ngram_length = 3, ngram_token_position = 'LEFT', page_size = 10).display()

2025-06-07 16:05:57 - INFO - tokenize - Tokenization time: 0.00041 seconds
2025-06-07 16:05:57 - INFO - get_token_positions - Token indexing (42) time: 0.02762 seconds
2025-06-07 16:05:57 - INFO - ngrams - Generating ngrams results
2025-06-07 16:05:57 - INFO - _get_ngrams - Ngrams (42) retrieval time: 0.00026 seconds
2025-06-07 16:05:57 - INFO - ngrams - Ngrams report time: 0.03274 seconds


"Ngrams for ""the highest""","Ngrams for ""the highest""","Ngrams for ""the highest""","Ngrams for ""the highest"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the highest,8,0.06
2,the highest since,5,0.04
3,the highest in,3,0.02
4,the highest level,2,0.01
5,the highest such,2,0.01
6,the highest positive,2,0.01
7,the highest depreciation,1,0.01
8,the highest rises,1,0.01
9,the highest set,1,0.01
10,the highest daily,1,0.01


In [None]:
#| export
@patch
def ngram_frequencies(self: Ngrams, 
				ngram_length:int=2, # length of ngram
				case_sensitive:bool=False, # frequencies for tokens lowercased or with case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return
				page_current:int=1, # current page
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report the most frequent ngrams of a given length across the corpus. 

	Only the requested page is collected and converted to token strings. Totals
	(unique ngrams, total ngrams, page count) are not computed, so the returned
	Result has an empty `summary_data`.

	Raises ValueError if `normalize_by` is not an int.
	"""
	
	if type(normalize_by) != int:
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	start_time = time.time()

	# column of the corpus token table to count over
	index = 'orth_index' if case_sensitive else 'lower_index'

	# an ngram is dropped if any of its slots holds one of these token ids
	excluded_token_ids = [self.corpus.EOF_TOKEN]
	if exclude_punctuation == True:
		excluded_token_ids += self.corpus.punct_tokens
	if exclude_spaces == True:
		excluded_token_ids += self.corpus.space_tokens

	first_row = page_size*(page_current-1)

	count_tokens, tokens_descriptor, total_descriptor = self.corpus.get_token_count_text(exclude_punctuation, exclude_spaces)
	formatted_data = [f'Report based on {tokens_descriptor}', f'Ngram length: {ngram_length}']

	# token_1 is the token itself, token_2 the token one row later, and so on
	token_columns = [f'token_{i+1}' for i in range(ngram_length)]
	shifted_tokens = [pl.col(index).shift(-offset).alias(column) for offset, column in enumerate(token_columns)]

	# count every distinct combination of consecutive tokens, most frequent first
	ngrams_report = (
		self.corpus.tokens
		.with_columns(shifted_tokens)
		.group_by(token_columns)
		.agg(pl.len().alias("frequency"))
		.sort(by="frequency", descending=True)
	)
	for column in token_columns:
		ngrams_report = ngrams_report.filter(~pl.col(column).is_in(excluded_token_ids))

	ngrams_report_page = ngrams_report.slice(first_row, page_size).collect(engine = 'streaming')
	logger.info(f'collected report page: {(time.time() - start_time):.5f} seconds')

	# one array of token strings per ngram slot, joined row-wise into the display text
	token_strs = np.array([
		self.corpus.token_ids_to_tokens(ngrams_report_page.select(pl.col(column)).to_numpy().flatten())
		for column in token_columns
	])
	ngram_text = [' '.join(parts) for parts in token_strs.T]

	# rank continues from the previous page
	ngrams_report_page = (
		ngrams_report_page
		.with_columns(pl.Series(name="ngram", values=ngram_text))
		.with_row_index(name='rank', offset=first_row+1)
		.with_columns(((pl.col("frequency") / pl.lit(count_tokens)) * normalize_by).alias('normalized_frequency'))
	)
	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	ngrams_report_page = ngrams_report_page.select(['rank', 'ngram', 'frequency', 'normalized_frequency'])

	return Result(type = 'ngrams', df=ngrams_report_page, title=f'Ngram Frequencies', description=f'{self.corpus.name}', summary_data = {}, formatted_data = formatted_data)

In [None]:
# set the conc logger state to 'verbose' for the cells that follow
set_logger_state('verbose')

In [None]:
# report the most frequent 3-grams in the Reuters corpus with case preserved and display the first page
ngrams_reuters.ngram_frequencies(ngram_length = 3, case_sensitive = True).display()

2025-06-07 16:13:23 - INFO - ngram_frequencies - collected report page: 0.12534 seconds


Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,The company said,741,5.30
2,cts vs loss,644,4.60
3,mln dlrs in,624,4.46
4,pct of the,539,3.85
5,said it has,529,3.78
6,the end of,444,3.17
7,dlrs a share,441,3.15
8,cts a share,430,3.07
9,the United States,418,2.99
10,Inc said it,412,2.95


In [None]:
#| hide
# load the US congressional speeches subset and create its Ngrams instance
congress = Corpus().load(f'{save_path}us-congressional-speeches-subset-100k.corpus')
report_congress = Ngrams(congress)
# build the token arrays up front — presumably so the timings below exclude this one-off cost (TODO confirm)
congress._init_token_arrays()

2025-06-07 16:05:59 - INFO - memory_usage - init, memory usage: 1181.33203125 MB
2025-06-07 16:06:00 - INFO - load - Load time: 0.843 seconds
2025-06-07 16:06:00 - INFO - _init_token_arrays - Created tokens_array in 0.078 seconds
2025-06-07 16:06:00 - INFO - _init_token_arrays - Created tokens_lookup in 0.101 seconds
2025-06-07 16:06:02 - INFO - _init_token_arrays - Created tokens_sort_order in 1.822 seconds


In [None]:
#| hide
# time the 2-gram frequency report (lowercased index, first page) on the congressional corpus
%time report_congress.ngram_frequencies(ngram_length = 2, case_sensitive = False, page_current = 1).display()

2025-06-07 16:06:04 - INFO - ngram_frequencies - collected report page: 1.94683 seconds


Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Ngram length: 2,Ngram length: 2,Ngram length: 2
Rank,Ngram,Frequency
1,of the,227943
2,in the,114241
3,to the,92967
4,it is,51659
5,that the,51620
6,for the,46516
7,on the,43924
8,and the,43053
9,by the,40236
10,the senator,37269


CPU times: user 8.39 s, sys: 277 ms, total: 8.66 s
Wall time: 1.96 s


In [None]:
#| hide
# time a query report on the congressional corpus: 2-grams that end with 'liberty' (query token on the RIGHT)
%time report_congress.ngrams('liberty', ngram_length = 2, ngram_token_position = 'RIGHT', page_size = 10).display()

2025-06-07 16:06:04 - INFO - tokenize - Tokenization time: 0.00031 seconds
2025-06-07 16:06:05 - INFO - get_token_positions - Token indexing (1060) time: 0.56575 seconds
2025-06-07 16:06:05 - INFO - ngrams - Generating ngrams results
2025-06-07 16:06:05 - INFO - _get_ngrams - Ngrams (1060) retrieval time: 0.00216 seconds
2025-06-07 16:06:05 - INFO - ngrams - Ngrams report time: 0.57616 seconds


"Ngrams for ""liberty""","Ngrams for ""liberty""","Ngrams for ""liberty""","Ngrams for ""liberty"""
US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k
Rank,Ngram,Frequency,Normalized Frequency
1,of liberty,227,0.13
2,the liberty,124,0.07
3,. liberty,99,0.05
4,at liberty,98,0.05
5,and liberty,62,0.03
6,for liberty,41,0.02
7,religious liberty,28,0.02
8,individual liberty,28,0.02
9,their liberty,18,0.01
10,radio liberty,15,0.01


CPU times: user 639 ms, sys: 340 ms, total: 980 ms
Wall time: 596 ms


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()