# frequency

> Functionality for frequency analysis.

In [None]:
#| default_exp frequency

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
import polars as pl
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE, set_logger_state

In [None]:
#| hide
# directory holding the saved test corpora (relative to the notebook)
save_path = '../test-corpora/saved/'
# build the corpus paths from save_path so the location is defined in one place
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'

In [None]:
#| export
class Frequency:
	""" Frequency analysis reports for a corpus. """

	def __init__(
		self,
		corpus: Corpus, # Corpus instance
	):
		# keep a reference to the corpus; report methods read it through self.corpus
		self.corpus = corpus


In [None]:
#| hide
# now working from new vocab data structure
# load the Brown corpus and show the first five rows of its vocab (collected from a lazy query),
# the same table the frequencies report reads from
brown = Corpus().load(path_to_brown_corpus)
display(brown.vocab.head(5).collect(engine='streaming'))

rank,token_id,token,frequency_lower,frequency_orth,is_punct,is_space
1,22848,"""the""",63516,62473,False,False
2,8128,""",""",58331,58331,True,False
3,38309,""".""",49907,49907,True,False
4,2739,"""of""",36321,36122,False,False
5,7126,"""and""",27787,27633,False,False


In [None]:
#| export
@patch
def frequencies(self: Frequency,
				case_insensitive:bool=True, # frequencies for tokens lowercased or with case preserved
				normalize_by:int=1000000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return
				page_current:int=1, # current page
				show_token_id:bool=False, # show token_id in output
				exclude_tokens:list[str]=[], # exclude specific tokens from frequency report, can be used to remove stopwords
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]=[], # restrict frequency report to return frequencies for a list of specific tokens
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens, one page at a time, with raw and normalized frequencies.

	Raises ValueError if normalize_by is not an integer, or if page_size or page_current is less than 1.
	"""

	# exact type check (not isinstance) so that bool values are rejected as well
	if type(normalize_by) != int:
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')
	# paging is 1-based; zero or negative values would give a negative offset or divide by zero below
	if page_size < 1 or page_current < 1:
		raise ValueError('page_size and page_current must be 1 or greater')

	start_time = time.time()

	# vocab column holding the counts for the requested case handling
	frequency_column = 'frequency_lower' if case_insensitive else 'frequency_orth'

	# output columns; rank is not selected here because it is re-created after paging
	columns = ['token_id', 'token', 'frequency'] if show_token_id else ['token', 'frequency']

	# the denominator for normalization depends on which token classes are reported;
	# it is not reduced by exclude_tokens / restrict_tokens
	count_tokens = self.corpus.token_count
	tokens_descriptor = 'all tokens'
	total_descriptor = 'Total tokens'
	if exclude_punctuation and exclude_spaces:
		count_tokens = self.corpus.word_token_count
		tokens_descriptor = 'word tokens'
		total_descriptor = 'Total word tokens'
	elif exclude_punctuation:
		space_tokens_count = self.corpus.spaces.select(pl.len()).collect(engine='streaming').item()
		count_tokens = self.corpus.word_token_count + space_tokens_count
		tokens_descriptor = 'word and space tokens'
		total_descriptor = 'Total word and space tokens'
	elif exclude_spaces:
		punct_tokens_count = self.corpus.puncts.select(pl.len()).collect(engine='streaming').item()
		count_tokens = self.corpus.word_token_count + punct_tokens_count
		tokens_descriptor = 'word and punctuation tokens'
		total_descriptor = 'Total word and punctuation tokens'

	# notes shown with the report
	formatted_data = [f'Report based on {tokens_descriptor}']

	# only vocab rows that have a count for the chosen case handling
	df = self.corpus.vocab.filter(pl.col(frequency_column).is_not_null())
	if exclude_tokens:
		# counted before the punctuation/space filters are applied
		excluded_tokens_count = df.filter(pl.col('token').is_in(exclude_tokens)).select(pl.len()).collect(engine='streaming').item()
		df = df.filter(~pl.col('token').is_in(exclude_tokens))
		if exclude_tokens_text == '':
			formatted_data.append(f'Tokens excluded from report: {excluded_tokens_count}')
		else:
			formatted_data.append(f'{exclude_tokens_text}')
	if restrict_tokens:
		df = df.filter(pl.col('token').is_in(restrict_tokens))
		if restrict_tokens_text == '':
			# NOTE(review): an empty note is added when no explanatory text is supplied;
			# kept as-is, confirm whether a default message was intended here
			formatted_data.append('')
		else:
			formatted_data.append(f'{restrict_tokens_text}')

	if exclude_punctuation:
		df = df.filter(pl.col('is_punct') == False)
	if exclude_spaces:
		df = df.filter(pl.col('is_space') == False)

	df = df.sort(by = frequency_column, descending=True)

	# number of distinct tokens left after all filters (before paging)
	unique_tokens = df.select(pl.len()).collect(engine='streaming').item()

	# slice takes (offset, length): the length is one page of rows, whatever the page number
	page_offset = (page_current - 1) * page_size
	df = df.slice(page_offset, page_size).rename({frequency_column: "frequency"}).select(*columns)

	# normalized frequency = occurrences per normalize_by tokens
	df = df.with_columns(((pl.col("frequency") / count_tokens) * normalize_by).alias('normalized_frequency'))

	# rank within the filtered, sorted report, continuing across pages
	df = df.with_row_index(name='rank', offset=page_offset + 1)

	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	formatted_data.append(f'{total_descriptor}: {count_tokens:,.0f}')

	formatted_data.append(f'Unique tokens: {unique_tokens:,.0f}')
	if unique_tokens > page_size:
		# ceiling division, so an exact multiple of page_size does not report an extra empty page
		total_pages = (unique_tokens + page_size - 1) // page_size
		formatted_data.append(f'Showing {page_size} rows')
		formatted_data.append(f'Page {page_current} of {total_pages}')

	logger.info(f'Frequencies report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'frequencies', df=df.collect(engine='streaming'), title='Frequencies', description='Frequencies of tokens in the corpus', summary_data={}, formatted_data=formatted_data)


In [None]:
# load the Brown corpus from its saved path
brown = Corpus().load(path_to_brown_corpus)

In [None]:
#| hide
# load the toy corpus used by the hidden check cells
toy = Corpus().load(path_to_toy_corpus)

In [None]:
#| hide
# print both corpora to check what was loaded
print(toy)
print(brown)

┌────────────────────┬──────────────────────────────────┐
│ Attribute          ┆ Value                            │
╞════════════════════╪══════════════════════════════════╡
│ Name               ┆ Toy Corpus                       │
│ Description        ┆ Toy corpus for testing           │
│ Date Created       ┆ 2025-05-29 21:12:13              │
│ Conc Version       ┆ 0.0.1                            │
│ Corpus Path        ┆ ../test-corpora/saved/toy.corpus │
│ Document Count     ┆ 6                                │
│ Token Count        ┆ 38                               │
│ Word Token Count   ┆ 32                               │
│ Unique Tokens      ┆ 15                               │
│ Unique Word Tokens ┆ 14                               │
└────────────────────┴──────────────────────────────────┘
┌────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [None]:
# instantiate the Frequency class for the Brown corpus
report_brown = Frequency(brown)

In [None]:
#| hide
# build a report object for the toy corpus and time one frequencies call
report_toy = Frequency(toy)
%time report_toy.frequencies(normalize_by=10000, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,8,2500.00
2,is,4,1250.00
3,cat,3,937.50
4,dog,3,937.50
5,on,2,625.00
6,sat,2,625.00
7,mat,2,625.00
8,a,2,625.00
9,climbing,1,312.50
10,barking,1,312.50


CPU times: user 23.1 ms, sys: 10.8 ms, total: 33.9 ms
Wall time: 18.4 ms


In [None]:
# run the frequencies method (normalized per 10,000 tokens, page size of 20) and display the results
report_brown.frequencies(normalize_by=10000, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,63516,648.03
2,of,36321,370.57
3,and,27787,283.50
4,to,25868,263.92
5,a,22190,226.40
6,in,19751,201.51
7,that,10409,106.20
8,is,10138,103.43
9,was,9931,101.32
10,for,8905,90.85


In [None]:
# get a stop word list and pass it as exclude_tokens to leave those tokens out of the report
from conc.core import get_stop_words
stop_words = get_stop_words(save_path, spacy_model = 'en_core_web_sm')
report_brown.frequencies(normalize_by=10000, exclude_tokens = stop_words, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,said,1944,19.83
2,time,1667,17.01
3,new,1595,16.27
4,man,1346,13.73
5,like,1287,13.13
6,af,989,10.09
7,years,953,9.72
8,way,925,9.44
9,state,883,9.01
10,long,863,8.80


In [None]:
#| hide
# time the same stop-word-filtered report on the US congressional speeches subset corpus
congress = Corpus().load('../test-corpora/saved/us-congressional-speeches-subset-500k.corpus')
report_congress = Frequency(congress)
%time report_congress.frequencies(normalize_by=10000, page_size=20, exclude_tokens = stop_words).display()

Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,mr.,446869,49.46
2,bill,348577,38.58
3,senator,264478,29.28
4,committee,243140,26.91
5,president,235712,26.09
6,time,201359,22.29
7,amendment,198132,21.93
8,states,174178,19.28
9,house,166762,18.46
10,gentleman,159877,17.70


CPU times: user 213 ms, sys: 190 ms, total: 403 ms
Wall time: 111 ms


In [None]:
#| hide
# run nbdev's export step for this project
import nbdev; nbdev.nbdev_export()