# frequency

> Functionality for frequency analysis.
- toc: false
- page-layout: full

In [None]:
#| default_exp frequency

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
import polars as pl
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.listcorpus import ListCorpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| hide
from conc.core import set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
# Local data locations used by the hidden test cells in this notebook.
source_path = '{}/data/'.format(os.environ.get("HOME"))
save_path = source_path + 'conc-test-corpora/'

# Saved corpora, all stored under save_path.
path_to_toy_corpus = save_path + 'toy.corpus'
path_to_brown_corpus = save_path + 'brown.corpus'
path_to_brown_listcorpus = save_path + 'brown.listcorpus'
path_to_reuters_corpus = save_path + 'reuters.corpus'

In [None]:
#| export
class Frequency:
	""" Frequency analysis reports for a corpus. """

	def __init__(
		self,
		corpus: Corpus | ListCorpus, # Corpus or ListCorpus instance to report on
	):
		# only a reference is stored here, nothing is computed at construction
		self.corpus = corpus


In [None]:
#| hide
# preview the first five rows of the vocab table, the data structure frequencies() reads its counts from
brown = Corpus().load(path_to_brown_corpus)
display(brown.vocab.head(5).collect(engine='streaming'))

rank,tokens_sort_order,token_id,token,frequency_lower,frequency_orth,is_punct,is_space
1,50087,22848,"""the""",63516,62473,False,False
2,28,8128,""",""",58331,58331,True,False
3,41,38309,""".""",49907,49907,True,False
4,35232,2739,"""of""",36321,36122,False,False
5,3351,7126,"""and""",27787,27633,False,False


In [None]:
#| export
@patch
def frequencies(self: Frequency,
				case_sensitive:bool=False, # frequencies for tokens with or without case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
				page_current:int=1, # current page, ignored if page_size is 0
				show_token_id:bool=False, # show token_id in output
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]=[], # exclude specific tokens from frequency report, can be used to remove stopwords
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]=[], # restrict frequency report to return frequencies for a list of specific tokens
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True # exclude punctuation tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens.

	Tokens are ranked by descending frequency and returned one page at a time. Whitespace
	tokens are always removed. Normalized frequency is the token frequency divided by the
	corpus token count and multiplied by `normalize_by`; excluding or restricting tokens
	does not change that denominator.

	Raises ValueError if `normalize_by` is not an int, if `page_size` is negative, or if
	`page_current` is less than 1 when paging is in use.
	"""

	# exact type check on purpose: bool is a subclass of int and is rejected here too
	if type(normalize_by) is not int:
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	# negative values would otherwise reach slice() / with_row_index() as negative offsets
	if page_size < 0:
		raise ValueError('page_size must be 0 (return all rows) or a positive integer')
	if page_size == 0:
		page_current = 1 # if returning all, then only interested in first page
	elif page_current < 1:
		raise ValueError('page_current must be 1 or greater')

	start_time = time.time()

	# the vocab holds separate counts for case-preserved (orth) and lowercased tokens, and the
	# tokens table has a matching index column that is used for document counts
	if case_sensitive:
		frequency_column = 'frequency_orth'
		document_count_column = 'orth_index'
	else:
		frequency_column = 'frequency_lower'
		document_count_column = 'lower_index'

	columns = ['rank', 'token_id', 'token', 'frequency']

	count_tokens, tokens_descriptor, total_descriptor = self.corpus.get_token_count_text(exclude_punctuation)

	# notes shown with the report, in display order
	formatted_data = [f'Report based on {tokens_descriptor}']

	df = self.corpus.vocab.filter(pl.col(frequency_column).is_not_null())
	# ALWAYS remove spaces!
	df = df.filter(pl.col('is_space') == False)

	# the list defaults in the signature are only read, never mutated
	if exclude_tokens:
		is_excluded = pl.col('token').is_in(exclude_tokens)
		excluded_tokens_count = df.filter(is_excluded).select(pl.len()).collect(engine='streaming').item()
		df = df.filter(~is_excluded)
		if exclude_tokens_text == '':
			formatted_data.append(f'Tokens excluded from report: {excluded_tokens_count}')
		else:
			formatted_data.append(f'{exclude_tokens_text}')
	if restrict_tokens:
		df = df.filter(pl.col('token').is_in(restrict_tokens))
		# only add a note when there is text to show, an empty entry adds nothing to the report
		if restrict_tokens_text != '':
			formatted_data.append(f'{restrict_tokens_text}')

	if exclude_punctuation:
		df = df.filter(pl.col('is_punct') == False)

	# number of rows in the full report, counted before paging
	unique_tokens = df.select(pl.len()).collect(engine='streaming').item()

	# stable sort: tokens with equal frequency keep their incoming order, so the same token
	# cannot move between pages from one call to the next
	df = df.sort(by = frequency_column, descending=True, maintain_order=True)

	first_row = (page_current-1) * page_size # 0 when returning all rows
	if page_size != 0:
		df = df.slice(first_row, page_size)
	df = df.rename({frequency_column: "frequency"}).select(*columns)
	rank_offset = first_row + 1

	if show_document_frequency:
		# number of distinct documents each token id occurs in
		document_counts = self.corpus.tokens.select(pl.col(document_count_column).alias('token_id'), pl.col('token2doc_index')).group_by('token_id').agg(pl.col('token2doc_index').n_unique().alias('document_frequency'))
		df = df.join(document_counts, on='token_id', how='left', maintain_order='left')

	df = df.with_columns(((pl.col("frequency") / count_tokens) * normalize_by).alias('normalized_frequency'))

	# the vocab rank covers every token, so re-rank the rows that survived filtering
	df = df.drop('rank').with_row_index(name='rank', offset=rank_offset)

	if not show_token_id:
		df = df.drop('token_id')

	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	formatted_data.append(f'{total_descriptor}: {count_tokens:,.0f}')

	formatted_data.append(f'Unique {tokens_descriptor}: {unique_tokens:,.0f}')
	if page_size != 0 and unique_tokens > page_size:
		# ceiling division, so an exact multiple of page_size does not report an extra empty page
		total_pages = -(-unique_tokens // page_size)
		formatted_data.append(f'Showing {page_size} rows')
		formatted_data.append(f'Page {page_current} of {total_pages}')

	logger.info(f'Frequencies report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'frequencies', df=df, title='Frequencies', description=f'Frequencies of {tokens_descriptor}, {self.corpus.name}', summary_data={}, formatted_data=formatted_data)


In [None]:
# load the corpus
brown = Corpus().load(path_to_brown_corpus)

In [None]:
#| hide
# load the Brown ListCorpus, used further down to check Frequency also works with a ListCorpus
brown_listcorpus = ListCorpus().load(path_to_brown_listcorpus)

In [None]:
#| hide
# load the toy corpus, used by the hidden assertion tests for frequencies()
toy = Corpus().load(path_to_toy_corpus)

In [None]:
#| hide
# the summaries report the token counts the frequency tests rely on (e.g. word token count for normalization)
toy.summary()
brown.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Toy Corpus
Description,Toy corpus is a very small dataset for testing and library development.
Date Created,2025-07-01 13:05:37
Conc Version,0.1.5
Corpus Path,/home/geoff/data/conc-test-corpora/toy.corpus
Document Count,6
Token Count,38
Word Token Count,32
Unique Tokens,15
Unique Word Tokens,14


Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/."
Date Created,2025-07-01 13:11:26
Conc Version,0.1.5
Corpus Path,/home/geoff/data/conc-test-corpora/brown.corpus
Document Count,500
Token Count,1138566
Word Token Count,980144
Unique Tokens,42930
Unique Word Tokens,42907


In [None]:
# instantiate the Frequency class
freq_brown = Frequency(brown)

In [None]:
#| hide
from conc.core import show_toy_corpus

In [None]:
#| hide
# show the toy corpus source CSV, the texts are short enough to count tokens by hand
show_toy_corpus(f'{source_path}toy.csv')



source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog


In [None]:
#| hide
# check frequencies() against values that can be counted by hand in the toy corpus
freq_toy = Frequency(toy)
# column name -> list of values for the whole report (page_size=20 is larger than the report)
frequencies_test = freq_toy.frequencies(normalize_by=100, show_document_frequency = True, page_size=20).to_frame().to_dict(as_series=False)
normalized_frequencies_test = dict(zip(frequencies_test['token'], frequencies_test['normalized_frequency']))
document_frequencies_test = dict(zip(frequencies_test['token'], frequencies_test['document_frequency']))
# note: frequencies_test is rebound here, from this line on it maps token -> frequency
frequencies_test = dict(zip(frequencies_test['token'], frequencies_test['frequency']))
# 'is' occurs once in each of four toy documents
assert frequencies_test['is'] == 4
# 4 occurrences out of the 32 word tokens in the toy corpus, per 100 tokens
assert normalized_frequencies_test['is'] == ((4/32) * 100)
assert document_frequencies_test['is'] == 4

# time the same report and display it
%time freq_toy.frequencies(normalize_by=100, show_document_frequency = True, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Toy Corpus","Frequencies of word tokens, Toy Corpus","Frequencies of word tokens, Toy Corpus","Frequencies of word tokens, Toy Corpus","Frequencies of word tokens, Toy Corpus"
Rank,Token,Frequency,Document Frequency,Normalized Frequency
1,the,8,6,25.00
2,is,4,4,12.50
3,cat,3,3,9.38
4,dog,3,3,9.38
5,a,2,2,6.25
6,mat,2,2,6.25
7,on,2,2,6.25
8,sat,2,2,6.25
9,barking,1,1,3.12
10,climbing,1,1,3.12


CPU times: user 19.3 ms, sys: 6.92 ms, total: 26.3 ms
Wall time: 17.8 ms


In [None]:
# run the frequencies method and display the results
freq_brown.frequencies(normalize_by=10000, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,63516,648.03
2,of,36321,370.57
3,and,27787,283.50
4,to,25868,263.92
5,a,22190,226.40
6,in,19751,201.51
7,that,10409,106.20
8,is,10138,103.43
9,was,9931,101.32
10,for,8905,90.85


In [None]:
#| hide
# run the same report from a ListCorpus, the table is expected to match the Corpus report above
freq_brown_listcorpus = Frequency(brown_listcorpus)
freq_brown_listcorpus.frequencies(normalize_by=10000, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,63516,648.03
2,of,36321,370.57
3,and,27787,283.50
4,to,25868,263.92
5,a,22190,226.40
6,in,19751,201.51
7,that,10409,106.20
8,is,10138,103.43
9,was,9931,101.32
10,for,8905,90.85


In [None]:
#| hide
# test page 2: with page_size=20 the ranks should continue from 21
freq_brown.frequencies(normalize_by=10000, page_current = 2, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Normalized Frequency
21,at,4990,50.91
22,not,4726,48.22
23,are,4391,44.80
24,i,4370,44.59
25,from,4228,43.14
26,or,4100,41.83
27,this,4037,41.19
28,have,3950,40.30
29,an,3570,36.42
30,which,3546,36.18


In [None]:
from conc.core import get_stop_words

In [None]:
# get a list of stop words, exclude them from the report and add a document frequency column
stop_words = get_stop_words(save_path, spacy_model = 'en_core_web_sm')
freq_brown.frequencies(normalize_by=10000, show_document_frequency = True, exclude_tokens = stop_words, page_size=20).display()

Frequencies,Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Document Frequency,Normalized Frequency
1,said,1944,315,19.83
2,time,1667,450,17.01
3,new,1595,390,16.27
4,man,1346,326,13.73
5,like,1287,366,13.13
6,af,989,49,10.09
7,years,953,346,9.72
8,way,925,365,9.44
9,state,883,200,9.01
10,long,863,354,8.80


In [None]:
#| hide
# congress = Corpus().load(f'{save_path}us-congressional-speeches-subset-100k.corpus')
# freq_congress = Frequency(congress)
# %time freq_congress.frequencies(normalize_by=10000, page_size=20).display()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()