# frequency

> Functionality for frequency analysis.

In [None]:
#| default_exp frequency

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import time
import polars as pl
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| export
class Frequency:
	""" Frequency analysis reports for a corpus. """
	def __init__(
		self,
		corpus:Corpus, # Corpus instance
	):
		# hold a reference to the corpus; the frequencies report reads its data via self.corpus
		self.corpus = corpus


In [None]:
#| export
@patch
def frequencies(self: Frequency,
				n:int=PAGE_SIZE, # number of rows to return, a falsy value (e.g. 0 or None) returns all rows
				normalize_by:int=1000000, # normalize frequencies by a number (e.g. 10000)
				sort_by='frequency', # currently unused, rows are always sorted by frequency - TODO - check if needed
				show_token_id:bool=False, # show token_id in output
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens.

	Rows are sorted by descending frequency, optionally filtered to remove
	punctuation and space tokens, truncated to `n` rows and then ranked from 1.
	Raises a ValueError if `normalize_by` is not an integer.
	"""
	# TODO - add in restrict_to and exclude options - latter is for stopword removal

	# fail fast on a bad argument, bool is rejected explicitly because it is a subclass of int
	if isinstance(normalize_by, bool) or not isinstance(normalize_by, int):
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	start_time = time.time()
	self.corpus._init_frequency_table()

	# NOTE: side effect - the normalized_frequency column is written back to the corpus
	# frequency table, replacing any column of that name left by a previous call
	self.corpus.frequency_table = self.corpus.frequency_table.with_columns(
		(pl.col('frequency') * normalize_by / self.corpus.token_count).alias('normalized_frequency')
	)

	# rank is not selected here, it is regenerated below once filtering has been applied
	columns = ['token', 'frequency', 'normalized_frequency']
	if show_token_id:
		columns.insert(0, 'token_id')

	# TODO - work out what doing with sort_by

	formatted_data = [f'Normalized Frequency is per {normalize_by:,.0f} tokens']

	df = self.corpus.frequency_table.sort('frequency', descending=True)
	# remove rows with is_punct or is_space columns set to True
	# (comparison with False builds a polars expression, so `is False`/`not` cannot be used)
	if exclude_punctuation:
		df = df.filter(pl.col('is_punct') == False)
	if exclude_spaces:
		df = df.filter(pl.col('is_space') == False)

	df = df.select(columns)
	if n:
		df = df.head(n)

	# rank reflects the position in the filtered result, starting from 1
	df = df.with_row_index(name='rank', offset=1)

	# log once the report has been built so the timing covers the sort and filters
	logger.info(f'Frequencies report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'frequencies', df=df, title='Frequencies', description='Frequencies of tokens in the corpus', summary_data={}, formatted_data=formatted_data)


In [None]:
#| hide
# path of the saved corpus file loaded in the example cells below
path_to_corpus_file = '../test-corpora/saved/brown.corpus'

In [None]:
# load the saved corpus from path_to_corpus_file
brown = Corpus().load(path_to_corpus_file)

In [None]:
# instantiate the Frequency class for the loaded corpus
report_brown = Frequency(brown)

In [None]:
# report the top 10 tokens by frequency, with frequencies normalized per 10,000 tokens, and display the result
report_brown.frequencies(n=10, normalize_by=10000).display()

Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,63516,556.72
2,of,36321,318.35
3,and,27787,243.55
4,to,25868,226.73
5,a,22190,194.49
6,in,19751,173.12
7,that,10409,91.23
8,is,10138,88.86
9,was,9931,87.04
10,for,8905,78.05


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()