# keyness

> Functionality for keyness analysis.

In [None]:
#| default_exp keyness

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
import math

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| export
class Keyness:
	""" Reporting interface for keyness analysis on a corpus. """

	def __init__(
		self,
		corpus: Corpus, # Corpus instance
	):
		# store the corpus on the instance
		self.corpus = corpus


In [None]:
# Legacy implementation from an earlier version of the library, kept commented out for reference only (it is not executed).
# def keywords(self:Keyness, 
# 			 reference_corpus:Corpus, # Reference corpus
# 			 use_cache:bool=True # retrieve the results from cache if available
# 			 ) -> Result: # Result instance
	
# 	start_time = time.time()

# 	cache_id = tuple(['keywords'])
# 	if use_cache == True and cache_id in self.corpus.results_cache:
# 		keyness_report = self.corpus.results_cache[cache_id]
# 	else:

# 		minimum_reference_normalised_freq = math.log2(0.5/reference_corpus['token_count'])

# 		normalised_freq_in_corpus = np.array(list(loaded_corpora[corpus_name]['frequency_lookup'].values()))/loaded_corpora[corpus_name]['token_count']
# 		normalised_freq_in_reference = np.array(list(reference_corpus['frequency_lookup'].values()))/reference_corpus['token_count']

# 		normalised_freq_in_corpus = np.log2(normalised_freq_in_corpus)
# 		normalised_freq_in_reference = np.log2(normalised_freq_in_reference)

# 		normalised_freq_in_corpus = dict(zip(loaded_corpora[corpus_name]['frequency_lookup'].keys(), normalised_freq_in_corpus.tolist()))
# 		normalised_freq_in_reference = dict(zip(reference_corpus['frequency_lookup'].keys(), normalised_freq_in_reference.tolist()))

# 		keyness_data = {}
# 		for token_id in normalised_freq_in_corpus:
# 			if token_id in normalised_freq_in_reference:
# 				log_ratio = normalised_freq_in_corpus[token_id] - normalised_freq_in_reference[token_id]
# 			else:
# 				log_ratio = normalised_freq_in_corpus[token_id] - minimum_reference_normalised_freq
# 			keyness_data[token_id] = {'id': loaded_corpora[corpus_name]['vocab'][token_id], 'log_ratio': log_ratio}

# 		keyness_report = pd.DataFrame.from_dict(keyness_data, orient='index').sort_values(['log_ratio'], ascending=False)

# 		self.corpus.results_cache[cache_id] = keyness_report

# 	return keyness_report

In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with expanduser rather than os.environ.get("HOME"):
# the latter returns None when HOME is unset, which silently produces paths
# starting with "None/". When HOME is set the resulting paths are unchanged.
home_path = os.path.expanduser('~')

# base directories for the source data and the built test corpora
source_path = f'{home_path}/data/'
save_path = f'{home_path}/data/conc-test-corpora/'

# locations of the individual test corpora
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'
path_to_gutenberg_corpus = f'{save_path}gutenberg.corpus'
path_to_gardenparty_corpus = f'{save_path}garden-party.corpus'
path_to_congress_corpus = f'{save_path}us-congressional-speeches-subset-100k.corpus'

In [None]:
# load each test corpus from its saved location (same order as the paths are listed)
toy, brown, reuters, gutenberg, gardenparty, congress = [
    Corpus().load(corpus_path)
    for corpus_path in (
        path_to_toy_corpus,
        path_to_brown_corpus,
        path_to_reuters_corpus,
        path_to_gutenberg_corpus,
        path_to_gardenparty_corpus,
        path_to_congress_corpus,
    )
]


In [None]:
# call summary() on each of these corpora in turn
for loaded_corpus in (brown, reuters, gardenparty, congress):
    loaded_corpus.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/."
Date Created,2025-06-04 14:32:45
Conc Version,0.0.1
Corpus Path,/home/geoff/data/conc-test-corpora/brown.corpus
Document Count,500
Token Count,1140905
Word Token Count,980144
Unique Tokens,42937
Unique Word Tokens,42907


Corpus Summary,Corpus Summary
Attribute,Value
Name,Reuters Corpus
Description,"Reuters corpus (Reuters-21578, Distribution 1.0). ""The copyright for the text of newswire articles and Reuters annotations in the Reuters-21578 collection resides with Reuters Ltd. Reuters Ltd. and Carnegie Group, Inc. have agreed to allow the free distribution of this data *for research purposes only*. If you publish results based on this data set, please acknowledge its use, refer to the data set by the name (Reuters-21578, Distribution 1.0), and inform your readers of the current location of the data set."" https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html. This version downloaded via NLTK https://www.nltk.org/nltk_data/."
Date Created,2025-06-04 14:32:50
Conc Version,0.0.1
Corpus Path,/home/geoff/data/conc-test-corpora/reuters.corpus
Document Count,10788
Token Count,1726826
Word Token Count,1398782
Unique Tokens,50047
Unique Word Tokens,49860


Corpus Summary,Corpus Summary
Attribute,Value
Name,Garden Party Corpus
Description,A corpus of short stories from The Garden Party: and Other Stories by Katherine Mansfield. Texts downloaded from Project Gutenberg https://gutenberg.org/ and are in the public domain. The text files contain the short story without the title. https://github.com/ucdh/scraping-garden-party
Date Created,2025-06-04 14:32:59
Conc Version,0.0.1
Corpus Path,/home/geoff/data/conc-test-corpora/garden-party.corpus
Document Count,15
Token Count,79940
Word Token Count,63311
Unique Tokens,5417
Unique Word Tokens,5398


Corpus Summary,Corpus Summary
Attribute,Value
Name,US Congressional Speeches Subset 100k
Description,"1 million speeches sampled from https://huggingface.co/datasets/Eugleo/us-congressional-speeches-subset to create corpora of varying sizes for development and testing. The dataset card at Huggingface is empty, so there is no further information available on the contents. The title indicates how many speeches are included in this corpus."
Date Created,2025-06-04 14:52:37
Conc Version,0.0.1
Corpus Path,/home/geoff/data/conc-test-corpora/us-congressional-speeches-subset-100k.corpus
Document Count,100000
Token Count,20027241
Word Token Count,18020769
Unique Tokens,214503
Unique Word Tokens,214175


In [None]:
from conc.frequency import Frequency

In [None]:
# one Frequency reporter per corpus
freq_brown, freq_reuters, freq_gutenberg, freq_gardenparty, freq_congress = [
    Frequency(loaded_corpus)
    for loaded_corpus in (brown, reuters, gutenberg, gardenparty, congress)
]

# top five rows per corpus, normalized per 1,000,000 tokens; the Garden Party
# report additionally requests the document frequency column
for frequency_reporter, extra_options in (
    (freq_brown, {}),
    (freq_reuters, {}),
    (freq_gutenberg, {}),
    (freq_gardenparty, {'show_document_frequency': True}),
    (freq_congress, {}),
):
    frequency_reporter.frequencies(normalize_by=1000000, page_size=5, **extra_options).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,63516,64802.72
2,of,36321,37056.80
3,and,27787,28349.92
4,to,25868,26392.04
5,a,22190,22639.53
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens"
"Total word tokens: 980,144","Total word tokens: 980,144","Total word tokens: 980,144","Total word tokens: 980,144"
"Unique tokens: 42,907","Unique tokens: 42,907","Unique tokens: 42,907","Unique tokens: 42,907"
Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows


Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,69263,49516.65
2,of,36779,26293.59
3,to,36328,25971.17
4,in,29252,20912.48
5,and,25645,18333.81
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens"
"Total word tokens: 1,398,782","Total word tokens: 1,398,782","Total word tokens: 1,398,782","Total word tokens: 1,398,782"
"Unique tokens: 49,860","Unique tokens: 49,860","Unique tokens: 49,860","Unique tokens: 49,860"
Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows


Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Gutenberg Corpus","Frequencies of word tokens, Gutenberg Corpus","Frequencies of word tokens, Gutenberg Corpus","Frequencies of word tokens, Gutenberg Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,133482,61731.69
2,and,95243,44047.22
3,of,71241,32946.97
4,to,48003,22200.04
5,a,33854,15656.53
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens"
"Total word tokens: 2,162,293","Total word tokens: 2,162,293","Total word tokens: 2,162,293","Total word tokens: 2,162,293"
"Unique tokens: 51,393","Unique tokens: 51,393","Unique tokens: 51,393","Unique tokens: 51,393"
Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows


Frequencies,Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus"
Rank,Token,Frequency,Document Frequency,Normalized Frequency
1,the,2911,15,45979.37
2,and,1798,15,28399.49
3,“,1615,15,25509.00
4,”,1614,15,25493.20
5,a,1407,15,22223.63
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens"
"Total word tokens: 63,311","Total word tokens: 63,311","Total word tokens: 63,311","Total word tokens: 63,311","Total word tokens: 63,311"
"Unique tokens: 5,398","Unique tokens: 5,398","Unique tokens: 5,398","Unique tokens: 5,398","Unique tokens: 5,398"
Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows


Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, US Congressional Speeches Subset 100k","Frequencies of word tokens, US Congressional Speeches Subset 100k","Frequencies of word tokens, US Congressional Speeches Subset 100k","Frequencies of word tokens, US Congressional Speeches Subset 100k"
Rank,Token,Frequency,Normalized Frequency
1,the,1389439,77102.09
2,of,687127,38129.73
3,to,610266,33864.59
4,and,459220,25482.82
5,in,379946,21083.78
Report based on word tokens,Report based on word tokens,Report based on word tokens,Report based on word tokens
"Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens","Normalized Frequency is per 1,000,000 tokens"
"Total word tokens: 18,020,769","Total word tokens: 18,020,769","Total word tokens: 18,020,769","Total word tokens: 18,020,769"
"Unique tokens: 214,175","Unique tokens: 214,175","Unique tokens: 214,175","Unique tokens: 214,175"
Showing 5 rows,Showing 5 rows,Showing 5 rows,Showing 5 rows


In [None]:
# Keyness of the target corpus measured against the reference corpus.
# TODO - validate the results against an independent implementation
reference = gutenberg
target = gardenparty

frequency_column = 'frequency_lower'
ignore_column = 'frequency_orth'
reference_frequency_column = f'{frequency_column}_reference'

# drop punctuation and whitespace tokens from both vocabularies
reference_df = reference.vocab.filter(pl.col('is_punct') == False).filter(pl.col('is_space') == False)
target_df = target.vocab.filter(pl.col('is_punct') == False).filter(pl.col('is_space') == False)

# smoothing values: the normalized (per million) frequency of a notional raw count of 0.05,
# substituted for missing frequencies so the ratios below never divide by zero or take log(0)
reference_min_freq = (0.05 * 1000000) / reference.word_token_count
target_min_freq = (0.05 * 1000000) / target.word_token_count

# left join keeps every target token; tokens absent from the reference get null reference columns
combined_frequency_table = target_df.join(reference_df, on='token', how='left', suffix = '_reference').drop('rank', 'token_id', 'is_punct', 'is_space', ignore_column, 'rank_reference', 'token_id_reference', 'is_punct_reference', 'is_space_reference', f'{ignore_column}_reference')

# Raw counts are cast to floats before any arithmetic. Multiplying the stored integer counts
# by 1000000 directly wraps around for frequent tokens (e.g. 133482 * 1000000 exceeds 2**32),
# which produced far too small normalized frequencies for the most common words.
observed = pl.col(frequency_column).cast(pl.Float64)
observed_reference = pl.col(reference_frequency_column).cast(pl.Float64)

# normalized frequencies per 1,000,000 word tokens
combined_frequency_table = combined_frequency_table.with_columns(
    (observed * 1000000 / target.word_token_count).alias('normalized_frequency')
)

combined_frequency_table = combined_frequency_table.with_columns(
    (observed_reference * 1000000 / reference.word_token_count).alias('normalized_frequency_reference')
)

# effect size measures, using the smoothing values where a frequency is missing
combined_frequency_table = combined_frequency_table.with_columns(pl.col('normalized_frequency').fill_null(target_min_freq))
combined_frequency_table = combined_frequency_table.with_columns(pl.col('normalized_frequency_reference').fill_null(reference_min_freq))
combined_frequency_table = combined_frequency_table.with_columns((pl.col('normalized_frequency')/pl.col('normalized_frequency_reference')).alias('relative_risk'))
combined_frequency_table = combined_frequency_table.with_columns((pl.col('normalized_frequency').log(2) - pl.col('normalized_frequency_reference').log(2)).alias('log_ratio'))

# Log-likelihood (G2) is computed from the raw observed counts (a missing count is 0):
#   expected = corpus_size * (observed + observed_reference) / (corpus_size + reference_size)
#   G2 = 2 * (observed * ln(observed / expected) + observed_reference * ln(observed_reference / expected_reference))
# A term with an observed count of 0 contributes 0.
observed_count = observed.fill_null(0.0)
observed_count_reference = observed_reference.fill_null(0.0)
combined_token_count = target.word_token_count + reference.word_token_count

combined_frequency_table = combined_frequency_table.with_columns(
    (observed_count + observed_count_reference).alias('total_frequency')
)
combined_frequency_table = combined_frequency_table.with_columns(
    (pl.col('total_frequency') * target.word_token_count / combined_token_count).alias('expected_frequency'),
    (pl.col('total_frequency') * reference.word_token_count / combined_token_count).alias('expected_frequency_reference')
)

target_term = pl.when(observed_count > 0).then(
    observed_count * (observed_count / pl.col('expected_frequency')).log()
).otherwise(0.0)
reference_term = pl.when(observed_count_reference > 0).then(
    observed_count_reference * (observed_count_reference / pl.col('expected_frequency_reference')).log()
).otherwise(0.0)

combined_frequency_table = combined_frequency_table.with_columns(
    (2 * (target_term + reference_term)).alias('log_likelihood_ratio')
)

combined_frequency_table = combined_frequency_table.collect()

# flag common significance levels
combined_frequency_table = combined_frequency_table.with_columns(
    #(pl.col('log_likelihood_ratio') > 3.84).alias('significant_at_95'),
    #(pl.col('log_likelihood_ratio') > 6.63).alias('significant_at_99'),
    (pl.col('log_likelihood_ratio') > 10.83).alias('significant_at_99_9')
)

# was using a threshold of 5 min previously (pl.col(f'{frequency_column}_reference') > 5) & (pl.col(f'{frequency_column}') > 5)
# the 19.82 cut-off is slightly stricter than the p < 0.00001 critical value listed below
print(combined_frequency_table.filter(pl.col('log_likelihood_ratio') > 19.82).drop(['frequency_lower', 'expected_frequency', 'expected_frequency_reference', 'total_frequency', 'frequency_lower_reference', 'relative_risk']).sort('log_ratio', descending=True).head(50))

# chi-squared critical values (1 degree of freedom) for the log-likelihood statistic:
# at 95% significance level, p is 0.05 and chi-squared is 3.84
# at 99% significance level, p is 0.01 and chi-squared is 6.63
# at 99.9% significance level, p is 0.001 and chi-squared is 10.83
# at 99.99% significance level, p is 0.0001 and chi-squared is 15.13
# at 99.999% significance level, p is 0.00001 and chi-squared is 19.51
# at 99.9999% significance level, p is 0.000001 and chi-squared is 23.93
# TODO - bonferroni correction - and other relevant corrections



┌───────────┬──────────────────────┬────────────────────────────────┬───────────┬──────────────────────┬─────────────────────┐
│ token     ┆ normalized_frequency ┆ normalized_frequency_reference ┆ log_ratio ┆ log_likelihood_ratio ┆ significant_at_99_9 │
╞═══════════╪══════════════════════╪════════════════════════════════╪═══════════╪══════════════════════╪═════════════════════╡
│ the       ┆ 45979.371673         ┆ 156.321934                     ┆ 8.200323  ┆ 155.792267           ┆ true                │
│ she       ┆ 18495.995956         ┆ 185.943999                     ┆ 6.636201  ┆ 184.093272           ┆ true                │
│ him       ┆ 3016.853311          ┆ 35.655719                      ┆ 6.402767  ┆ 35.239232            ┆ true                │
│ and       ┆ 28399.488241         ┆ 348.57417                      ┆ 6.348255  ┆ 344.347661           ┆ true                │
│ they      ┆ 6286.427319          ┆ 90.689889                      ┆ 6.115155  ┆ 89.400175            ┆ true  

In [None]:
# instantiate the Keyness class for the Brown corpus
report_brown = Keyness(brown)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()