# keyness

> Functionality for keyness analysis.

In [None]:
#| default_exp keyness

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
import math

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| export
class Keyness:
	""" Reporting class for keyness analysis; holds the corpus it is created with. """

	def __init__(
		self,
		corpus:Corpus, # Corpus instance
	):
		# keep a reference to the corpus on the instance
		self.corpus = corpus


In [None]:
# Legacy `keywords` implementation from an earlier version of the library, kept commented out for reference only (not executed).
# def keywords(self:Keyness, 
# 			 reference_corpus:Corpus, # Reference corpus
# 			 use_cache:bool=True # retrieve the results from cache if available
# 			 ) -> Result: # Result instance
	
# 	start_time = time.time()

# 	cache_id = tuple(['keywords'])
# 	if use_cache == True and cache_id in self.corpus.results_cache:
# 		keyness_report = self.corpus.results_cache[cache_id]
# 	else:

# 		minimum_reference_normalised_freq = math.log2(0.5/reference_corpus['token_count'])

# 		normalised_freq_in_corpus = np.array(list(loaded_corpora[corpus_name]['frequency_lookup'].values()))/loaded_corpora[corpus_name]['token_count']
# 		normalised_freq_in_reference = np.array(list(reference_corpus['frequency_lookup'].values()))/reference_corpus['token_count']

# 		normalised_freq_in_corpus = np.log2(normalised_freq_in_corpus)
# 		normalised_freq_in_reference = np.log2(normalised_freq_in_reference)

# 		normalised_freq_in_corpus = dict(zip(loaded_corpora[corpus_name]['frequency_lookup'].keys(), normalised_freq_in_corpus.tolist()))
# 		normalised_freq_in_reference = dict(zip(reference_corpus['frequency_lookup'].keys(), normalised_freq_in_reference.tolist()))

# 		keyness_data = {}
# 		for token_id in normalised_freq_in_corpus:
# 			if token_id in normalised_freq_in_reference:
# 				log_ratio = normalised_freq_in_corpus[token_id] - normalised_freq_in_reference[token_id]
# 			else:
# 				log_ratio = normalised_freq_in_corpus[token_id] - minimum_reference_normalised_freq
# 			keyness_data[token_id] = {'id': loaded_corpora[corpus_name]['vocab'][token_id], 'log_ratio': log_ratio}

# 		keyness_report = pd.DataFrame.from_dict(keyness_data, orient='index').sort_values(['log_ratio'], ascending=False)

# 		self.corpus.results_cache[cache_id] = keyness_report

# 	return keyness_report

In [None]:
#| hide
# Locations of the saved test corpora, relative to this notebook.
path_to_gutenberg_corpus = '../test-corpora/saved/gutenberg.corpus'
path_to_reuters_corpus = '../test-corpora/saved/reuters.corpus'
path_to_brown_corpus = '../test-corpora/saved/brown.corpus'


In [None]:
# Load the three saved test corpora (Brown, Reuters, Gutenberg) from the paths defined above.
brown = Corpus().load(path_to_brown_corpus)
reuters = Corpus().load(path_to_reuters_corpus)
gutenberg = Corpus().load(path_to_gutenberg_corpus)


In [None]:
# Show the summary table for each loaded corpus.
brown.summary()
reuters.summary()
gutenberg.summary()


Corpus Summary,Corpus Summary
Attribute,Value
Name,Brown Corpus
Description,"A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA Revised 1971, Revised and Amplified 1979 http://www.hit.uib.no/icame/brown/bcm.html"
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/brown.corpus
Source Path,../test-corpora/source/brown.csv.gz
Document Count,500
Token Count,1140905
Unique Tokens,42937
Word Token Count,980844
Unique Word Tokens,42907


Corpus Summary,Corpus Summary
Attribute,Value
Name,Reuters Corpus
Description,From NLTK TODO
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/reuters.corpus
Source Path,../test-corpora/source/reuters.csv.gz
Document Count,10788
Token Count,1726826
Unique Tokens,50047
Word Token Count,1409770
Unique Word Tokens,49860


Corpus Summary,Corpus Summary
Attribute,Value
Name,Gutenberg Corpus
Description,From NLTK TODO
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/gutenberg.corpus
Source Path,../test-corpora/source/gutenberg.csv.gz
Document Count,18
Token Count,2777046
Unique Tokens,51590
Word Token Count,2162511
Unique Word Tokens,51393


In [None]:
from conc.frequency import Frequency

In [None]:
# One Frequency reporter per corpus.
frequencies_brown = Frequency(brown)
frequencies_reuters = Frequency(reuters)
frequencies_gutenberg = Frequency(gutenberg)

# Display each corpus's frequency table in turn, using normalize_by=1000000.
for frequency_report in (frequencies_brown, frequencies_reuters, frequencies_gutenberg):
	frequency_report.frequencies(normalize_by=1000000).display()


Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,63516,64756.48
2,of,36321,37030.35
3,and,27787,28329.68
4,to,25868,26373.21
5,a,22190,22623.37
6,in,19751,20136.74
7,that,10409,10612.29
8,is,10138,10336.00
9,was,9931,10124.95
10,for,8905,9078.92


Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,69263,49130.71
2,of,36779,26088.65
3,to,36328,25768.74
4,in,29252,20749.48
5,and,25645,18190.91
6,said,25379,18002.23
7,a,24844,17622.73
8,mln,18621,13208.54
9,vs,14332,10166.20
10,for,13720,9732.08


Frequencies,Frequencies,Frequencies,Frequencies
Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus,Frequencies of tokens in the corpus
Rank,Token,Frequency,Normalized Frequency
1,the,133482,61725.47
2,and,95243,44042.78
3,of,71241,32943.65
4,to,48003,22197.81
5,a,33854,15654.95
6,in,33530,15505.12
7,i,29872,13813.57
8,that,28728,13284.56
9,he,25774,11918.55
10,it,22107,10222.84


In [None]:
from conc.concordance import Concordance

In [None]:
# One Concordance reporter per corpus.
# NOTE(review): none of these are referenced by the later cells visible in this notebook — confirm they are still needed.
brown_concordance = Concordance(brown)
reuters_concordance = Concordance(reuters)
gutenberg_concordance = Concordance(gutenberg)

In [None]:
import string
import re

In [None]:
# Prototype keyness comparison: rank tokens of the target corpus by how much more
# frequent they are there than in the reference corpus.
reference = brown
target = gutenberg

# Floor values: 0.05 occurrences expressed per million word tokens of each corpus.
# Only reference_min_freq is consumed below; target_min_freq is computed but not used in this cell.
reference_min_freq = 0.05 * 1000000 / reference.word_token_count
target_min_freq = 0.05 * 1000000 / target.word_token_count

# Frequency tables ordered by descending frequency, with punctuation and whitespace tokens removed.
reference_df = (
	reference.frequency_table
	.sort('frequency', descending=True)
	.filter((pl.col('is_punct') == False) & (pl.col('is_space') == False))
)
target_df = (
	target.frequency_table
	.sort('frequency', descending=True)
	.filter((pl.col('is_punct') == False) & (pl.col('is_space') == False))
)

# Left join on token keeps every target token; clashing columns from the reference table
# receive the '_reference' suffix, and the bookkeeping columns of both sides are dropped.
combined_frequency_table = (
	target_df
	.join(reference_df, on='token', how='left', suffix = '_reference')
	.drop('rank', 'token_id', 'is_punct', 'is_space', 'rank_reference', 'token_id_reference', 'is_punct_reference', 'is_space_reference')
	# tokens missing from the reference corpus have a null reference frequency: substitute reference_min_freq
	.with_columns(pl.col('normalized_frequency_reference').fill_null(reference_min_freq))
	# ratio of the target normalized frequency to the (filled) reference normalized frequency
	.with_columns((pl.col('normalized_frequency')/pl.col('normalized_frequency_reference')).alias('relative_risk'))
)

# Top 20 tokens by relative risk among those occurring more than 100 times in the target corpus.
print(
	combined_frequency_table
	.filter(pl.col('frequency') > 100)
	.sort('relative_risk', descending=True)
	.head(20)
)


┌───────────┬───────────┬───────────────────┬───────────────────┬──────────────────┬───────────────┐
│ frequency ┆ token     ┆ normalized_freque ┆ frequency_referen ┆ normalized_frequ ┆ relative_risk │
│           ┆           ┆ ncy               ┆ ce                ┆ ency_reference   ┆               │
╞═══════════╪═══════════╪═══════════════════╪═══════════════════╪══════════════════╪═══════════════╡
│ 1673      ┆ shalt     ┆ 773.637683        ┆ null              ┆ 0.050977         ┆ 15176.357595  │
│ 1246      ┆ whale     ┆ 576.18204         ┆ null              ┆ 0.050977         ┆ 11302.893941  │
│ 817       ┆ judah     ┆ 377.801546        ┆ null              ┆ 0.050977         ┆ 7411.2876     │
│ 652       ┆ spake     ┆ 301.501357        ┆ null              ┆ 0.050977         ┆ 5914.515931   │
│ 627       ┆ marianne  ┆ 289.940722        ┆ null              ┆ 0.050977         ┆ 5687.732344   │
│ 591       ┆ ahab      ┆ 273.293408        ┆ null              ┆ 0.050977         ┆ 5361.1

In [None]:
# instantiate the Keyness class for the Brown corpus
report_brown = Keyness(brown)

In [None]:
#| hide
# Run nbdev's export step so the `#| export` cells are written out to the library module.
import nbdev; nbdev.nbdev_export()