# keyness

> Functionality for keyness analysis.

In [None]:
#| default_exp keyness

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import numpy as np
import time
import polars as pl
from fastcore.basics import patch
import math

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| export
class Keyness:
	""" Entry point for keyness analysis reports on a corpus. """

	def __init__(
		self,
		corpus:Corpus, # Corpus instance
	):
		# Retain the corpus that was passed in so it is available on the instance
		self.corpus = corpus


In [None]:
# Legacy `keywords` implementation from an earlier version of the library, kept commented out for reference only
# def keywords(self:Keyness, 
# 			 reference_corpus:Corpus, # Reference corpus
# 			 use_cache:bool=True # retrieve the results from cache if available
# 			 ) -> Result: # Result instance
	
# 	start_time = time.time()

# 	cache_id = tuple(['keywords'])
# 	if use_cache == True and cache_id in self.corpus.results_cache:
# 		keyness_report = self.corpus.results_cache[cache_id]
# 	else:

# 		minimum_reference_normalised_freq = math.log2(0.5/reference_corpus['token_count'])

# 		normalised_freq_in_corpus = np.array(list(loaded_corpora[corpus_name]['frequency_lookup'].values()))/loaded_corpora[corpus_name]['token_count']
# 		normalised_freq_in_reference = np.array(list(reference_corpus['frequency_lookup'].values()))/reference_corpus['token_count']

# 		normalised_freq_in_corpus = np.log2(normalised_freq_in_corpus)
# 		normalised_freq_in_reference = np.log2(normalised_freq_in_reference)

# 		normalised_freq_in_corpus = dict(zip(loaded_corpora[corpus_name]['frequency_lookup'].keys(), normalised_freq_in_corpus.tolist()))
# 		normalised_freq_in_reference = dict(zip(reference_corpus['frequency_lookup'].keys(), normalised_freq_in_reference.tolist()))

# 		keyness_data = {}
# 		for token_id in normalised_freq_in_corpus:
# 			if token_id in normalised_freq_in_reference:
# 				log_ratio = normalised_freq_in_corpus[token_id] - normalised_freq_in_reference[token_id]
# 			else:
# 				log_ratio = normalised_freq_in_corpus[token_id] - minimum_reference_normalised_freq
# 			keyness_data[token_id] = {'id': loaded_corpora[corpus_name]['vocab'][token_id], 'log_ratio': log_ratio}

# 		keyness_report = pd.DataFrame.from_dict(keyness_data, orient='index').sort_values(['log_ratio'], ascending=False)

# 		self.corpus.results_cache[cache_id] = keyness_report

# 	return keyness_report

In [None]:
#| hide
# Locations of the saved test corpora, relative to this notebook
(
	path_to_brown_corpus,
	path_to_reuters_corpus,
	path_to_gutenberg_corpus,
	path_to_rnz_corpus,
) = (
	'../test-corpora/saved/brown.corpus',
	'../test-corpora/saved/reuters.corpus',
	'../test-corpora/saved/gutenberg.corpus',
	'../test-corpora/saved/rnz-10k.corpus',
)

In [None]:
# Load each saved test corpus; every corpus gets its own fresh Corpus instance,
# and the loads happen in the order the paths are listed.
brown, reuters, gutenberg, rnz, garden = (
	Corpus().load(saved_path)
	for saved_path in (
		path_to_brown_corpus,
		path_to_reuters_corpus,
		path_to_gutenberg_corpus,
		path_to_rnz_corpus,
		'../test-corpora/saved/garden-party-corpus.corpus',
	)
)

In [None]:
# Run the summary report for each loaded corpus.
# Kept as separate statements so the cell's final expression stays the same.
brown.summary()
reuters.summary()
gutenberg.summary()
rnz.summary()
garden.summary()

In [None]:
from conc.frequency import Frequency

In [None]:
# Build a Frequency report object for each loaded corpus
frequencies_brown = Frequency(brown)
frequencies_reuters = Frequency(reuters)
frequencies_gutenberg = Frequency(gutenberg)
frequencies_rnz = Frequency(rnz)
frequencies_garden = Frequency(garden)

# Display the frequency table of every corpus, all requested with the same
# normalize_by value so the reports use one common scale
frequencies_brown.frequencies(normalize_by=1000000).display()
frequencies_reuters.frequencies(normalize_by=1000000).display()
frequencies_gutenberg.frequencies(normalize_by=1000000).display()
frequencies_rnz.frequencies(normalize_by=1000000).display()
frequencies_garden.frequencies(normalize_by=1000000).display()

In [None]:
from conc.concordance import Concordance

In [None]:
# Create one Concordance object for each of the Brown, Reuters and Gutenberg corpora
brown_concordance, reuters_concordance, gutenberg_concordance = map(
	Concordance,
	(brown, reuters, gutenberg),
)

In [None]:
import string
import re

In [None]:
# Exploratory keyness calculation: rank tokens of a target corpus by the ratio of
# their normalized frequency to the normalized frequency in a reference corpus.
reference = brown
target = garden

# Frequency tables ordered by descending frequency, with punctuation and
# whitespace tokens removed
reference_df = (
	reference.frequency_table
	.sort('frequency', descending=True)
	.filter(pl.col('is_punct') == False)
	.filter(pl.col('is_space') == False)
)
target_df = (
	target.frequency_table
	.sort('frequency', descending=True)
	.filter(pl.col('is_punct') == False)
	.filter(pl.col('is_space') == False)
)

# Floor frequencies: 0.05 occurrences scaled by 1,000,000 over the word token count
# (presumably the same scale as `normalized_frequency` - TODO confirm)
reference_min_freq = 0.05 * 1000000 / reference.word_token_count
# NOTE(review): target_min_freq is computed but not used anywhere below
target_min_freq = 0.05 * 1000000 / target.word_token_count

# Columns that are not needed once the two tables are joined
columns_to_drop = (
	'rank', 'token_id', 'is_punct', 'is_space',
	'rank_reference', 'token_id_reference', 'is_punct_reference', 'is_space_reference',
)

combined_frequency_table = (
	target_df
	# left join on token: every target token is kept, reference columns get the `_reference` suffix
	.join(reference_df, on='token', how='left', suffix = '_reference')
	.drop(*columns_to_drop)
	# tokens with no match in the reference have a null reference frequency;
	# substitute reference_min_freq so the ratio below is defined
	.with_columns(pl.col('normalized_frequency_reference').fill_null(reference_min_freq))
	# separate with_columns call because the ratio must read the filled column
	.with_columns(
		(pl.col('normalized_frequency') / pl.col('normalized_frequency_reference')).alias('relative_risk')
	)
)

# Top 20 tokens by relative risk among tokens with a target frequency above 100
top_by_relative_risk = (
	combined_frequency_table
	.filter(pl.col('frequency') > 100)
	.sort('relative_risk', descending=True)
	.head(20)
)
print(top_by_relative_risk)


In [None]:
# instantiate the Keyness class for the Brown corpus
report_brown = Keyness(brown)

In [None]:
#| hide
# Run nbdev's export step (per the nbdev docs this writes the `#| export` cells to the library module)
import nbdev; nbdev.nbdev_export()