# report

> Reports to aid corpus analysis.

In [None]:
#| default_exp report

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
from fastcore.basics import patch

In [None]:
#| export
from conc.result import Result
from conc.core import PAGE_SIZE
from conc.corpus import Corpus
from conc.frequency import Frequency
from conc.ngrams import Ngrams
from conc.concordance import Concordance
from conc.keyness import Keyness

In [None]:
#| export
class Report:
	"""Single entry point that bundles the frequency, ngram, concordance and keyness reports for one corpus."""
	
	def __init__(self, 
				corpus: Corpus # Corpus instance the reports are generated from
				):
		# keep a handle on the corpus; it is passed as the first argument to Keyness in set_reference_corpus
		self.corpus = corpus
		# one report helper per analysis type, each constructed from the same corpus
		self.frequency_ = Frequency(corpus)
		self.ngrams_ = Ngrams(corpus)
		self.concordance_ = Concordance(corpus)
		# no reference corpus yet: assigned by set_reference_corpus and checked by keywords
		self.keyness_ = None


In [None]:
#| hide
import os

In [None]:
#| hide
# Resolve the home directory with os.path.expanduser rather than os.environ.get("HOME"):
# when HOME is unset (e.g. on Windows) .get() returns None and the f-string would
# silently produce paths starting with "None/".
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{source_path}conc-test-corpora/'

# corpus files used by the examples in this notebook
path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
# load (or build) a corpus
reuters = Corpus('reuters').load(path_to_reuters_corpus)

In [None]:
# create a report instance for the corpus
report = Report(reuters)

In [None]:
#| export
@patch
def frequencies(self: Report,
                case_sensitive:bool=False, # frequencies for tokens with or without case preserved 
                normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
                page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
                page_current:int=1, # current page, ignored if page_size is 0
                show_token_id:bool=False, # show token_id in output
                show_document_frequency:bool=False, # show document frequency in output
                exclude_tokens:list[str]|None=None, # exclude specific tokens from frequency report, can be used to remove stopwords (None means no exclusions)
                exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
                restrict_tokens:list[str]|None=None, # restrict frequency report to return frequencies for a list of specific tokens (None means no restriction)
                restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
                exclude_punctuation:bool=True, # exclude punctuation tokens
                exclude_spaces:bool=True # exclude space tokens
                ) -> Result: # return a Result object with the frequency table
    """ Report frequent tokens.

    Forwards every option to the `Frequency` helper created for this report's corpus.
    """
    # A literal `[]` default is a single list object shared by every call and handed to
    # downstream code; build a fresh list per call instead so no state can leak between calls.
    exclude_tokens = [] if exclude_tokens is None else exclude_tokens
    restrict_tokens = [] if restrict_tokens is None else restrict_tokens
    return self.frequency_.frequencies(case_sensitive=case_sensitive,
                                        normalize_by=normalize_by,
                                        page_size=page_size,
                                        page_current=page_current,
                                        show_token_id=show_token_id,
                                        show_document_frequency=show_document_frequency,
                                        exclude_tokens=exclude_tokens,
                                        exclude_tokens_text=exclude_tokens_text,
                                        restrict_tokens=restrict_tokens,
                                        restrict_tokens_text=restrict_tokens_text,
                                        exclude_punctuation=exclude_punctuation,
                                        exclude_spaces=exclude_spaces)

In [None]:
# token frequencies for the Reuters corpus, normalized per 10,000 tokens
report.frequencies(normalize_by=10000).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus","Frequencies of word tokens, Reuters Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,69263,495.17
2,of,36779,262.94
3,to,36328,259.71
4,in,29252,209.12
5,and,25645,183.34
6,said,25379,181.44
7,a,24844,177.61
8,mln,18621,133.12
9,vs,14332,102.46
10,for,13720,98.09


In [None]:
#| export
@patch
def ngrams(self: Report, 
           token_str: str, # token string to get ngrams for 
           ngram_length:int = 2, # length of ngram
           ngram_word_position:str = 'LEFT', # specify if token sequence is on LEFT, RIGHT, or MIDDLE of ngrams
           page_size:int = PAGE_SIZE, # number of results to display per results page 
           page_current:int = 1, # current page of results, pages are counted from 1 as in the other report methods
           show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
           use_cache:bool = True # retrieve the results from cache if available
           ) -> Result: # return a Result object with ngram data
    """ Report ngrams for a token string.

    Forwards every option to the `Ngrams` helper created for this report's corpus.
    """
    # NOTE(review): the default for page_current was 0 while frequencies, concordance and
    # keywords all default to 1; a default call appeared not to return the first page of
    # results. Confirm against Ngrams.ngrams that pages are 1-based.
    return self.ngrams_.ngrams(token_str,
                               ngram_length=ngram_length,
                               ngram_word_position=ngram_word_position,
                               page_size=page_size,
                               page_current=page_current,
                               show_all_columns=show_all_columns,
                               use_cache=use_cache)

In [None]:
# bigrams in which 'i' is the left-most token
report.ngrams(token_str = 'i', ngram_length = 2, ngram_word_position = 'LEFT').display()

"Ngrams for ""i""","Ngrams for ""i""","Ngrams for ""i"""
"Ngram length: 2, Token position: left","Ngram length: 2, Token position: left","Ngram length: 2, Token position: left"
Rank,Ngram,Frequency
81,i welcome,1
82,i predict,1
83,i consider,1
84,i forecast,1
85,i judge,1
86,i now,1
87,i buy,1
88,i eat,1
89,i 'll,1
90,i still,1


In [None]:
#| export
@patch
def concordance(self: Report, 
                token_str: str, # token string to get concordance for 
                context_words:int = 5, # number of words to show on left and right of token string
                order:str='1R2R3R', # order of sort columns
                page_size:int=PAGE_SIZE, # number of results to display per results page
                page_current:int=1, # current page of results
                show_all_columns:bool = False, # df with all columns or just essentials
                use_cache:bool = True # retrieve the results from cache if available
                ) -> Result: # concordance report results
    """ Build a concordance report for a token string using this report's `Concordance` helper. """
    # gather the pass-through options once, then delegate
    options = dict(
        context_words=context_words,
        order=order,
        page_size=page_size,
        page_current=page_current,
        show_all_columns=show_all_columns,
        use_cache=use_cache,
    )
    return self.concordance_.concordance(token_str, **options)

In [None]:
# concordance lines for 'cause' with 10 context words on each side, using the 1R2R3R sort order
report.concordance('cause', context_words = 10, order='1R2R3R').display()

"Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause"""
"Reuters Corpus, Context tokens: 10, Order: 1R2R3R","Reuters Corpus, Context tokens: 10, Order: 1R2R3R","Reuters Corpus, Context tokens: 10, Order: 1R2R3R","Reuters Corpus, Context tokens: 10, Order: 1R2R3R"
Document Id,Left,Node,Right
1802,dramatically this year because the jump in inflation will,cause,"a falloff in the growth of disposable income ,"
4127,", said the exchange is non - dilutive and will",cause,a slight increase in Innovex 's fully diluted earnings
5393,Berger also said that a further devalued dollar may,cause,"economic depressions in some U.S. trading partners , and"
5160,"dlrs a share for fiscal 1987 "" are expected to",cause,even the aggressive side of this range to be
5717,will not take any short - term decisions which might,cause,"markets to panic , "" Dauster added ."
9574,He said Iraq 's President Saddam Hussein was the main,cause,of tension in the Gulf and said Iran would
9628,He said Iraq 's President Saddam Hussein was the main,cause,of tension in the Gulf and said Iran would
6565,hit seven pct of Sri Lanka 's plantations and may,cause,"output to drop below its 1987 target of 143,000"
7550,Reuters . Impending crop problems in Argentina will likely,cause,"those prices to rise , and with the recently"
9209,Pacific region 's disrupted weather patterns to any single,cause,", El Nino 's role is being closely studied ,"


In [None]:
#| export
@patch
def set_reference_corpus(self: Report, 
                    corpus: Corpus  # Reference corpus
                    ) -> None:
    """ Attach a reference corpus so that keyness reports can be produced. """
    # pair this report's own corpus with the supplied reference corpus
    own_corpus = self.corpus
    self.keyness_ = Keyness(own_corpus, corpus)

In [None]:
# load (or build) a corpus
brown = Corpus('brown').load(path_to_brown_corpus)
report.set_reference_corpus(brown)

In [None]:
#| export
@patch
def keywords(self: Report,
                effect_size_measure:str = 'log_ratio', # effect size measure to use, currently only 'log_ratio' is supported and anything else is ignored
                statistical_significance_measure:str = 'log_likelihood', # statistical significance measure to use, currently only 'log_likelihood' is supported and anything else is ignored
                order:str = 'log_ratio', # column to order the results by: log_ratio, log_likelihood, frequency, frequency_reference, document_frequency, document_frequency_reference
                order_descending:bool = True, # order is descending or ascending
                statistical_significance_cut: float = 0.0, # statistical significance cut-off, e.g. 0.05 or 0.01 or 0.001
                apply_bonferroni:bool = False, # apply Bonferroni correction to the statistical significance cut-off
                min_document_frequency: int = 0, # minimum document frequency in target for token to be included in the report
                min_document_frequency_reference: int = 0, # minimum document frequency in reference for token to be included in the report
                min_frequency: int = 0, # minimum frequency in target for token to be included in the report
                min_frequency_reference: int = 0, # minimum frequency in reference for token to be included in the report
                case_sensitive:bool=False, # frequencies for tokens with or without case preserved 
                normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
                page_size:int=PAGE_SIZE, # number of rows to return, if 0 returns all
                page_current:int=1, # current page, ignored if page_size is 0
                show_document_frequency:bool=False, # show document frequency in output
                exclude_tokens:list[str]|None=None, # exclude specific tokens from keyword report, can be used to remove stopwords (None means no exclusions)
                exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
                restrict_tokens:list[str]|None=None, # restrict keyword report to a list of specific tokens (None means no restriction)
                restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
                exclude_punctuation:bool=True, # exclude punctuation tokens
                exclude_spaces:bool=True # exclude space tokens
                ) -> Result: # return a Result object with the keywords table
    """ Get keywords for the corpus.

    Forwards every option to the `Keyness` helper created by `set_reference_corpus`.
    Raises `ValueError` if no reference corpus has been set.
    """
    if self.keyness_ is None:
        raise ValueError("Reference corpus is not set. Use 'set_reference_corpus' to set the reference corpus.")
    # A literal `[]` default is a single list object shared by every call and handed to
    # downstream code; build a fresh list per call instead so no state can leak between calls.
    exclude_tokens = [] if exclude_tokens is None else exclude_tokens
    restrict_tokens = [] if restrict_tokens is None else restrict_tokens
    return self.keyness_.keywords(effect_size_measure=effect_size_measure,
                                    statistical_significance_measure=statistical_significance_measure,
                                    order=order,
                                    order_descending=order_descending,
                                    statistical_significance_cut=statistical_significance_cut,
                                    apply_bonferroni=apply_bonferroni,
                                    min_document_frequency=min_document_frequency,
                                    min_document_frequency_reference=min_document_frequency_reference,
                                    min_frequency=min_frequency,
                                    min_frequency_reference=min_frequency_reference,
                                    case_sensitive=case_sensitive,
                                    normalize_by=normalize_by,
                                    page_size=page_size,
                                    page_current=page_current,
                                    show_document_frequency=show_document_frequency,
                                    exclude_tokens=exclude_tokens,
                                    exclude_tokens_text=exclude_tokens_text,
                                    restrict_tokens=restrict_tokens,
                                    restrict_tokens_text=restrict_tokens_text,
                                    exclude_punctuation=exclude_punctuation,
                                    exclude_spaces=exclude_spaces)

In [None]:
# keywords relative to the reference corpus set above, using a 0.001 significance cut-off
# and requiring a document frequency of at least 5 in the reference corpus
report.keywords(statistical_significance_cut = 0.001, min_document_frequency_reference=5).display()

Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords
"Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus","Target corpus: Reuters Corpus, Reference corpus: Brown Corpus"
Rank,Token,Frequency,Normalized Frequency,Frequency Reference,Normalized Frequency Reference,Relative Risk,Log Ratio,Log Likelihood
1,net,6988,49.96,31,0.32,157.95,7.30,7078.84
2,dividend,1041,7.44,6,0.06,121.57,6.93,1042.37
3,exports,1214,8.68,10,0.10,85.07,6.41,1191.05
4,4th,840,6.01,8,0.08,73.57,6.20,815.81
5,securities,839,6.00,8,0.08,73.49,6.20,814.76
6,currency,818,5.85,8,0.08,71.65,6.16,792.86
7,subsidiary,630,4.50,7,0.07,63.06,5.98,604.46
8,billion,5828,41.66,65,0.66,62.83,5.97,5589.95
9,pact,428,3.06,5,0.05,59.98,5.91,408.89
10,profit,2960,21.16,36,0.37,57.61,5.85,2817.73


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()