# report

> Reports to aid corpus analysis.

In [None]:
#| default_exp report

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import time
from fastcore.basics import patch

In [None]:
#| export
from conc.result import Result
from conc.core import PAGE_SIZE
from conc.corpus import Corpus
from conc.frequency import Frequency
from conc.ngrams import Ngrams
from conc.concordance import Concordance


In [None]:
#| export
class Report:
	"""Entry point for corpus analysis reports: holds a corpus together with the Frequency, Ngrams and Concordance objects created for it."""
	
	def __init__(self, 
				corpus: Corpus # Corpus instance the reports are run against
				):
		# keep a reference to the corpus itself
		self.corpus = corpus
		# one analysis object per report type, each constructed from the same corpus
		self.frequency_ = Frequency(corpus)
		self.ngrams_ = Ngrams(corpus)
		self.concordance_ = Concordance(corpus)



In [None]:
#| hide
import os

In [None]:
#| hide
# Locations of the test data used by the example cells below.
# expanduser("~") is used instead of os.environ.get("HOME") because the latter
# returns None when HOME is unset, which would silently produce "None/data/".
source_path = f'{os.path.expanduser("~")}/data/'
# saved test corpora live in a sub-directory of the data directory
save_path = f'{source_path}conc-test-corpora/'

path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
# load the Brown corpus from its saved location (path_to_brown_corpus)
brown = Corpus('brown').load(path_to_brown_corpus)

In [None]:
# create a report instance for the corpus; the example cells below call its report methods
report_brown = Report(brown)

In [None]:
#| export
@patch
def frequencies(self: Report,
				case_insensitive:bool=True, # frequencies for tokens lowercased or with case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return
				page_current:int=1, # current page
				show_token_id:bool=False, # show token_id in output
				show_document_frequency:bool=False, # show document frequency in output
				exclude_tokens:list[str]|None=None, # exclude specific tokens from frequency report, can be used to remove stopwords (None or empty list excludes nothing)
				exclude_tokens_text:str = '', # text to explain which tokens have been excluded, will be added to the report notes
				restrict_tokens:list[str]|None=None, # restrict frequency report to return frequencies for a list of specific tokens (None or empty list applies no restriction)
				restrict_tokens_text:str = '', # text to explain which tokens are included, will be added to the report notes
				exclude_punctuation:bool=True, # exclude punctuation tokens
				exclude_spaces:bool=True # exclude space tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent tokens. Delegates to the Frequency instance created for this report's corpus. """
	# Use a fresh empty list per call rather than a shared mutable default, so
	# nothing done to the list downstream can leak into later calls.
	exclude_tokens = [] if exclude_tokens is None else exclude_tokens
	restrict_tokens = [] if restrict_tokens is None else restrict_tokens
	return self.frequency_.frequencies(case_insensitive=case_insensitive,
										normalize_by=normalize_by,
										page_size=page_size,
										page_current=page_current,
										show_token_id=show_token_id,
										show_document_frequency=show_document_frequency,
										exclude_tokens=exclude_tokens,
										exclude_tokens_text=exclude_tokens_text,
										restrict_tokens=restrict_tokens,
										restrict_tokens_text=restrict_tokens_text,
										exclude_punctuation=exclude_punctuation,
										exclude_spaces=exclude_spaces)

In [None]:
# frequency report for the Brown corpus, with frequencies normalized by 10000
report_brown.frequencies(normalize_by=10000).display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus","Frequencies of word tokens, Brown Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,63516,648.03
2,of,36321,370.57
3,and,27787,283.50
4,to,25868,263.92
5,a,22190,226.40
6,in,19751,201.51
7,that,10409,106.20
8,is,10138,103.43
9,was,9931,101.32
10,for,8905,90.85


In [None]:
#| export
@patch
def ngrams(self: Report,
			token_str: str, # token string the ngrams are built around
			ngram_length:int = 2, # number of tokens in each ngram
			ngram_word_position:str = 'LEFT', # whether the token sequence sits on the LEFT, RIGHT, or MIDDLE of the ngrams
			page_size:int = PAGE_SIZE, # how many results to show per results page
			page_current:int = 0, # results page to return; NOTE(review): defaults to 0 here but 1 in frequencies/concordance - confirm intended
			show_all_columns:bool = False, # raw df with all columns, or just ngram and frequency
			use_cache:bool = True # reuse cached results when available
			) -> Result: # Result object holding the ngram data
	""" Report ngrams for a token string, delegating to the Ngrams instance created for this report's corpus. """
	# collect the pass-through options once, then forward them unchanged
	options = dict(
		ngram_length=ngram_length,
		ngram_word_position=ngram_word_position,
		page_size=page_size,
		page_current=page_current,
		show_all_columns=show_all_columns,
		use_cache=use_cache,
	)
	return self.ngrams_.ngrams(token_str, **options)

In [None]:
# ngrams of length 2 with the token 'i' on the left of each ngram
report_brown.ngrams(token_str = 'i', ngram_length = 2, ngram_word_position = 'LEFT').display()

"Ngrams for ""i""","Ngrams for ""i"""
"Ngram length: 2, Token position: left","Ngram length: 2, Token position: left"
Ngram,Frequency
i was,265
i have,211
i 'm,209
i had,194
i do,186
i am,152
i 'll,139
i could,124
i can,107
i would,103


In [None]:
#| export
@patch
def concordance(self: Report,
			token_str: str, # token string to build the concordance for
			context_words:int = 5, # how many words to show either side of the token string
			order:str='1R2R3R', # sort column order
			page_size:int=PAGE_SIZE, # how many results to show per results page
			page_current:int=1, # results page to return
			show_all_columns:bool = False, # df with every column, or only the essentials
			use_cache:bool = True # reuse cached results when available
			) -> Result: # concordance report results
	""" Report a concordance for a token string, delegating to the Concordance instance created for this report's corpus. """
	concordancer = self.concordance_
	return concordancer.concordance(
		token_str,
		context_words=context_words,
		order=order,
		page_size=page_size,
		page_current=page_current,
		show_all_columns=show_all_columns,
		use_cache=use_cache,
	)

In [None]:
# concordance for 'cause' with 10 context words on each side, using sort order '1R2R3R'
report_brown.concordance('cause', context_words = 10, order='1R2R3R').display()

"Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause""","Concordance for ""cause"""
"Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R","Brown Corpus, Context tokens: 10, Order: 1R2R3R"
Document Id,Left,Node,Right
87,abstract principle connected with it -- such as ` `,cause,"'' . all practical purposes , the West stands disunited"
196,to stand or fall only by the merits of my,cause,'' . seven recognized that independence was but the first
100,professionals '' but agitators for some kind of ` `,cause,"'' or ` ` reform '' , and this was"
207,he really wants is to find ` ` a sacred,cause,'' to which he can honestly devote himself . restless
263,` of '' that lost ` ` and '' dying,cause,", ` ` and in the '' ` ` sprung"
200,things happening in the earth and sky with no discernable,cause,", and these they attribute to the will of God"
288,"sign the request , because of illness or other good",cause,", another person who stands in close personal or business"
253,not be at any fault for money for prosecuting the,cause,", for himself will procure it and lay it down"
76,short views -- only up to lunchtime '' . the,cause,", his mood in the fifties rarely rises above the"
240,"Mando , pleading her",cause,", must have said that Dr. Brown was the most"


In [None]:
#| hide
# run nbdev's export step (presumably writes the `#| export` cells to the library module - see nbdev docs)
import nbdev; nbdev.nbdev_export()