# ngrams

> Functionality for ngram analysis.

In [None]:
#| default_exp ngrams

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import numpy as np
import time
import polars as pl
from spacy.attrs import ORTH, LOWER # remove? - add ENT_TYPE, ENT_IOB
import math
from fastcore.basics import patch

In [None]:
#| export
from conc.corpus import Corpus
from conc.result import Result
from conc.core import logger, PAGE_SIZE

In [None]:
#| hide
from conc.core import set_logger_state

In [None]:
#| hide
import os

In [None]:
#| hide
source_path = f'{os.environ.get("HOME")}/data/'
save_path = f'{os.environ.get("HOME")}/data/conc-test-corpora/'

path_to_toy_corpus = f'{save_path}toy.corpus'
path_to_brown_corpus = f'{save_path}brown.corpus'
path_to_reuters_corpus = f'{save_path}reuters.corpus'

In [None]:
#| hide
from conc.corpus import build_test_corpora

In [None]:
#| hide
build_test_corpora(source_path, save_path, force_rebuild=False)

In [None]:
#| export
class Ngrams:
	""" Class for n-gram analysis reporting. """
	# Holds a reference to a corpus; the reporting methods in this module are attached 
	# to the class with fastcore's `@patch` and read their data through `self.corpus`.
	def __init__(self,
			  corpus:Corpus # Corpus instance
			  ): 
		# stored as-is: nothing is copied, loaded or computed at construction time
		self.corpus = corpus


In [None]:
#| hide
from conc.core import show_toy_corpus

In [None]:
#| hide
toy = Corpus().load(path_to_toy_corpus)
ngrams_toy = Ngrams(toy)

In [None]:
#| exporti
@patch
def _get_ngrams(self:Ngrams, 
			   token_sequence: list[np.ndarray], # token sequence to get index for 
			   index_id: int, # index to search (i.e. ORTH, LOWER)
			   token_positions: list[np.ndarray], # positions of token sequence, returned by get_token_positions 
			   ngram_length: int = 2, # length of ngram
			   ngram_token_position: str = 'LEFT', # specify if token sequence is on LEFT or RIGHT (support for ngrams with token in middle of sequence is in-development))
			   exclude_punctuation:bool=False # exclude ngrams containing punctuation tokens
			   ) -> np.ndarray: # array of ngrams results
	""" Get ngram data for a token sequence.

	The result is a 2-D array of token ids with `ngram_length` rows (ngram slots in 
	reading order) and one column per retained ngram. Ngrams containing the corpus 
	EOF token are always dropped, so no ngram crosses a document boundary; ngrams 
	containing punctuation tokens are dropped only when `exclude_punctuation` is True.
	Any `ngram_token_position` other than 'RIGHT' is treated as 'LEFT'.
	"""

	start_time = time.time()
	sequence_len = len(token_sequence[0])

	index = 'orth_index' if index_id == ORTH else 'lower_index'

	# Context is always retrieved with punctuation and EOF tokens kept 
	# (exclude_punctuation = False, convert_eof = False); unwanted ngrams are 
	# removed as whole columns below.
	if ngram_token_position == 'RIGHT':
		# token_positions mark where the matched sequence starts, so retrieval is anchored 
		# on the last token of the sequence and walks left; otherwise a multi-token query 
		# would lose its trailing tokens from the ngram. For single-token queries the offset is 0.
		# NOTE(review): assumes position_offset is added to each position before stepping - 
		# confirm against Corpus.get_tokens_in_context.
		ngrams = self.corpus.get_tokens_in_context(token_positions = token_positions, index = index, context_length = ngram_length, position_offset = sequence_len - 1, position_offset_step = -1, exclude_punctuation = False, convert_eof = False)
		ngrams = ngrams[::-1, :] # reversing order as retrieved right to left
	else:
		# 'MIDDLE' (token sequence inside the ngram) is on the roadmap and not implemented yet
		ngrams = self.corpus.get_tokens_in_context(token_positions = token_positions, index = index, context_length = ngram_length, position_offset = 0, position_offset_step = 1, exclude_punctuation = False, convert_eof = False)

	# drop every ngram (column) that contains an EOF token in any slot
	logger.debug(f'Ngrams shape prior to EOF removal {ngrams.shape}')
	crosses_boundary = (ngrams == self.corpus.EOF_TOKEN).any(axis=0)
	ngrams = ngrams[:, ~crosses_boundary]
	logger.debug(f'Ngrams shape after EOF removal {ngrams.shape}')

	if exclude_punctuation:
		# drop every ngram (column) that contains a punctuation token in any slot
		has_punctuation = np.isin(ngrams, self.corpus.punct_tokens).any(axis=0)
		ngrams = ngrams[:, ~has_punctuation]

	logger.info(f'Ngrams ({ngrams.shape[1]}) retrieval time: {(time.time() - start_time):.5f} seconds')
	return ngrams


In [None]:
#| hide
# Check _get_ngrams against the toy corpus for the single token 'dog'.
show_toy_corpus(f'{source_path}toy.csv')
token_str = 'dog'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
# LEFT: 'dog' is the first token of each bigram
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'LEFT')
# rows are ngram slots, columns are the individual ngrams found
assert ngrams.shape[0] == 2
assert ngrams.shape[1] == 3
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['dog', 'dog', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['sat', 'is', 'is'])

# RIGHT: 'dog' is the last token of each bigram, preceded by 'the'
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'RIGHT')
assert ngrams.shape[0] == 2
assert ngrams.shape[1] == 3
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['dog', 'dog', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['the', 'the', 'the'])

source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog


In [None]:
#| hide
# Check _get_ngrams for 'the': punctuation handling and document-boundary (EOF) handling.
token_str = 'the'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
# trigrams with punctuation kept: all 8 occurrences of 'the' yield an ngram, including 'the mat .'
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 3, ngram_token_position = 'LEFT', exclude_punctuation = False)
assert ngrams.shape[0] == 3
assert ngrams.shape[1] == 8
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'])
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['cat', 'mat', 'dog', 'mat', 'cat', 'dog', 'cat', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[2])) == tuple(['sat', '.', 'sat', '.', 'is', 'is', 'is', 'is'])

# trigrams with punctuation excluded: the two 'the mat .' ngrams are removed
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 3, ngram_token_position = 'LEFT', exclude_punctuation = True)
assert ngrams.shape[0] == 3
assert ngrams.shape[1] == 6
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['the', 'the',  'the', 'the', 'the', 'the'])
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['cat', 'dog',  'cat', 'dog', 'cat', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[2])) == tuple(['sat', 'sat', 'is', 'is', 'is', 'is'])

# 4-grams with punctuation kept: the two ngrams starting at 'the mat .' run past the end of their document (EOF token), so they are dropped
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 4, ngram_token_position = 'LEFT', exclude_punctuation = False)
assert ngrams.shape[0] == 4
assert ngrams.shape[1] == 6
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['the', 'the',  'the', 'the', 'the', 'the'])
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['cat', 'dog',  'cat', 'dog', 'cat', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[2])) == tuple(['sat', 'sat', 'is', 'is', 'is', 'is'])
assert tuple(toy.token_ids_to_tokens(ngrams[3])) == tuple(['on', 'on', 'meowing', 'barking', 'climbing', 'digging'])

# multi-token query: the ngram starts with the whole sequence 'the dog'
token_str = 'the dog'
token_sequence, index_id = toy.tokenize(token_str, simple_indexing=True)
token_positions = toy.get_token_positions(token_sequence, index_id)
ngrams = ngrams_toy._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 3, ngram_token_position = 'LEFT', exclude_punctuation = False)
assert ngrams.shape[0] == 3
assert ngrams.shape[1] == 3
assert tuple(toy.token_ids_to_tokens(ngrams[0])) == tuple(['the', 'the',  'the'])
assert tuple(toy.token_ids_to_tokens(ngrams[1])) == tuple(['dog', 'dog', 'dog'])
assert tuple(toy.token_ids_to_tokens(ngrams[2])) == tuple(['sat', 'is', 'is'])

In [None]:
#| hide
# set_logger_state('verbose')
# brown = Corpus().load(path_to_brown_corpus)
# ngrams_brown = Ngrams(brown)
# token_str = 'The'
# token_sequence, index_id = brown.tokenize(token_str, simple_indexing=True)
# token_positions = brown.get_token_positions(token_sequence, index_id)
# ngrams = ngrams_brown._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 3, ngram_token_position = 'RIGHT', exclude_punctuation = False)
# print(ngrams.shape)
# # find any ngrams[0] that contain brown.space_tokens
# from_x = 0
# to_x = 100
# df = pl.DataFrame([brown.token_ids_to_tokens(ngrams[0]),
#     brown.token_ids_to_tokens(ngrams[1]), brown.token_ids_to_tokens(ngrams[2])])
# display(df.with_row_index('row').filter((pl.col('column_1') == '\n') & 
#     (pl.col('row') >= from_x) & (pl.col('row') <= to_x)
# ).head(20))

# ngrams = ngrams_brown._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'RIGHT', exclude_punctuation = False)
# print(ngrams.shape)
# from_x = 0
# to_x = 100
# df = pl.DataFrame([brown.token_ids_to_tokens(ngrams[0]),
#     brown.token_ids_to_tokens(ngrams[1])])
# display(df.with_row_index('row').filter( 
#     (pl.col('row') == 37) | (pl.col('row') == 78) | (pl.col('row') == 94)).head(20))
# set_logger_state('quiet')

In [None]:
#| hide
# token_str = 'ocean'
# token_sequence, index_id = reuters.tokenize(token_str, simple_indexing=True)
# token_positions = reuters.get_token_positions(token_sequence, index_id)
# ngrams = ngrams_reuters._get_ngrams(token_sequence, index_id, token_positions, ngram_length = 2, ngram_token_position = 'LEFT')
# print(ngrams)

In [None]:
#| export
@patch
def ngrams(self: Ngrams, 
		   token_str: str, # token string to get ngrams for 
		   ngram_length:int = 2, # length of ngram
		   ngram_token_position: str = 'LEFT', # specify if token sequence is on LEFT or RIGHT (support for ngrams with token in middle of sequence is in-development))
		   normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
		   page_size:int = PAGE_SIZE, # number of results to display per results page (0 returns all results on one page)
		   page_current:int = 1, # current page of results
		   show_all_columns:bool = False, # return raw df with all columns or just ngram and frequency
		   exclude_punctuation:bool=True, # do not return ngrams with punctuation tokens
		   use_cache:bool = True # retrieve the results from cache if available (currently ignored)
		   ) -> Result: # return a Result object with ngram data
	""" Report ngram frequencies containing a token string.

	Raises ValueError when `normalize_by` is not an integer. When the token string 
	does not occur in the corpus, a Result with an empty DataFrame is returned.
	"""

	# bool is a subclass of int, so it is rejected explicitly to keep the check strict
	if isinstance(normalize_by, bool) or not isinstance(normalize_by, int):
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	token_sequence, index_id = self.corpus.tokenize(token_str, simple_indexing=True)

	start_time = time.time()
	# Cache reads are disabled on purpose: cache_id does not yet cover every option that 
	# changes the result (e.g. exclude_punctuation), so a cached report could be wrong.
	use_cache = False
	cache_id = tuple(['ngram'] + list(token_sequence) + [ngram_length, ngram_token_position])

	if use_cache and cache_id in self.corpus.results_cache:
		logger.info('Using cached ngrams results')
		ngrams_report, total_unique, total_count = self.corpus.results_cache[cache_id][:3]
	else:
		token_positions = self.corpus.get_token_positions(token_sequence, index_id)
		
		if len(token_positions[0]) == 0:
			logger.info('No tokens found')
			return Result(type = 'ngrams', df=pl.DataFrame(), title=f'Ngrams for "{token_str}"', description=f'No matches', summary_data={}, formatted_data=[])

		logger.info('Generating ngrams results')
		ngram_ids = self._get_ngrams(token_sequence, index_id, token_positions, ngram_length = ngram_length, ngram_token_position = ngram_token_position, exclude_punctuation=exclude_punctuation)
		total_count = ngram_ids.shape[1] # one column per ngram occurrence
		schema = [f'token_{i+1}' for i in range(ngram_length)]
		# one struct per ngram occurrence, counted and sorted by frequency
		ngrams_report = pl.DataFrame(ngram_ids.T, schema=schema).to_struct(name = 'ngram_token_ids').value_counts(sort=True).rename({"count": "frequency"})
		ngrams_report = ngrams_report.with_row_index(name='rank', offset=1)
		total_unique = len(ngrams_report)
		self.corpus.results_cache[cache_id] = (ngrams_report, total_unique, total_count)
	
	count_tokens, tokens_descriptor, _ = self.corpus.get_token_count_text(exclude_punctuation)

	# page_size of 0 is treated as "no paging": everything is returned as a single page 
	# (this previously raised ZeroDivisionError when computing the page count)
	if page_size != 0:
		resultset_start = page_size*(page_current-1)
		ngrams_report_page = ngrams_report.slice(resultset_start, page_size)
		total_pages = math.ceil(total_unique/page_size)
	else:
		ngrams_report_page = ngrams_report
		total_pages = 1

	ngrams_report_page = ngrams_report_page.unnest('ngram_token_ids')
	ngrams_report_page = ngrams_report_page.with_columns(((pl.col("frequency") / pl.lit(count_tokens)) * normalize_by).alias('normalized_frequency'))

	# convert each token id column to strings, then join across columns to get the ngram text per row
	token_strs = np.array([self.corpus.token_ids_to_tokens(ngrams_report_page[f'token_{i+1}'].to_numpy()) for i in range(ngram_length)])
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text))

	summary_data = {'ngram_length': ngram_length, 'ngram_token_position': ngram_token_position, 'total_unique': total_unique, 'total_count': total_count, 'page_current': page_current, 'total_pages': total_pages}
	formatted_data = [f'Report based on {tokens_descriptor}', f'Ngram length: {ngram_length}, Token position: {ngram_token_position.lower()}']

	if exclude_punctuation:
		formatted_data.append(f'Ngrams containing punctuation tokens excluded')

	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	formatted_data.extend([f'Total unique ngrams: {total_unique:,}', f'Total ngrams: {total_count:,}'])

	# rows of the report are unique ngrams, so paging info depends on total_unique 
	# (not total_count) and on the number of rows actually on this page
	if page_size != 0 and total_unique > page_size:
		formatted_data.extend([f'Showing {len(ngrams_report_page)} rows', f'Page {page_current} of {total_pages}']) 

	if not show_all_columns:
		ngrams_report_page = ngrams_report_page[['rank', 'ngram', 'frequency', 'normalized_frequency']]
	
	logger.info(f'Ngrams report time: {(time.time() - start_time):.5f} seconds')

	return Result(type = 'ngrams', df=ngrams_report_page, title=f'Ngrams for "{token_str}"', description=f'{self.corpus.name}', summary_data=summary_data, formatted_data=formatted_data)


In [None]:
#| hide
# Check the ngrams report on the toy corpus: bigrams starting with 'the' (default options, punctuation excluded).
show_toy_corpus(f'{source_path}toy.csv')
token_str = 'the'
test_result = ngrams_toy.ngrams(token_str).to_frame()
assert test_result.filter(pl.col('ngram') == 'the cat').select('frequency').item() == 3
assert test_result.filter(pl.col('ngram') == 'the mat').select('frequency').item() == 2
assert test_result.filter(pl.col('ngram') == 'the dog').select('frequency').item() == 3
# exactly three distinct bigrams are reported
assert test_result.select(pl.len()).item() == 3

source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog


In [None]:
# load the corpus
reuters = Corpus().load(path_to_reuters_corpus)

# instantiate the Ngrams class
ngrams_reuters = Ngrams(reuters)

In [None]:
# run the ngrams method and display the results
ngrams_reuters.ngrams('environmental', ngram_length = 2, ngram_token_position = 'LEFT').display()

"Ngrams for ""environmental""","Ngrams for ""environmental""","Ngrams for ""environmental""","Ngrams for ""environmental"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,environmental protection,4,0.03
2,environmental systems,4,0.03
3,environmental services,3,0.02
4,environmental damage,2,0.01
5,environmental regulations,2,0.01
6,environmental impact,2,0.01
7,environmental controls,1,0.01
8,environmental approval,1,0.01
9,environmental and,1,0.01
10,environmental sciences,1,0.01


In [None]:
# run the ngrams method and display the results
ngrams_reuters.ngrams('the highest', ngram_length = 3, ngram_token_position = 'LEFT', page_size = 10).display()

"Ngrams for ""the highest""","Ngrams for ""the highest""","Ngrams for ""the highest""","Ngrams for ""the highest"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the highest since,8,0.06
2,the highest level,4,0.03
3,the highest in,3,0.02
4,the highest rate,2,0.01
5,the highest interest,2,0.01
6,the highest priority,2,0.01
7,the highest number,2,0.01
8,the highest agriculture,2,0.01
9,the highest such,2,0.01
10,the highest positive,2,0.01


In [None]:
#| export
@patch
def ngram_frequencies(self: Ngrams, 
				ngram_length:int=2, # length of ngram
				case_sensitive:bool=False, # frequencies for tokens lowercased or with case preserved
				normalize_by:int=10000, # normalize frequencies by a number (e.g. 10000)
				page_size:int=PAGE_SIZE, # number of rows to return (0 returns all rows on one page)
				page_current:int=1, # current page
				exclude_punctuation:bool=True # exclude ngrams containing punctuation tokens
				) -> Result: # return a Result object with the frequency table
	""" Report frequent ngrams.

	Ngrams containing the corpus EOF token are always excluded, so no ngram crosses 
	a document boundary. Ngrams with equal frequency are ordered by their token ids 
	so ranks and pages are stable between calls. Raises ValueError when 
	`normalize_by` is not an integer.
	"""
	
	# bool is a subclass of int, so it is rejected explicitly to keep the check strict
	if isinstance(normalize_by, bool) or not isinstance(normalize_by, int):
		raise ValueError('normalize_by must be an integer, e.g. 1000000 or 10000')

	start_time = time.time()

	index = 'orth_index' if case_sensitive else 'lower_index'

	# token ids that disqualify an ngram when they appear in any slot
	excluded_tokens = [self.corpus.EOF_TOKEN]
	if exclude_punctuation:
		excluded_tokens.extend(self.corpus.punct_tokens)

	count_tokens, tokens_descriptor, _ = self.corpus.get_token_count_text(exclude_punctuation)
	formatted_data = [f'Report based on {tokens_descriptor}']
	formatted_data.append(f'Ngram length: {ngram_length}')

	if exclude_punctuation:
		formatted_data.append(f'Ngrams containing punctuation tokens excluded')

	schema = [f'token_{i+1}' for i in range(ngram_length)]

	# build one row per corpus position: token_1 is the token at that position and 
	# token_k is the token k-1 positions later (null once shifted past the end of the corpus)
	ngrams_report = self.corpus.tokens.select(pl.col(index).alias('token_1'))
	ngrams_report = ngrams_report.with_columns([pl.col('token_1').shift(-i).alias(f'token_{i+1}') for i in range(1, ngram_length)])

	# filter before grouping so excluded ngrams are never aggregated; the filters only 
	# touch the group keys, so this gives the same groups as filtering afterwards. 
	# Rows with a null slot do not pass the filter either.
	for column in schema:
		ngrams_report = ngrams_report.filter(~pl.col(column).is_in(excluded_tokens))

	ngrams_report = ngrams_report.group_by(schema).agg(pl.len().alias("frequency"))

	# totals in a single query, taken before sorting as they do not depend on row order
	totals = ngrams_report.select(pl.len().alias('total_unique'), pl.col('frequency').sum().alias('total_count')).collect()
	total_unique, total_count = totals.row(0)

	# group_by does not guarantee group order, so token ids are used as tie-breakers; 
	# sorting on frequency alone could rank tied ngrams differently on every call
	ngrams_report = ngrams_report.sort(by=['frequency'] + schema, descending=[True] + [False] * ngram_length)

	# page_size of 0 is treated as "no paging": everything is returned as a single page 
	# (this previously raised ZeroDivisionError when computing the page count)
	if page_size != 0:
		resultset_start = page_size*(page_current-1)
		ngrams_report = ngrams_report.slice(resultset_start, page_size)
		total_pages = math.ceil(total_unique/page_size)
	else:
		resultset_start = 0
		total_pages = 1

	ngrams_report_page = ngrams_report.collect(engine = 'streaming')
	logger.info(f'collected report page: {(time.time() - start_time):.5f} seconds')

	# convert each token id column to strings, then join across columns to get the ngram text per row
	token_strs = np.array([self.corpus.token_ids_to_tokens(ngrams_report_page[f'token_{i+1}'].to_numpy()) for i in range(ngram_length)])
	ngram_text = [' '.join(column) for column in token_strs.T]
	ngrams_report_page = ngrams_report_page.with_columns(pl.Series(name="ngram", values=ngram_text)).with_row_index(name='rank', offset=resultset_start+1)
	ngrams_report_page = ngrams_report_page.with_columns(((pl.col("frequency") / pl.lit(count_tokens)) * normalize_by).alias('normalized_frequency'))
	formatted_data.append(f'Normalized Frequency is per {normalize_by:,.0f} tokens')

	formatted_data.extend([f'Total unique ngrams: {total_unique:,}', f'Total ngrams: {total_count:,}'])

	# rows of the report are unique ngrams, so paging info depends on total_unique 
	# (not total_count) and on the number of rows actually on this page
	if page_size != 0 and total_unique > page_size:
		formatted_data.extend([f'Showing {len(ngrams_report_page)} rows', f'Page {page_current} of {total_pages}']) 

	ngrams_report_page = ngrams_report_page[['rank', 'ngram', 'frequency', 'normalized_frequency']]

	return Result(type = 'ngram_frequencies', df=ngrams_report_page, title=f'Ngram Frequencies', description=f'{self.corpus.name}', summary_data = {}, formatted_data = formatted_data)

In [None]:
#| hide
# ngrams_report = brown.tokens.select(pl.col('lower_index').alias('token_1')).with_row_index('position').collect()
# ngrams_report = ngrams_report.join(brown.spaces.collect().select(pl.col('position').alias('space_position'), pl.lit(True).alias('is_space')), left_on='position', right_on='space_position', how='left', maintain_order='left')
# print(ngrams_report.filter(pl.col('position')>= 100).head(10))
# ngrams_report = ngrams_report.filter(pl.col('is_space').is_null())
# print(ngrams_report.shape)
# print(ngrams_report.filter(pl.col('position')>= 99).head(10))

In [None]:
#| hide
# Check total bigram counts on the toy corpus; page_size = 100 keeps every row on one page so the sum covers all bigrams.
show_toy_corpus(f'{source_path}toy.csv')
# punctuation kept: 32 bigrams in total
test_result = ngrams_toy.ngram_frequencies(ngram_length = 2, case_sensitive = False, exclude_punctuation = False, page_current = 1, page_size = 100).to_frame()
assert test_result.select(pl.col('frequency').sum()).item() == 32
# punctuation excluded: the bigram ending in '.' is lost from each of the six documents, leaving 26
test_result = ngrams_toy.ngram_frequencies(ngram_length = 2, case_sensitive = False, exclude_punctuation = True, page_current = 1, page_size = 100).to_frame()
assert test_result.select(pl.col('frequency').sum()).item() == 26

source,text,category,species
1.txt,The cat sat on the mat.,feline,cat
2.txt,The dog sat on the mat.,canine,dog
3.txt,The cat is meowing.,feline,cat
4.txt,The dog is barking.,canine,dog
5.txt,The cat is climbing a tree.,feline,cat
6.txt,The dog is digging a hole.,canine,dog


In [None]:
#| hide
ngrams_reuters.ngrams('the company', ngram_length = 3).display()

"Ngrams for ""the company""","Ngrams for ""the company""","Ngrams for ""the company""","Ngrams for ""the company"""
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,the company 's,518,3.70
3,the company also,107,0.76
4,the company has,69,0.49
5,the company to,69,0.49
6,the company is,59,0.42
7,the company reported,51,0.36
8,the company will,49,0.35
9,the company had,47,0.34
10,the company was,46,0.33


In [None]:
ngrams_reuters.ngram_frequencies(ngram_length = 3, case_sensitive = False).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,the company said,1173,8.39
2,mln dlrs in,795,5.68
3,cts vs loss,665,4.75
4,said it has,636,4.55
5,mln avg shrs,620,4.43
6,pct of the,608,4.35
7,the united states,603,4.31
8,qtr net shr,574,4.10
9,dlrs a share,546,3.90
10,inc said it,523,3.74


In [None]:
ngrams_reuters.ngram_frequencies(ngram_length = 3, case_sensitive = True).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Reuters Corpus,Reuters Corpus,Reuters Corpus,Reuters Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,The company said,747,5.34
2,mln dlrs in,726,5.19
3,cts vs loss,645,4.61
4,said it has,632,4.52
5,mln Avg shrs,615,4.40
6,pct of the,608,4.35
7,QTR NET Shr,559,4.00
8,the United States,524,3.75
9,dlrs a share,519,3.71
10,Inc said it,514,3.67


In [None]:
#| hide
# congress = Corpus().load(f'{save_path}us-congressional-speeches-subset-100k.corpus')
# report_congress = Ngrams(congress)
# congress._init_token_arrays()

In [None]:
#| hide
# %time report_congress.ngram_frequencies(ngram_length = 2, case_sensitive = False, page_current = 1).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k
Rank,Ngram,Frequency,Normalized Frequency
1,of the,227943,126.49
2,in the,114241,63.39
3,to the,92967,51.59
4,it is,51659,28.67
5,that the,51620,28.64
6,for the,46516,25.81
7,on the,43924,24.37
8,and the,43053,23.89
9,by the,40236,22.33
10,the senator,37269,20.68


CPU times: user 26 s, sys: 4.12 s, total: 30.1 s
Wall time: 785 ms


In [None]:
#| hide
# %time report_congress.ngrams('senator', ngram_length = 2, ngram_token_position = 'RIGHT', page_size = 10).display()

"Ngrams for ""senator""","Ngrams for ""senator""","Ngrams for ""senator""","Ngrams for ""senator"""
US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k,US Congressional Speeches Subset 100k
Rank,Ngram,Frequency,Normalized Frequency
1,the senator,37269,20.68
2,distinguished senator,2430,1.35
3,senior senator,1099,0.61
4,junior senator,899,0.50
5,a senator,578,0.32
6,of senator,483,0.27
7,and senator,474,0.26
8,by senator,452,0.25
9,that senator,366,0.20
10,able senator,351,0.19


CPU times: user 287 ms, sys: 284 ms, total: 572 ms
Wall time: 490 ms


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()