# Performance

This page reports timing results for conc methods on corpora of different sizes.

In [None]:
#| hide
%load_ext memory_profiler

In [None]:
#| hide 
%load_ext line_profiler

In [None]:
#| hide
# %load_ext memray


In [None]:
#| hide
import os

In [None]:
from conc.core import logger, set_logger_state
from conc.corpus import Corpus
from conc.report import Report

In [None]:
# Put conc's logger into its 'quiet' state before the setup cells run.
set_logger_state('quiet')

In [None]:
#| hide
# Base directories for the benchmark data.
# `os.path.expanduser('~')` is used instead of reading $HOME directly, because
# `os.environ.get("HOME")` returns None when HOME is unset (common on Windows),
# which would silently produce the literal path 'None/data/'.
home_dir = os.path.expanduser('~')
source_path = f'{home_dir}/data/'  # where the `<slug>.csv.gz` source files are read from
save_path = f'{home_dir}/data/conc-test-corpora/'  # where the built `<slug>.corpus` directories live

In [None]:
#| eval: false
# Corpora used for the benchmarks: slug -> display name.
# The slug is also the basename of the source file (`{source_path}{slug}.csv.gz`).
test_corpora = {
    'us-congressional-speeches-subset-10k': 'US Congressional Speeches Subset 10k',
    'us-congressional-speeches-subset-100k': 'US Congressional Speeches Subset 100k',
    #'us-congressional-speeches-subset-500k': 'US Congressional Speeches Subset 500k'
}
# Display names in the same order. The benchmark cells further down iterate
# `test_corpora_names` and index `corpora[name]`, so define it next to its source.
test_corpora_names = list(test_corpora.values())

corpora = {}
for slug, name in test_corpora.items():
    # Verbose logging only while building, so the memory-usage log lines are shown.
    set_logger_state('verbose')
    try:
        logger.info(f'Starting {name} build ...')
        corpora[slug] = Corpus(name).build_from_csv(f'{source_path}{slug}.csv.gz', save_path = save_path, text_column='text', metadata_columns = ['speech_id', 'date', 'speaker', 'chamber', 'state'], build_process_cleanup = False)
        # The built object is dropped straight away; corpora are re-loaded from
        # `save_path` in a later cell.
        del corpora[slug]
    finally:
        # Restore the quiet state even when a build fails. Any exception still
        # propagates, so the former `except Exception as e: raise e` was a no-op.
        set_logger_state('quiet')


2025-06-03 16:43:00 - INFO - <module> - Starting US Congressional Speeches Subset 10k build ...
2025-06-03 16:43:01 - INFO - memory_usage - init, memory usage: 230.69140625 MB
2025-06-03 16:43:08 - INFO - memory_usage - processed 5000 documents, memory usage: 315.48828125 MB, difference: 84.796875 MB
2025-06-03 16:43:14 - INFO - memory_usage - processed 10000 documents, memory usage: 332.828125 MB, difference: 17.33984375 MB
2025-06-03 16:43:14 - INFO - memory_usage - Completing build process, memory usage: 332.828125 MB, difference: 0.0 MB
2025-06-03 16:43:14 - INFO - memory_usage - init, memory usage: 332.828125 MB
2025-06-03 16:43:14 - INFO - memory_usage - got input length 1975172, memory usage: 334.92578125 MB, difference: 2.09765625 MB
2025-06-03 16:43:14 - INFO - memory_usage - collected vocab, memory usage: 335.05078125 MB, difference: 0.125 MB
2025-06-03 16:43:14 - INFO - memory_usage - freed up combined_df and input_df, memory usage: 335.05078125 MB, difference: 0.0 MB
2025-0

In [None]:
import polars as pl

In [None]:
# Show the corpus stored under `slug` (the loop variable left over from the build cell).
# NOTE(review): the build loop deletes `corpora[slug]` right after each build, so this
# lookup relies on the entry having been re-created some other way — confirm.
corpora[slug]

<conc.corpus.Corpus>

In [None]:
# Tokenize the single word 'freak'; the id in the displayed result (50562) is what the
# following cells filter on.
corpora[slug].tokenize('freak', simple_indexing = True)

([(np.int64(50562),)], 66)

In [None]:
# First 21 rows of the corpus token table whose `lower_index` is 50562, with each row's
# position added as a `position` column.
corpora[slug].tokens.with_row_index('position').filter(pl.col('lower_index') == 50562).head(21).collect()

position,orth_index,lower_index,token2doc_index
2324950,50562,50562,51507
2325012,50562,50562,51507
2838852,50562,50562,54017
2838883,50562,50562,54017
9726312,50562,50562,88273
10853770,50562,50562,93884
13981508,50562,50562,19340
16058463,50562,50562,29626


In [None]:
# Lazily scan the build_*.parquet files inside the 100k corpus directory; queried further
# down to inspect the ids as they appear in those files.
build_df = pl.scan_parquet(f'{save_path}us-congressional-speeches-subset-100k.corpus/build_*.parquet',)

In [None]:
# Re-run the reindexing of the build files by hand and check that the rows for one token
# (lower_index 50562) look the same before and after writing the result to parquet.
input_df = pl.scan_parquet(f'{save_path}us-congressional-speeches-subset-100k.corpus/build_*.parquet',)
# combining indexes to reindex
# All orth_index values followed by all lower_index values, stacked into one 'index' column.
combined_df = pl.concat([input_df.select(pl.col('orth_index').alias('index')), input_df.select(pl.col('lower_index').alias('index'))])

# Row count of the input; used below to split the stacked column back into its two halves.
input_length = input_df.select(pl.len()).collect(engine='streaming').item() # tested vs count - len seems to have slight memory overhead, but more correct (i.e. count only counts non-null)
logger.memory_usage(f'got input length {input_length}')

# get unique vocab ids (combining orth and lower) and create new index
# `source_id` is the original id; `token_id` is its 1-based rank among the sorted unique ids.
vocab_df  = combined_df.select(pl.col('index').unique().sort().alias('source_id')).with_row_index('token_id', offset=1) #.collect(engine='streaming')
logger.memory_usage('collected vocab')

# Earlier replace()-based remapping, kept for reference:
# combined_df = (combined_df.with_columns(pl.col('index').replace(vocab_df.select(pl.col('source_id'))['source_id'], vocab_df.select(pl.col('token_id'))['token_id']).cast(pl.UInt32)))
# combined_df = combined_df.with_columns(pl.col('index').cast(pl.UInt32))

# Swap every original id for its new token_id via a left join. maintain_order="left" is
# requested because the slices below depend on combined_df keeping its row order.
combined_df = (
    combined_df
    .join(vocab_df, left_on="index", right_on="source_id", how="left", maintain_order="left")
    .drop("index")
    .rename({"token_id": "index"})
    .with_columns(pl.col("index").cast(pl.UInt32).alias("index"))
)

# First `input_length` rows -> orth_index, remaining rows -> lower_index, placed side by
# side with the untouched token2doc_index column.
tokens_df = pl.concat(
                                [combined_df.select(pl.col('index').alias('orth_index')).slice(0, input_length), 
                                combined_df.select(pl.col('index').alias('lower_index')).slice(input_length),
                                input_df.select(pl.col('token2doc_index'))], how='horizontal'
                        )
# The same filter is displayed before and after the write so the two outputs can be compared.
display(tokens_df.filter(pl.col('lower_index') == 50562).head(21).collect())
tokens_df.collect().write_parquet(f'{save_path}us-congressional-speeches-subset-100k.corpus/tokens-test.parquet') #, maintain_order = True
display(tokens_df.filter(pl.col('lower_index') == 50562).head(21).collect())

orth_index,lower_index,token2doc_index
50562,50562,51507
50562,50562,51507
50562,50562,54017
50562,50562,54017
50562,50562,88273
50562,50562,93884
50562,50562,19340
50562,50562,29626


orth_index,lower_index,token2doc_index
50562,50562,51507
50562,50562,51507
50562,50562,54017
50562,50562,54017
50562,50562,88273
50562,50562,93884
50562,50562,19340
50562,50562,29626


In [None]:
# Read tokens-test.parquet back eagerly and show the rows for lower_index 50562 again,
# this time from the file on disk.
tokens_df = pl.read_parquet(f'{save_path}us-congressional-speeches-subset-100k.corpus/tokens-test.parquet',)
display(tokens_df.filter(pl.col('lower_index') == 50562).head(21))

orth_index,lower_index,token2doc_index
50562,50562,51507
50562,50562,51507
50562,50562,54017
50562,50562,54017
50562,50562,88273
50562,50562,93884
50562,50562,19340
50562,50562,29626


In [None]:
# Query the build files directly by the id they store for this token
# (3046352667721326977), i.e. the value before it is mapped to a 1-based token_id.
build_df.filter(pl.col('lower_index') == 3046352667721326977).head(100).collect()

orth_index,lower_index,token2doc_index
3046352667721326977,3046352667721326977,51507
3046352667721326977,3046352667721326977,51507
3046352667721326977,3046352667721326977,54017
3046352667721326977,3046352667721326977,54017
3046352667721326977,3046352667721326977,88273
3046352667721326977,3046352667721326977,93884
3046352667721326977,3046352667721326977,19340
3046352667721326977,3046352667721326977,29626


In [None]:
# Resolve that id back to its text through the corpus's `_nlp.vocab`, to confirm which
# token the rows above belong to.
corpora[slug]._nlp.vocab[3046352667721326977].text

'freak'

In [None]:
#| eval: false
# Load every test corpus from `{save_path}{slug}.corpus` and show its summary.
# Note that the loaded corpora are stored under their display name (`name`), not the slug.
for slug, name in test_corpora.items():
	try:
		corpora[name] = Corpus().load(f'{save_path}{slug}.corpus')
		corpora[name].summary()
	except FileNotFoundError:
		# Best effort: report the missing corpus and carry on with the next one.
		# The logger is made verbose only for this message.
		set_logger_state('verbose')
		logger.error(f'Corpus {name} not found in {save_path}.')
		set_logger_state('quiet')
	

Corpus Summary,Corpus Summary
Attribute,Value
Name,US Congressional Speeches Subset 10k
Description,
Date Created,2025-05-30 00:53:09
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/us-congressional-speeches-subset-10k.corpus
Document Count,10000
Token Count,1964972
Word Token Count,1767904
Unique Tokens,50641
Unique Word Tokens,50520


Corpus Summary,Corpus Summary
Attribute,Value
Name,US Congressional Speeches Subset 100k
Description,
Date Created,2025-05-30 00:53:41
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/us-congressional-speeches-subset-100k.corpus
Document Count,100000
Token Count,20027241
Word Token Count,18020769
Unique Tokens,214503
Unique Word Tokens,214175


In [None]:
# as a loop
# for slug, name in test_corpora.items():
#     %timeit corpora[slug] = Corpus().load(f'{save_path}{slug}.corpus')# 

In [None]:
# One Report per corpus, stored under the same key that is used to look the corpus up.
report = {}
for name in test_corpora_names:
    report[name] = Report(corpora[name])

In [None]:
# Time `frequencies` on each corpus's Report; one %timeit result line is printed per corpus.
for name in test_corpora_names:
    %timeit report[name].frequencies(normalize_by=10000, page_size=10)

4.56 ms ± 376 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
13.9 ms ± 169 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
19.4 ms ± 183 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
from conc.frequency import Frequency

In [None]:
# Rebuild `report` around Frequency objects, keyed by slug for the timing cell that follows.
# The load cell stores each corpus under its display name and the build cell deletes the
# slug-keyed entries, so the corpus has to be looked up by `name`; `corpora[slug]` raises
# KeyError on a fresh run.
# NOTE(review): this rebinds `report`, which the ngram/concordance cells further down
# index by name — confirm whether those cells should run against a separate variable.
report = {}
for slug, name in test_corpora.items():
    report[slug] = Frequency(corpora[name])

In [None]:
# Make sure the logger is in its 'quiet' state before timing.
set_logger_state('quiet')

In [None]:
# Time `frequencies` on each Frequency object, with the same arguments as the Report timing.
for slug, name in test_corpora.items():
    %timeit report[slug].frequencies(normalize_by=10000, page_size=10)

7.95 ms ± 155 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
22.1 ms ± 394 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
34.9 ms ± 340 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Bigram (`ngram_length = 2`) lookup for 'dog' with `use_cache = False` passed on every timed call.
token_str = 'dog'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


714 μs ± 32.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
11.2 ms ± 348 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
23.6 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Same 'dog' bigram lookup with `use_cache = True`. One untimed call is made first so that
# every call measured by %timeit follows an identical earlier call.
token_str = 'dog'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.4 μs ± 544 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
96.8 μs ± 405 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
97.1 μs ± 383 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Bigram lookup for 'the' with `use_cache = False`; otherwise identical to the 'dog' run.
token_str = 'the'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


7.44 ms ± 121 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
84.5 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
176 ms ± 902 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# 'the' bigram lookup with `use_cache = True`, preceded by one untimed warm-up call per corpus.
token_str = 'the'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.1 μs ± 545 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
89.4 μs ± 1.49 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
87.3 μs ± 214 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Concordance for 'dog' (`context_words = 5`, `order='1L2L3L'`) with `use_cache = False`.
use_cache = False
token_str = 'dog'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)

2.99 ms ± 241 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14 ms ± 401 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
25.9 ms ± 697 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# 'dog' concordance with `use_cache = True`, preceded by one untimed warm-up call per corpus.
use_cache = True
token_str = 'dog'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


1.98 ms ± 32 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.26 ms ± 158 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.5 ms ± 19 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Concordance for 'the' with `use_cache = False`; otherwise identical to the 'dog' run.
use_cache = False
token_str = 'the'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


16.4 ms ± 251 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
102 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
214 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# 'the' concordance with `use_cache = True`, preceded by one untimed warm-up call per corpus.
use_cache = True
token_str = 'the'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


2.42 ms ± 59.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.35 ms ± 148 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.23 ms ± 157 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# reclaim space!
# Saved corpora live at `{save_path}{slug}.corpus` (the path the load cell reads), and each
# one is a directory (the build_*.parquet files sit inside it). `os.remove` raises OSError
# on a directory, so the whole tree is removed instead; a plain file at that path is still
# deleted with `os.remove`.
import shutil

for slug in test_corpora:
    corpus_path = f'{save_path}{slug}.corpus'
    if os.path.isdir(corpus_path):
        shutil.rmtree(corpus_path)
    elif os.path.exists(corpus_path):
        os.remove(corpus_path)