# Performance

This page reports timing results for conc methods on corpora of different sizes.

In [1]:
#| hide
%load_ext memory_profiler

In [2]:
#| hide 
%load_ext line_profiler

In [3]:
#| hide
# %load_ext memray


In [4]:
#| hide
import os

In [5]:
from conc.core import logger, set_logger_state
from conc.corpus import Corpus
from conc.conc import Conc

In [6]:
# Put conc's logger into its 'quiet' state before benchmarking
# (presumably this suppresses INFO-level log lines — confirm in conc.core).
set_logger_state('quiet')

In [7]:
#| hide
# Directory holding the source `.csv.gz` files. `os.path.expanduser('~')`
# is used instead of reading $HOME directly so an unset HOME does not
# silently produce a literal 'None/data/' path.
# The trailing slash is required: later cells append file names directly.
source_path = f'{os.path.expanduser("~")}/data/'
# Directory the built test corpora are written to (and later removed from).
save_path = f'{source_path}conc-test-corpora/'

In [None]:
#| eval: false
# Slug -> display name for every test corpus to build. The slug is also the
# stem of the source file read below (`{source_path}{slug}.csv.gz`).
test_corpora = {
    'us-congressional-speeches-subset-10k': 'US Congressional Speeches Subset 10k',
    'us-congressional-speeches-subset-100k': 'US Congressional Speeches Subset 100k',
    #'us-congressional-speeches-subset-500k': 'US Congressional Speeches Subset 500k'
}

# The description is the same for every corpus, so it is built once.
description = '1 million speeches sampled from https://huggingface.co/datasets/Eugleo/us-congressional-speeches-subset to create corpora of varying sizes for development and testing. The dataset card at Huggingface is empty, so there is no further information available on the contents. The title indicates how many speeches are included in this corpus. '

corpora = {}
# Log verbosely while building; the `finally` guarantees the logger is put
# back to 'quiet' even when a build raises (the exception still propagates).
set_logger_state('verbose')
try:
    for slug, name in test_corpora.items():
        logger.info(f'Starting {name} build ...')
        corpora[slug] = Corpus(name = name, description = description).build_from_csv(
            f'{source_path}{slug}.csv.gz',
            save_path = save_path,
            text_column = 'text',
            metadata_columns = ['speech_id', 'date', 'speaker', 'chamber', 'state'],
            build_process_cleanup = False,
        )
        # Drop the reference straight away so the built corpus is not kept
        # in memory while the next one is built.
        # NOTE(review): this leaves `corpora` empty after the loop, yet a
        # later cell indexes `corpora[slug]` — confirm the corpora are
        # re-loaded somewhere before that cell runs.
        del corpora[slug]
finally:
    set_logger_state('quiet')


2025-06-04 14:52:00 - INFO - <module> - Starting US Congressional Speeches Subset 10k build ...
2025-06-04 14:52:00 - INFO - memory_usage - init, memory usage: 234.1484375 MB
2025-06-04 14:52:02 - INFO - memory_usage - processed 5000 documents, memory usage: 553.34765625 MB, difference: 319.19921875 MB
2025-06-04 14:52:03 - INFO - memory_usage - processed 10000 documents, memory usage: 568.4765625 MB, difference: 15.12890625 MB
2025-06-04 14:52:03 - INFO - memory_usage - Completing build process, memory usage: 568.4765625 MB, difference: 0.0 MB
2025-06-04 14:52:03 - INFO - memory_usage - init, memory usage: 568.4765625 MB
2025-06-04 14:52:03 - INFO - memory_usage - got input length 1975172, memory usage: 574.7578125 MB, difference: 6.28125 MB
2025-06-04 14:52:03 - INFO - memory_usage - collected vocab, memory usage: 574.7578125 MB, difference: 0.0 MB
2025-06-04 14:52:03 - INFO - memory_usage - freed up combined_df and input_df, memory usage: 574.7578125 MB, difference: 0.0 MB
2025-06-0

In [None]:
# Time `frequencies(normalize_by=10000, page_size=10)` once per corpus.
# NOTE(review): neither `test_corpora_names` nor `report` is defined by any
# cell above this one in the visible notebook, so on a fresh kernel this cell
# would raise NameError unless they come from elsewhere — confirm, or rely on
# the slug-keyed version of this benchmark further down.
for name in test_corpora_names:
    %timeit report[name].frequencies(normalize_by=10000, page_size=10)

4.56 ms ± 376 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
13.9 ms ± 169 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
19.4 ms ± 183 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
from conc.frequency import Frequency

In [None]:
# One Frequency report per corpus, keyed by the corpus slug.
# NOTE(review): assumes `corpora` still holds an entry for every slug in
# `test_corpora` at this point — verify against the build cell.
report = {slug: Frequency(corpora[slug]) for slug in test_corpora}

In [None]:
# Make sure the logger is in its 'quiet' state before the timing runs below.
set_logger_state('quiet')

In [None]:
# Time `frequencies(normalize_by=10000, page_size=10)` for every slug in
# `test_corpora`, using the reports stored in `report`.
# `name` is unpacked but not used by the timed statement.
for slug, name in test_corpora.items():
    %timeit report[slug].frequencies(normalize_by=10000, page_size=10)

7.95 ms ± 155 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
22.1 ms ± 394 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
34.9 ms ± 340 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Time 2-gram retrieval for the token 'dog' with `use_cache = False`.
# NOTE(review): `report` is populated with `Frequency` objects keyed by slug in
# an earlier cell, while this cell indexes it with entries of
# `test_corpora_names` (not defined in the visible notebook) and calls
# `.ngrams` — confirm which object type `report[name]` is expected to be.
token_str = 'dog'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


714 μs ± 32.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
11.2 ms ± 348 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
23.6 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time 2-gram retrieval for the token 'dog' with `use_cache = True`.
# The same call is issued once, untimed, before `%timeit` as a warm-up, so the
# timed runs presumably measure cache hits only — TODO confirm.
token_str = 'dog'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.4 μs ± 544 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
96.8 μs ± 405 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
97.1 μs ± 383 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Time 2-gram retrieval for the token 'the' with `use_cache = False`.
# Same call as the 'dog' benchmark; only the token differs.
token_str = 'the'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


7.44 ms ± 121 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
84.5 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
176 ms ± 902 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Time 2-gram retrieval for the token 'the' with `use_cache = True`.
# An untimed warm-up call precedes `%timeit` for each corpus.
token_str = 'the'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.1 μs ± 545 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
89.4 μs ± 1.49 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
87.3 μs ± 214 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Time `concordance` for the token 'dog' (context_words = 5, order '1L2L3L')
# with `use_cache = False`.
# NOTE(review): `report` is populated with `Frequency` objects in an earlier
# cell, and `Conc` is imported at the top but never instantiated in the
# visible notebook — confirm `report[name]` exposes `.concordance`.
use_cache = False
token_str = 'dog'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)

2.99 ms ± 241 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14 ms ± 401 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
25.9 ms ± 697 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time `concordance` for the token 'dog' (context_words = 5, order '1L2L3L')
# with `use_cache = True`; an untimed warm-up call precedes `%timeit`.
use_cache = True
token_str = 'dog'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


1.98 ms ± 32 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.26 ms ± 158 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.5 ms ± 19 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time `concordance` for the token 'the' (context_words = 5, order '1L2L3L')
# with `use_cache = False`. Same call as the 'dog' benchmark; only the token
# differs.
use_cache = False
token_str = 'the'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


16.4 ms ± 251 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
102 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
214 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time `concordance` for the token 'the' (context_words = 5, order '1L2L3L')
# with `use_cache = True`; an untimed warm-up call precedes `%timeit`.
use_cache = True
token_str = 'the'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


2.42 ms ± 59.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.35 ms ± 148 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.23 ms ± 157 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Reclaim disk space: remove each `<name>.corpus` under `save_path`, if present.
for name in test_corpora_names:
    corpus_file = f'{save_path}{name}.corpus'
    if not os.path.exists(corpus_file):
        continue  # nothing was saved for this corpus
    os.remove(corpus_file)