# Performance

This page reports timing results for Conc methods on corpora of different sizes.

In [None]:
#| hide
# Load the memory_profiler IPython extension, which provides the %memit magic.
%load_ext memory_profiler

In [None]:
#| hide 
# Load the line_profiler IPython extension, which provides the %lprun magic.
%load_ext line_profiler

In [None]:
from conc.core import logger, set_logger_state
from conc.corpus import Corpus
from conc.report import Report

In [None]:
# Switch conc's logger to the 'quiet' state before any corpus is loaded or timed
# (presumably to keep log messages out of the timing output — confirm).
set_logger_state('quiet')

In [None]:
#| hide
# Directory holding the gzipped source CSVs (`<name>.csv.gz`) that corpora are built from.
source_corpus_path = '../test-corpora/source/'
# Directory where built corpora are saved as `<name>.corpus` and loaded back from.
save_corpus_path = '../test-corpora/saved/'

In [None]:
#| eval: false
# Load every benchmark corpus into `corpora`, keyed by name, and display its summary.
# A corpus with no saved copy yet is built from its gzipped source CSV and then
# saved, so later runs can load it directly.
test_corpora_names = ['rnz-10k', 'rnz-100k', 'rnz-200k', 'rnz-500k']
corpora = {}
for name in test_corpora_names:
    try:
        corpora[name] = Corpus().load(f'{save_corpus_path}{name}.corpus')
    except FileNotFoundError:
        # No saved corpus on disk: build it from the 'description' column of the CSV.
        corpora[name] = Corpus(name).build_from_csv(
            f'{source_corpus_path}{name}.csv.gz',
            text_column='description',
        )
        corpora[name].save(f'{save_corpus_path}{name}.corpus')
    corpora[name].summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,rnz-10k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/rnz-10k.corpus
Source Path,../test-corpora/source/rnz-10k.csv.gz
Document Count,10000
Token Count,341671
Unique Tokens,21357
Word Token Count,318917
Unique Word Tokens,21322


Corpus Summary,Corpus Summary
Attribute,Value
Name,rnz-100k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/rnz-100k.corpus
Source Path,../test-corpora/source/rnz-100k.csv.gz
Document Count,100000
Token Count,3133652
Unique Tokens,53541
Word Token Count,2961598
Unique Word Tokens,53487


Corpus Summary,Corpus Summary
Attribute,Value
Name,rnz-200k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/rnz-200k.corpus
Source Path,../test-corpora/source/rnz-200k.csv.gz
Document Count,200000
Token Count,5934801
Unique Tokens,76930
Word Token Count,5603713
Unique Word Tokens,76860


Corpus Summary,Corpus Summary
Attribute,Value
Name,rnz-500k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/rnz-500k.corpus
Source Path,../test-corpora/source/rnz-500k.csv.gz
Document Count,500000
Token Count,15326015
Unique Tokens,120311
Word Token Count,14451100
Unique Word Tokens,120212


In [None]:
#| eval: false

# One Corpus object per test corpus; only the rnz-10k corpus is built and saved in this cell.
rnz10 = Corpus('rnz-10k')
rnz100 = Corpus('rnz-100k')
rnz200 = Corpus('rnz-200k')
rnz500 = Corpus('rnz-500k')

# Time a single build of the 10k corpus from its gzipped CSV, using the 'description' column as text.
%time rnz10.build_from_csv('../test-corpora/source/rnz-10k.csv.gz', text_column='description')
# NOTE(review): the disabled lines below call build(import_from_csv(...)) instead of the
# build_from_csv(...) call timed above — confirm the current API before re-enabling them.
# %time rnz100.build(rnz100.import_from_csv('../test-corpora/source/rnz-100k.csv.gz', text_column='description'))
# %time rnz200.build(rnz200.import_from_csv('../test-corpora/source/rnz-200k.csv.gz', text_column='description'))
# %time rnz500.build(rnz500.import_from_csv('../test-corpora/source/rnz-500k.csv.gz', text_column='description'))

# Time a single save of the freshly built 10k corpus.
%time rnz10.save('../test-corpora/saved/rnz-10k.corpus')
# %time rnz100.save('../test-corpora/saved/rnz-100k.corpus')
# %time rnz200.save('../test-corpora/saved/rnz-200k.corpus')
# %time rnz500.save('../test-corpora/saved/rnz-500k.corpus')

CPU times: user 1.74 s, sys: 0 ns, total: 1.74 s
Wall time: 1.7 s
CPU times: user 840 ms, sys: 0 ns, total: 840 ms
Wall time: 837 ms


In [None]:
# Time loading the saved 10k corpus from disk (repeated runs via %timeit).
# NOTE(review): a name assigned inside a %timeit statement is not expected to be stored
# in the notebook namespace, so `rnz10` keeps whatever value an earlier cell bound — confirm.
%timeit rnz10 = Corpus('rnz-10k').load('../test-corpora/saved/rnz-10k.corpus')
# %timeit rnz100 = Corpus('rnz-100k').load('../test-corpora/saved/rnz-100k.corpus')
# %timeit rnz200 = Corpus('rnz-200k').load('../test-corpora/saved/rnz-200k.corpus')
# %timeit rnz500 = Corpus('rnz-500k').load('../test-corpora/saved/rnz-500k.corpus')

105 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Report object for the 10k corpus; the timing cells below call its methods.
# NOTE(review): requires `rnz10` to have been defined by an earlier cell in this kernel session.
report_rnz10 = Report(rnz10)

In [None]:
# Time Report.frequencies on the 10k corpus with n=10 and normalize_by=10000.
%timeit report_rnz10.frequencies(n=10, normalize_by=10000)
# NOTE(review): the disabled lines below call frequencies() on the Corpus objects
# rather than on a Report — confirm the current API before re-enabling them.
# %timeit results = rnz100.frequencies(n=10, normalize_by=10000)
# %timeit results = rnz200.frequencies(n=10, normalize_by=10000)
# %timeit results = rnz500.frequencies(n=10, normalize_by=10000)

7.21 ms ± 754 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time Report.ngrams on the 10k corpus for the token 'dog' (ngram_length=2,
# ngram_word_position='LEFT'), passing use_cache=False.
# `pretty` is only referenced by the disabled lines in this cell.
pretty = False
token_str = 'dog'
use_cache = False
%timeit report_rnz10.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)
# NOTE(review): the disabled lines below call ngrams() on the Corpus objects and pass
# `pretty`, unlike the Report call timed above — confirm the API before re-enabling them.
# %timeit rnz100_ngrams, rnz100_total_unique, rnz100_total_count = rnz100.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz200_ngrams, rnz200_total_unique, rnz200_total_count = rnz200.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz500_ngrams, rnz500_total_unique, rnz500_total_count = rnz500.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)


879 μs ± 96.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# Time Report.ngrams on the 10k corpus for the token 'dog' (ngram_length=2,
# ngram_word_position='LEFT'), passing use_cache=True
# (presumably repeated %timeit loops are then served from a cache — confirm).
# `pretty` is only referenced by the disabled lines in this cell.
pretty = False
token_str = 'dog'
use_cache = True
%timeit report_rnz10.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)
# %timeit rnz100_ngrams, rnz100_total_unique, rnz100_total_count = rnz100.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz200_ngrams, rnz200_total_unique, rnz200_total_count = rnz200.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz500_ngrams, rnz500_total_unique, rnz500_total_count = rnz500.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)

284 μs ± 10.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# Time Report.ngrams on the 10k corpus for the token 'the' (presumably far more
# frequent than 'dog' — confirm), with ngram_length=2, ngram_word_position='LEFT'
# and use_cache=False.
# `pretty` is only referenced by the disabled lines in this cell.
pretty = False
token_str = 'the'
use_cache = False
%timeit report_rnz10.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)
# %timeit rnz100_ngrams, rnz100_total_unique, rnz100_total_count = rnz100.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz200_ngrams, rnz200_total_unique, rnz200_total_count = rnz200.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz500_ngrams, rnz500_total_unique, rnz500_total_count = rnz500.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)


2.95 ms ± 90.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time Report.ngrams on the 10k corpus for the token 'the' (ngram_length=2,
# ngram_word_position='LEFT'), passing use_cache=True.
# `pretty` is only referenced by the disabled lines in this cell.
pretty = False
token_str = 'the'
use_cache = True
%timeit report_rnz10.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)
# %timeit rnz100_ngrams, rnz100_total_unique, rnz100_total_count = rnz100.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz200_ngrams, rnz200_total_unique, rnz200_total_count = rnz200.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)
# %timeit rnz500_ngrams, rnz500_total_unique, rnz500_total_count = rnz500.ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', pretty = pretty, use_cache = use_cache)

283 μs ± 13.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# Time Report.concordance on the 10k corpus for the token 'dog' with
# context_words=5 and order='1L2L3L', passing use_cache=False.
use_cache = False
token_str = 'dog'
%timeit report_rnz10.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# NOTE(review): the disabled lines below call concordance() on the Corpus objects
# rather than on a Report — confirm the current API before re-enabling them.
# %timeit concordance_report, total_count, total_docs = rnz100.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz200.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz500.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


6.58 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time Report.concordance on the 10k corpus for the token 'dog' with
# context_words=5 and order='1L2L3L', passing use_cache=True.
use_cache = True
token_str = 'dog'
%timeit report_rnz10.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz100.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz200.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz500.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


6.16 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time Report.concordance on the 10k corpus for the token 'the' (presumably far
# more frequent than 'dog' — confirm) with context_words=5 and order='1L2L3L',
# passing use_cache=False.
use_cache = False
token_str = 'the'
%timeit report_rnz10.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz100.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz200.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz500.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


11.5 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time Report.concordance on the 10k corpus for the token 'the' with
# context_words=5 and order='1L2L3L', passing use_cache=True.
use_cache = True
token_str = 'the'
%timeit report_rnz10.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz100.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz200.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)
# %timeit concordance_report, total_count, total_docs = rnz500.concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


5.94 ms ± 519 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
