# Performance

This page reports timing results for conc methods on corpora of different sizes.

In [None]:
#| hide
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [None]:
#| hide 
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [None]:
#| hide
%load_ext memray


The memray extension is already loaded. To reload it, use:
  %reload_ext memray


In [None]:
import os

In [None]:
from conc.core import logger, set_logger_state
from conc.corpus import Corpus
from conc.report import Report

In [None]:
# Presumably silences conc's logger so log output does not clutter the cells below — confirm against conc.core.
set_logger_state('quiet')

In [None]:
#| hide
# Directory holding the gzipped source CSVs that corpora are built from.
source_path = '../test-corpora/source/'
# Directory that built corpora are saved to and loaded from.
save_path = '../test-corpora/saved/'

In [None]:
# Corpora benchmarked in this notebook (10k- and 100k-document subsets).
test_corpora_names = ['us-congressional-speeches-subset-10k', 'us-congressional-speeches-subset-100k']
# Corpus used by the single-corpus cells below: the 100k subset.
name = test_corpora_names[1]
# Loaded Corpus objects, keyed by corpus name.
corpora = {}

In [None]:
import zarr
import numpy as np
import polars as pl

In [None]:
# Export the zarr-backed orth and lower token indexes of the selected corpus
# (`name`) to .npz and .parquet, so the storage formats can be compared and the
# parquet copies can be scanned lazily with polars in the cells below.
build_path = '../conc-build-process/'
for index_kind in ('orth', 'lower'):
    column = f'{index_kind}_index'  # also the array/column name inside each output file
    index_stem = f'{build_path}{name}_{column}'
    # Read the full array into memory once; both writers reuse it.
    index_values = zarr.open_array(store=f'{index_stem}.zarr')[:]
    np.savez_compressed(f'{index_stem}.npz', **{column: index_values})
    pl.DataFrame({column: index_values}).write_parquet(f'{index_stem}.parquet')
# Drop the last array rather than leaving it bound in kernel memory.
del index_values

In [None]:
# # # open zarr store at path 5242739437188381500_lower_index.zarr
# lower_store = zarr.open_array(store = f'../conc-build-process/5389131771623450514_lower_index.zarr')
# lower_index = lower_store[:]
# unique_values, inverse = np.unique(lower_index, return_inverse=True)
# print(unique_values[0:10])
# print(inverse[100:110])
# print(inverse[0:10])
# print(unique_values.shape, inverse.shape, np.max(inverse))
# # # write to npz file for comparison
# # #np.savez_compressed(f'../conc-build-process/5389131771623450514_lower_index.npz', lower_index=lower_index)
# # # df = pl.DataFrame({'lower_index': lower_index})
# # # df.write_parquet("../conc-build-process/5389131771623450514_lower_index.parquet")

# # #%memit 
# # len(np.unique(lower_index))
# # #%memit unique_values = np.unique(lower_store)

# # #zarr.create_array(f'{build_process_path}/{working_identifier}_orth_index.zarr', overwrite=True, shape=(1,), chunks=(1000000,), dtype=np.uint64, compressors=zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle=zarr.codecs.BloscShuffle.shuffle))
# # #unique_values, inverse = np.unique(lower_store, return_inverse=True)
# del lower_store
# del lower_index

In [None]:
#%%memray_flamegraph --temporal
# Re-index the raw UInt64 orth/lower token ids into one shared, dense, 1-based
# lookup (unique ids sorted ascending) and write each index back out as UInt32.
build_prefix = f'../conc-build-process/{name}'

# Scan lazily and give both frames the same column name so they can be stacked.
orth_df = pl.scan_parquet(f'{build_prefix}_orth_index.parquet').select(
    pl.col('orth_index').alias('index')
)
lower_df = pl.scan_parquet(f'{build_prefix}_lower_index.parquet').select(
    pl.col('lower_index').alias('index')
)

# orth rows first, lower rows second; the split at the end relies on this order.
combined_df = pl.concat([orth_df, lower_df])

# One row per distinct id, with `lookup` numbering them from 1.
unique_lazy_df = combined_df.select(
    pl.col('index').unique().sort().alias('unique_values')
).with_row_index('lookup', offset=1)
unique_df = unique_lazy_df.collect()

# Swap every raw id for its lookup number, then narrow the column to UInt32.
update_df = combined_df.with_columns(
    pl.col('index').replace(
        unique_df['unique_values'].to_list(),
        unique_df['lookup'].to_list(),
    ).cast(pl.UInt32)
)

# Count only the orth rows. This avoids materialising the whole combined frame
# just to halve its height, and stays correct if the two indexes differ in length.
orth_row_count = orth_df.select(pl.len()).collect().item()

# Collect once and slice the result, so the replace is not recomputed per output file.
reindexed_df = update_df.collect()
reindexed_df.slice(0, orth_row_count).write_parquet(f'{build_prefix}_orth_reindexed.parquet')
reindexed_df.slice(orth_row_count).write_parquet(f'{build_prefix}_lower_reindexed.parquet')


In [None]:
# Compare the original orth index with its reindexed counterpart: for each file
# print the schema and the same 10-row window (rows 100-109).
for index_file in ('orth_index', 'orth_reindexed'):
    lazy_df = pl.scan_parquet(f'../conc-build-process/us-congressional-speeches-subset-100k_{index_file}.parquet')
    # get information on column names and types
    print(lazy_df.collect_schema())
    # Slice before collecting so polars does not have to load the whole column
    # just to display ten rows.
    print(lazy_df.slice(100, 10).collect())

Schema([('orth_index', UInt64)])
┌──────────────────────┐
│ orth_index           │
╞══════════════════════╡
│ 4690420944186131903  │
│ 6992604926141104606  │
│ 18194338103975822726 │
│ 3791531372978436496  │
│ 203487227105936704   │
│ 7425985699627899538  │
│ 8654682122863471622  │
│ 7831658034963690409  │
│ 1946980641541756374  │
│ 11901859001352538922 │
└──────────────────────┘
Schema([('index', UInt32)])
┌────────┐
│ index  │
╞════════╡
│ 77455  │
│ 115443 │
│ 298821 │
│ 62756  │
│ 3446   │
│ 122636 │
│ 142784 │
│ 129314 │
│ 32318  │
│ 195410 │
└────────┘


In [None]:
# Find the lookup number assigned to 4690420944186131903, the first raw orth id
# in the 10-row window printed above.
unique_df.filter(pl.col("unique_values") == 4690420944186131903)

lookup,unique_values
77455,4690420944186131903


In [None]:
# Load the saved corpus selected by `name` and show entries 100-109 of its orth
# index, for comparison with the reindexed parquet window printed above.
corpora[name] = Corpus().load(f'{save_path}{name}.corpus')
corpora[name].orth_index[100:110]

array([ 77455, 115443, 298821,  62756,   3446, 122636, 142784, 129314,
        32318, 195410], dtype=uint32)

In [None]:
# Index the corpus's original_to_new mapping with the same raw id used in the
# unique_df lookup above, to compare the two results.
corpora[name].original_to_new[4690420944186131903]

np.uint32(77455)

In [None]:
#%mprun -f Corpus.load corpora[name] = Corpus().load(f'{save_path}{name}.corpus')

In [None]:
#| eval: false
# Load each saved test corpus; if it has not been saved yet, build it from the
# source CSV and save it. Then display the corpus summary.
for name in test_corpora_names:
    corpus_path = f'{save_path}{name}.corpus'
    try:
        corpora[name] = Corpus().load(corpus_path)
    except FileNotFoundError:
        # Verbose logging only while building.
        set_logger_state('verbose')
        try:
            corpora[name] = Corpus(name).build_from_csv(f'{source_path}{name}.csv.gz', text_column='text')
            corpora[name].save(corpus_path)
        finally:
            # Restore quiet logging even if the build or save raises.
            set_logger_state('quiet')
    corpora[name].summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,us-congressional-speeches-subset-10k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/us-congressional-speeches-subset-10k.corpus
Source Path,../test-corpora/source/us-congressional-speeches-subset-10k.csv.gz
Document Count,10000
Token Count,1964972
Unique Tokens,50641
Word Token Count,1778104
Unique Word Tokens,50520


Corpus Summary,Corpus Summary
Attribute,Value
Name,us-congressional-speeches-subset-100k
Description,
Conc Version,0.0.1
Corpus Path,../test-corpora/saved/us-congressional-speeches-subset-100k.corpus
Source Path,../test-corpora/source/us-congressional-speeches-subset-100k.csv.gz
Document Count,100000
Token Count,20027241
Unique Tokens,214503
Word Token Count,18120969
Unique Word Tokens,214175


In [None]:
# as a loop
# Time loading each saved test corpus from disk.
for name in test_corpora_names:
    %timeit corpora[name] = Corpus().load(f'{save_path}{name}.corpus')

121 ms ± 943 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
846 ms ± 16.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.68 s ± 35.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# One Report per loaded corpus, keyed by corpus name; used by the timing cells below.
report = {}
for name in test_corpora_names:
    report[name] = Report(corpora[name])

In [None]:
# Time Report.frequencies on each corpus.
for name in test_corpora_names:
    %timeit report[name].frequencies(normalize_by=10000, page_size=10)

4.56 ms ± 376 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
13.9 ms ± 169 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
19.4 ms ± 183 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time 2-gram queries for 'dog' on each corpus with use_cache=False.
token_str = 'dog'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


714 μs ± 32.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
11.2 ms ± 348 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
23.6 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time 2-gram queries for 'dog' on each corpus with use_cache=True; one untimed
# warm-up call runs before each %timeit.
token_str = 'dog'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.4 μs ± 544 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
96.8 μs ± 405 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
97.1 μs ± 383 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Time 2-gram queries for 'the' on each corpus with use_cache=False.
token_str = 'the'
use_cache = False
for name in test_corpora_names:
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)


7.44 ms ± 121 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
84.5 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
176 ms ± 902 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Time 2-gram queries for 'the' on each corpus with use_cache=True; one untimed
# warm-up call runs before each %timeit.
token_str = 'the'
use_cache = True
for name in test_corpora_names:
    report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache) # warm up
    %timeit report[name].ngrams(token_str, ngram_length = 2, ngram_word_position = 'LEFT', use_cache = use_cache)

83.1 μs ± 545 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
89.4 μs ± 1.49 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
87.3 μs ± 214 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Time concordance queries for 'dog' on each corpus with use_cache=False.
use_cache = False
token_str = 'dog'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)

2.99 ms ± 241 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14 ms ± 401 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
25.9 ms ± 697 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time concordance queries for 'dog' on each corpus with use_cache=True; one
# untimed warm-up call runs before each %timeit.
use_cache = True
token_str = 'dog'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


1.98 ms ± 32 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.26 ms ± 158 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.5 ms ± 19 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Time concordance queries for 'the' on each corpus with use_cache=False.
use_cache = False
token_str = 'the'
for name in test_corpora_names:
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


16.4 ms ± 251 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
102 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
214 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Time concordance queries for 'the' on each corpus with use_cache=True; one
# untimed warm-up call runs before each %timeit.
use_cache = True
token_str = 'the'
for name in test_corpora_names:
    report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache) # warm up
    %timeit report[name].concordance(token_str, context_words = 5, order='1L2L3L', use_cache = use_cache)


2.42 ms ± 59.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.35 ms ± 148 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.23 ms ± 157 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# reclaim space!
# Delete the saved test corpora. They are stored under `save_path`; the
# previously used `save_corpus_path` is not defined in this notebook, so the
# cell raised NameError instead of cleaning up.
for name in test_corpora_names:
    corpus_path = f'{save_path}{name}.corpus'
    # NOTE(review): os.remove only deletes a regular file; if a .corpus is a
    # directory this needs shutil.rmtree instead — confirm.
    if os.path.exists(corpus_path):
        os.remove(corpus_path)