# Benchmarks

These benchmarks seek to establish the performance of tablite as a user sees it.

Overview

**Input/Output:**

- Save / Load .tpz format
- Save tables to various formats
- Import data from various formats

**Various column functions:**

- Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count


**Various table functions**

- **base**
- Setitem / getitem
- iter / rows
- equal, not equal
- load
- save
- copy
- stack
- types
- display_dict
- show
- to_dict
- as_json_serializable
- index
- **core**
- expression
- filter
- sort_index
- reindex
- drop_duplicates
- sort
- is_sorted
- any
- all
- drop 
- replace
- groupby
- pivot
- joins
- lookup
- replace missing values
- transpose
- pivot_transpose
- diff






In [2]:
from tablite import Table
from tablite.datasets import synthetic_order_data
import psutil, os, gc
import tempfile
from pathlib import Path
from time import process_time
from tablite.config import Config

### Create tables from synthetic data.

In [None]:
process = psutil.Process(os.getpid())

# Baseline table: a single batch of Config.PAGE_SIZE synthetic rows, with the
# resident-set-size delta of this process measured around its creation.
ram_start = process.memory_info().rss
t = synthetic_order_data(Config.PAGE_SIZE)
ram_end = process.memory_info().rss
real, flat = t.nbytes()
print(f"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk, using {(ram_end - ram_start)/1e6:,.0f} Mb ram")

tables = [t]  # 1M rows.

# The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them.
# Each loop below uses its own variable name: reusing a single `i` for all
# three loops let the gc loop overwrite the batch count, so every table ended
# up with the same number of batches regardless of the value requested here.
for n_batches in [2, 5, 10, 50, 100]:
    # Run the collector several times before taking the RAM baseline.
    for _ in range(10):
        gc.collect()

    ram_start = process.memory_info().rss
    t2 = synthetic_order_data(Config.PAGE_SIZE)

    # One batch already exists, so append the remaining n_batches - 1.
    for _ in range(n_batches - 1):
        t2 += synthetic_order_data(Config.PAGE_SIZE)  # these are all unique

    ram_end = process.memory_info().rss
    real, flat = t2.nbytes()
    tables.append(t2)
    print(f"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk, using {(ram_end - ram_start)/1e6:,.0f} Mb ram")

tables[-1].show()


In [None]:
# Keep a handle on the last table that was appended to `tables`.
tx = tables[-1]

### Save / Load .tpz format

In [None]:
# Round-trip every table in `tables` through the .tpz format, timing the save
# and the load separately. process_time() reports CPU time of the current
# process, not wall-clock time.
tmp = Path(tempfile.gettempdir()) / "junk"
tmp.mkdir(exist_ok=True)

for t in tables:
    fn = tmp / f'{len(t)}.tpz'
    start = process_time()
    t.save(fn)
    end = process_time()
    assert fn.exists()
    print(f"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {end-start:,} seconds")

    start = process_time()
    t2 = Table.load(fn)
    end = process_time()
    print(f"loading {len(t2):,} rows took {end-start:,} seconds")
    del t2  # drop the loaded copy before moving on to the next table
    fn.unlink()  # remove the scratch file


### Save / load tables to / from various formats

The handlers for saving / export are:

- to_sql
- to_json
- to_xlsx
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5


In [4]:
# Scratch directory for the exported files, plus the table that the
# export / import cells operate on.
tmp = Path(tempfile.gettempdir(), "junk")
tmp.mkdir(exist_ok=True)
t = synthetic_order_data(1_000_000)


In [None]:
# Time the SQL export of `t`.
start = process_time()
string = t.to_sql(name='t')  # --> SQL
end = process_time()
report = f"to_sql() took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# TODO: time the SQL import as well:
# start = process_time()
# Table.from_sql(string)  # <-- SQL
# end = process_time()
# print(f"from_sql() took {end-start:,.2f} secs for {len(t):,} rows")

# The generated SQL text is no longer needed.
del string

In [None]:


# JSON round trip: time the export of `t`, then the import of the result.
start = process_time()
bytestr = t.to_json()  # --> JSON
end = process_time()
report = f"to_json() took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

start = process_time()
Table.from_json(bytestr)  # <-- JSON
end = process_time()
report = f"from_json() took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# The serialized payload is no longer needed.
del bytestr


In [None]:

# XLSX round trip: time the export of `t`, then the import of the same file.
fn = tmp / '1.xlsx'  # --> XLSX
start = process_time()
t.to_xlsx(fn)
end = process_time()
# The label names the method that is actually called (to_xlsx, not to_xls).
print(f"t.to_xlsx({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows")

start = process_time()
Table.from_file(fn, sheet="pyexcel_sheet1")  # <-- XLSX
end = process_time()
print(f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows")
fn.unlink()  # remove the scratch file


In [6]:
# ODS round trip on a 100,000-row slice of `t`: time the export, then the
# import of the same file.
fn = tmp / '1.ods' # --> ODS
start = process_time()
subset = t[:100_000]  # limit the memory footprint.
subset.to_ods(fn)
end = process_time()
# Report the size of the slice that was written, not of the full table.
print(f"t.to_ods({fn.name}) took {end-start:,.2f} secs for {len(subset):,} rows")

start = process_time()
Table.from_file(fn, sheet="pyexcel_sheet1")  # <-- ODS
end = process_time()
print(f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(subset):,} rows")
del subset
fn.unlink()  # remove the scratch file



t.to_ods(100000.ods) took 68.16 secs for 100,000 rows
Table.from_file(100000.ods) took 65.22 secs for 100,000 rows


In [None]:

# CSV round trip: time the export of `t`, then the import of the same file.
fn = tmp.joinpath('1.csv')  # --> CSV
start = process_time()
t.to_csv(fn)
end = process_time()
report = f"t.to_csv({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

start = process_time()
Table.from_file(fn)  # <-- CSV
end = process_time()
report = f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# Remove the scratch file.
fn.unlink()


In [None]:


# TSV round trip: time the export of `t`, then the import of the same file.
fn = tmp.joinpath('1.tsv')  # --> TSV
start = process_time()
t.to_tsv(fn)
end = process_time()
report = f"t.to_tsv({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

start = process_time()
Table.from_file(fn)  # <-- TSV
end = process_time()
report = f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# Remove the scratch file.
fn.unlink()



In [None]:

# Text round trip: time the export of `t`, then the import of the same file.
fn = tmp / '1.txt'  # --> TXT
start = process_time()
t.to_text(fn)
end = process_time()
# The label names the method that is actually called (to_text, not to_txt).
print(f"t.to_text({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows")

start = process_time()
Table.from_file(fn)  # <-- TXT
end = process_time()
print(f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows")
fn.unlink()  # remove the scratch file



In [None]:

# HTML round trip: time the export of `t`, then the import of the same file.
fn = tmp.joinpath('1.html')  # --> HTML
start = process_time()
t.to_html(fn)
end = process_time()
report = f"t.to_html({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

start = process_time()
Table.from_file(fn)  # <-- HTML
end = process_time()
report = f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# Remove the scratch file.
fn.unlink()


In [None]:


# HDF5 round trip: time the export of `t`, then the import of the same file.
fn = tmp.joinpath('1.hdf5')  # --> HDF5
start = process_time()
t.to_hdf5(fn)
end = process_time()
report = f"t.to_hdf5({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

start = process_time()
Table.from_file(fn)  # <-- HDF5
end = process_time()
report = f"Table.from_file({fn.name}) took {end-start:,.2f} secs for {len(t):,} rows"
print(report)

# Remove the scratch file.
fn.unlink()

