In [1]:
import os
import numpy as np
import pandas as pd
import random
import arcticdb as adb
from arcticdb.util.test import random_strings_of_length

In [2]:
# Connect to an LMDB-backed ArcticDB store addressed by a relative path.
arctic = adb.Arctic("lmdb://arcticdb_demo")

In [3]:
# Fetch the 'sample' library, creating it on first use. Letting get_library
# handle creation replaces the separate list_libraries() membership check.
lib = arctic.get_library('sample', create_if_missing=True)

In [5]:
# Fixed seed so the generated benchmark data can be regenerated on a fresh kernel.
SEED = 42
random.seed(SEED)  # drives the random.choices() calls that build the key columns
rng = np.random.RandomState(SEED)  # drives the numeric column

# Pools of 10-character keys: one low-cardinality, one high-cardinality.
# NOTE(review): assumes random_strings_of_length draws from the stdlib `random`
# module and that the final True argument requests unique strings - confirm.
ten_grouping_values = random_strings_of_length(10, 10, True)
one_hundred_thousand_grouping_values = random_strings_of_length(100_000, 10, True)

# Symbol names, one per supported dataset size.
sym_10M = "demo_10M"
sym_100M = "demo_100M"
sym_1B = "demo_1B"

In [6]:
# Choose the dataset size that the following cells generate, write and query.
sym = sym_10M

In [7]:
# Row count for each supported symbol. An explicit lookup fails loudly on an
# unknown symbol instead of leaving `num_rows` undefined (or stale from a
# previous run of this cell).
rows_by_symbol = {
    sym_10M: 10_000_000,
    sym_100M: 100_000_000,
    sym_1B: 1_000_000_000,
}
if sym not in rows_by_symbol:
    raise ValueError(f"Unknown symbol {sym!r}; expected one of {list(rows_by_symbol)}")
num_rows = rows_by_symbol[sym]

# Two string key columns sampled with replacement from the key pools, plus one
# float column from rng.rand. random.choices already returns a list, so no
# extra list() copy is made.
df = pd.DataFrame(
    {
        "grouping_column_10": random.choices(ten_grouping_values, k=num_rows),
        "grouping_column_100_000": random.choices(one_hundred_thousand_grouping_values, k=num_rows),
        "numeric_column": rng.rand(num_rows),
    }
)

In [10]:
# Sanity check: expect (num_rows, 3) - two key columns plus the numeric column.
df.shape

(10000000, 3)

In [11]:
# Store the DataFrame in the library under the chosen symbol name.
lib.write(sym, df)

VersionedItem(symbol='demo_10M', library='sample', data=n/a, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

In [12]:
# Show the index of stored segments for the symbol (row/column range per key).
# NOTE(review): `_nvs` is an underscore-prefixed attribute, i.e. not public API;
# this call may change between ArcticDB releases.
lib._nvs.read_index(sym)

Unnamed: 0_level_0,end_index,version_id,stream_id,creation_ts,content_hash,index_type,key_type,start_col,end_col,start_row,end_row
start_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-01 00:00:00.000000,1970-01-01 00:00:00.000100,0,b'demo_10M',1717791304303709171,16209642513796059015,84,2,0,3,0,100000
1970-01-01 00:00:00.000100,1970-01-01 00:00:00.000200,0,b'demo_10M',1717791304307977507,8110629139241627148,84,2,0,3,100000,200000
1970-01-01 00:00:00.000200,1970-01-01 00:00:00.000300,0,b'demo_10M',1717791304321374852,10556189679508249519,84,2,0,3,200000,300000
1970-01-01 00:00:00.000300,1970-01-01 00:00:00.000400,0,b'demo_10M',1717791304328465970,1845369905848734741,84,2,0,3,300000,400000
1970-01-01 00:00:00.000400,1970-01-01 00:00:00.000500,0,b'demo_10M',1717791304301738067,18040571775985524274,84,2,0,3,400000,500000
...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.009500,1970-01-01 00:00:00.009600,0,b'demo_10M',1717791305495176195,6000351684572541693,84,2,0,3,9500000,9600000
1970-01-01 00:00:00.009600,1970-01-01 00:00:00.009700,0,b'demo_10M',1717791305511303374,5885251280793145919,84,2,0,3,9600000,9700000
1970-01-01 00:00:00.009700,1970-01-01 00:00:00.009800,0,b'demo_10M',1717791305531238165,4315147052244089728,84,2,0,3,9700000,9800000
1970-01-01 00:00:00.009800,1970-01-01 00:00:00.009900,0,b'demo_10M',1717791305552709283,13886310973613602345,84,2,0,3,9800000,9900000


In [15]:
# Fetch only the symbol's version info and metadata (no metadata was passed to write above).
lib.read_metadata(sym)

VersionedItem(symbol='demo_10M', library='sample', data=n/a, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

In [18]:
%%time
# Baseline: read the whole symbol, all columns and all rows.
lib.read(sym)

CPU times: user 922 ms, sys: 502 ms, total: 1.42 s
Wall time: 1.01 s


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

In [19]:
%%time
# Column subset: request two of the three columns, skipping the high-cardinality key column.
lib.read(sym, columns=["grouping_column_10", "numeric_column"])

CPU times: user 235 ms, sys: 146 ms, total: 381 ms
Wall time: 125 ms


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

# Filtering

## Useless Filter (predicate matches every row)

In [20]:
# numeric_column comes from rng.rand, so `< 2.0` holds for every row:
# the filter is evaluated but removes nothing.
q = adb.QueryBuilder()
q = q[q["numeric_column"] < 2.0]

In [21]:
%%time
# Time a read with the always-true filter applied.
lib.read(sym, query_builder=q)

CPU times: user 1.29 s, sys: 831 ms, total: 2.12 s
Wall time: 1.32 s


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

## Real Filter (keeps roughly 10% of rows)

In [22]:
# Selective filter: keep only rows whose numeric_column is below 0.1.
q = adb.QueryBuilder()
q = q[q["numeric_column"] < 0.1]

In [23]:
%%time
# Time the filtered read and display the resulting DataFrame.
lib.read(sym, query_builder=q).data

CPU times: user 478 ms, sys: 311 ms, total: 789 ms
Wall time: 231 ms


Unnamed: 0,grouping_column_10,grouping_column_100_000,numeric_column
0,QIM84DT9TB,GFXLFU4VAB,0.073828
1,PZA6Z2CNED,FU16CZROD6,0.000987
2,LJ0JFI3SGC,602MM13YN7,0.028889
3,ZK2L6I5T5R,I9PI2SE0MU,0.018263
4,LJ0JFI3SGC,UZLPC5A0Z4,0.016643
...,...,...,...
1000355,LJ0JFI3SGC,L3UIPVXVQ0,0.073518
1000356,I5KMQNY1QL,QIKT6Q9JOF,0.021464
1000357,I5KMQNY1QL,1J1F5Z45EI,0.037978
1000358,LJ0JFI3SGC,UX77NFCBFM,0.004842


## Projections (derived columns)

In [24]:
# Projection: add a derived column equal to numeric_column * 2.0 as part of the read.
q = adb.QueryBuilder()
q = q.apply("new_column", q["numeric_column"] * 2.0)

In [28]:
%%time
# Time the read with the projection applied through the query builder.
lib.read(sym, query_builder=q).data

CPU times: user 1.01 s, sys: 717 ms, total: 1.73 s
Wall time: 1.22 s


Unnamed: 0,grouping_column_10,grouping_column_100_000,numeric_column,new_column
0,QIM84DT9TB,QX76M5N43S,0.509679,1.019359
1,8PXRBDCFUN,NP0HS20JV9,0.885924,1.771848
2,YDC0K8AGOQ,XDDEE73H94,0.416197,0.832393
3,8PXRBDCFUN,FHOQ7PWKZR,0.222763,0.445525
4,1UECMMRD77,6BZKO2TMZH,0.495821,0.991641
...,...,...,...,...
9999995,ZK2L6I5T5R,CFKIC8DOZI,0.908071,1.816141
9999996,8PXRBDCFUN,3XOBX33OK5,0.201978,0.403956
9999997,ZK2L6I5T5R,KN62F39EUZ,0.598491,1.196983
9999998,QIM84DT9TB,J1191J1FI2,0.566221,1.132442


In [27]:
%%time
lib.read(sym).data.assign(new_column = lambda x: x['numeric_column'] * 2)

CPU times: user 1.05 s, sys: 901 ms, total: 1.95 s
Wall time: 1.53 s


Unnamed: 0,grouping_column_10,grouping_column_100_000,numeric_column,new_column
0,QIM84DT9TB,QX76M5N43S,0.509679,1.019359
1,8PXRBDCFUN,NP0HS20JV9,0.885924,1.771848
2,YDC0K8AGOQ,XDDEE73H94,0.416197,0.832393
3,8PXRBDCFUN,FHOQ7PWKZR,0.222763,0.445525
4,1UECMMRD77,6BZKO2TMZH,0.495821,0.991641
...,...,...,...,...
9999995,ZK2L6I5T5R,CFKIC8DOZI,0.908071,1.816141
9999996,8PXRBDCFUN,3XOBX33OK5,0.201978,0.403956
9999997,ZK2L6I5T5R,KN62F39EUZ,0.598491,1.196983
9999998,QIM84DT9TB,J1191J1FI2,0.566221,1.132442


## Groupby

In [29]:
# Aggregation: mean of numeric_column for each value of grouping_column_10.
q = adb.QueryBuilder()
q = q.groupby("grouping_column_10").agg({"numeric_column": "mean"})

### Small number of groups (10 distinct keys)

In [30]:
%%time
# Time the group-by when it is passed to the read as a query.
lib.read(sym, query_builder=q)

CPU times: user 527 ms, sys: 315 ms, total: 843 ms
Wall time: 163 ms


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

In [32]:
%%time
lib.read(sym).data.groupby('grouping_column_10').agg({'numeric_column':'mean'})

CPU times: user 1.39 s, sys: 709 ms, total: 2.09 s
Wall time: 1.71 s


Unnamed: 0_level_0,numeric_column
grouping_column_10,Unnamed: 1_level_1
1UECMMRD77,0.500001
8PXRBDCFUN,0.499829
DUM17WIPQT,0.499888
I5KMQNY1QL,0.500004
K15PAY160I,0.500026
LJ0JFI3SGC,0.499989
PZA6Z2CNED,0.500175
QIM84DT9TB,0.500259
YDC0K8AGOQ,0.499999
ZK2L6I5T5R,0.499783


### Large number of groups (~100,000 distinct keys)

In [33]:
# Same aggregation, keyed on the high-cardinality column instead.
q = adb.QueryBuilder()
q = q.groupby("grouping_column_100_000").agg({"numeric_column": "mean"})

In [34]:
%%time
# Time the high-cardinality group-by when it is passed to the read as a query.
lib.read(sym, query_builder=q)

CPU times: user 2.04 s, sys: 307 ms, total: 2.34 s
Wall time: 391 ms


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)

In [35]:
%%time
lib.read(sym).data.groupby('grouping_column_100_000').agg({'numeric_column':'mean'})

CPU times: user 2.59 s, sys: 808 ms, total: 3.39 s
Wall time: 3.02 s


Unnamed: 0_level_0,numeric_column
grouping_column_100_000,Unnamed: 1_level_1
000KPMWJ8I,0.436089
0020VTY9LQ,0.508949
002GBLJ69C,0.508370
002I509L29,0.463328
003BA0CDUL,0.489452
...,...
ZZY5DY6NWN,0.514540
ZZYZRJ4TNW,0.480418
ZZZ1XJJ91S,0.457022
ZZZ6WF95O1,0.537931


## Combinations

In [36]:
# Filter, project and aggregate in a single query, built one clause at a time.
q = adb.QueryBuilder()
q = q[q["numeric_column"] < 0.1]  # keep only rows below the threshold
q = q.apply("new_column", q["numeric_column"] * 2.0)  # derived column on the remaining rows
q = q.groupby("grouping_column_10").agg(
    {"numeric_column": "mean", "new_column": "max"}
)

In [38]:
%%time
# Time the combined filter + projection + aggregation query and display the result.
lib.read(sym, query_builder=q).data

CPU times: user 293 ms, sys: 196 ms, total: 489 ms
Wall time: 77.7 ms


Unnamed: 0_level_0,numeric_column,new_column
grouping_column_10,Unnamed: 1_level_1,Unnamed: 2_level_1
1UECMMRD77,0.049878,0.2
K15PAY160I,0.050186,0.199998
I5KMQNY1QL,0.04983,0.199995
PZA6Z2CNED,0.049905,0.199998
ZK2L6I5T5R,0.049908,0.199998
QIM84DT9TB,0.050015,0.199994
YDC0K8AGOQ,0.049976,0.199999
LJ0JFI3SGC,0.049939,0.199996
DUM17WIPQT,0.049767,0.199997
8PXRBDCFUN,0.050113,0.199998


In [39]:
# Show one key from the low-cardinality pool; it is used as the filter value below.
ten_grouping_values[0]

'QIM84DT9TB'

In [40]:
# String equality filter: keep only rows whose grouping_column_10 equals the first key.
q = adb.QueryBuilder()

q = q[q['grouping_column_10'] == ten_grouping_values[0]]

In [41]:
%%time 
# Time a read with the string equality filter applied.
lib.read(sym, query_builder=q)

CPU times: user 501 ms, sys: 305 ms, total: 806 ms
Wall time: 271 ms


VersionedItem(symbol='demo_10M', library='sample', data=<class 'pandas.core.frame.DataFrame'>, version=0, metadata=None, host='LMDB(path=/home/quantfiction/repositories/crypto_trading/notebooks/arcticdb_demo)', timestamp=1717791306196657390)