Raccoon vs. Pandas speed test
=============================

Set up the Python path, import the libraries, and initialize a DataFrame to store the results

In [1]:
import sys
sys.path.insert(1, "c:\\RMBAries\\git\\raccoon\\")

from copy import deepcopy

In [2]:
import raccoon as rc
import pandas as pd

In [3]:
# Accumulator for the benchmark summary: add_results() writes one row per test with
# the 'raccoon' timing, the 'pandas' timing and their 'ratio'.
# NOTE(review): sorted=False presumably keeps rows in insertion order rather than
# sorting them by index label — confirm against the raccoon documentation.
results = rc.DataFrame(columns=['raccoon', 'pandas', 'ratio'], sorted=False)

In [4]:
def add_results(index):
    """Store the most recent timing pair in ``results`` under row label ``index``.

    Reads the module-level ``res_rc`` and ``res_pd`` objects (assigned by the
    ``%timeit -o`` cells that run just before each call) and records each
    one's ``best`` value together with the raccoon / pandas quotient.
    """
    rc_best = res_rc.best
    results[index, 'raccoon'] = rc_best
    pd_best = res_pd.best
    results[index, 'pandas'] = pd_best
    # ratio < 1 means the raccoon timing was the smaller of the two
    results[index, 'ratio'] = rc_best / pd_best

Initialize 10,000 empty DataFrames
---------------------------

In [5]:
def init_rc():
    """Benchmark body: construct 10,000 empty raccoon DataFrames, one after another."""
    for _ in range(10000):
        _frame = rc.DataFrame()
        
def init_pd():
    """Benchmark body: construct 10,000 empty pandas DataFrames, one after another."""
    for _ in range(10000):
        _frame = pd.DataFrame()

In [6]:
res_rc = %timeit -o init_rc()

10 loops, best of 3: 83 ms per loop


In [7]:
res_pd = %timeit -o init_pd()

1 loop, best of 3: 2.77 s per loop


In [8]:
ratio = res_rc.best / res_pd.best

In [9]:
add_results('initialize empty')

In [10]:
results.print()

index               raccoon    pandas      ratio
----------------  ---------  --------  ---------
initialize empty  0.0829567   2.76704  0.0299803


Initialize 100 row X 100 col DataFrame()
--------

In [11]:
# 100 columns named 'a0' .. 'a99', each holding its own list of the integers 0..99.
data = {'a' + str(x): list(range(100)) for x in range(100)}

In [12]:
res_rc = %timeit -o df=rc.DataFrame(data=data)

1000 loops, best of 3: 1.56 ms per loop


In [13]:
res_pd = %timeit -o df=pd.DataFrame(data=data)

100 loops, best of 3: 9.77 ms per loop


In [14]:
add_results('initialize with matrix')

In [15]:
results.print()

index                      raccoon      pandas      ratio
----------------------  ----------  ----------  ---------
initialize empty        0.0829567   2.76704     0.0299803
initialize with matrix  0.00155941  0.00977127  0.159591


Add 10,000 items in 1 column to empty DataFrame
-------------

In [16]:
def one_col_add_rc():
    """Benchmark body: write 10,000 cells, one per call, into column 'a' of a new raccoon DataFrame.

    Starts from ``rc.DataFrame()`` and, for each ``i`` in 0..9999, calls
    ``set(i, 'a', i)``.
    """
    frame = rc.DataFrame()
    for value in range(10000):
        frame.set(value, 'a', value)
        
def one_col_add_pd():
    """Benchmark body: write 10,000 cells, one per assignment, into column 'a' of a new pandas DataFrame.

    Starts from ``pd.DataFrame()`` and, for each ``i`` in 0..9999, assigns
    ``i`` through the ``.at[i, 'a']`` accessor.
    """
    frame = pd.DataFrame()
    for value in range(10000):
        frame.at[value, 'a'] = value

In [17]:
res_rc = %timeit -o one_col_add_rc()

10 loops, best of 3: 53.9 ms per loop


In [18]:
res_pd = %timeit -o one_col_add_pd()

1 loop, best of 3: 20 s per loop


In [19]:
add_results('add rows one column')

In [20]:
print(results)

index                      raccoon       pandas       ratio
----------------------  ----------  -----------  ----------
initialize empty        0.0829567    2.76704     0.0299803
initialize with matrix  0.00155941   0.00977127  0.159591
add rows one column     0.0538965   19.9965      0.00269529


Add 100 rows of 100 columns to empty DataFrame
----------

In [21]:
# Column labels 'a0' .. 'a99', plus a single row that maps each label to its position.
columns = list(map('a{}'.format, range(100)))
new_row = dict(zip(columns, range(100)))

def matrix_add_rc():
    """Benchmark body: set 100 rows (labels 0..99) on a raccoon DataFrame created with ``columns``.

    Every row is written from the same module-level ``new_row`` mapping.
    """
    frame = rc.DataFrame(columns=columns)
    for row_label in range(100):
        frame.set(indexes=row_label, values=new_row)

def matrix_add_pd():
    """Benchmark body: assign 100 rows (labels 0..99) on a pandas DataFrame created with ``columns``.

    Every row is written from the same module-level ``new_row`` mapping via ``.loc``.
    """
    frame = pd.DataFrame(columns=columns)
    for row_label in range(100):
        frame.loc[row_label] = new_row

In [22]:
res_rc = %timeit -o matrix_add_rc()

100 loops, best of 3: 8.05 ms per loop


In [23]:
res_pd = %timeit -o matrix_add_pd()

1 loop, best of 3: 204 ms per loop


In [24]:
add_results('add matrix')

In [25]:
print(results)

index                      raccoon       pandas       ratio
----------------------  ----------  -----------  ----------
initialize empty        0.0829567    2.76704     0.0299803
initialize with matrix  0.00155941   0.00977127  0.159591
add rows one column     0.0538965   19.9965      0.00269529
add matrix              0.00805453   0.204356    0.0394142


Append 10x10 DataFrame 100 times
------

In [26]:
def append_rc():
    """Benchmark body: append 100 10x10 raccoon DataFrames onto an initial 10x10 one.

    Each appended frame gets ten fresh index labels: 11..20 for the first,
    21..30 for the second, and so on up to 1001..1010. The source grid is
    deep-copied before every constructor call.
    """
    grid = {'a' + str(c): list(range(10)) for c in range(10)}
    frame = rc.DataFrame(data=deepcopy(grid), columns=list(grid))
    for block in range(1, 101):
        labels = [block * 10 + offset for offset in range(1, 11)]
        chunk_data = deepcopy(grid)
        chunk = rc.DataFrame(data=chunk_data, columns=list(chunk_data), index=labels)
        frame.append(chunk)

def append_pd():
    """Benchmark body: append 100 10x10 pandas DataFrames onto an initial 10x10 one.

    Each appended frame gets ten fresh index labels: 11..20 for the first,
    21..30 for the second, and so on up to 1001..1010, so the final frame has
    1010 rows and 10 columns.
    """
    grid = {'a' + str(x): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] for x in range(10)}
    df = pd.DataFrame(data=grid, columns=list(grid.keys()))
    for x in range(100):
        index = [(y + 1) + (x + 1) * 10 for y in range(10)]
        new_grid = deepcopy(grid)
        new_df = pd.DataFrame(data=new_grid, columns=list(new_grid.keys()), index=index)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack frames row-wise and also
        # works on older pandas versions.
        df = pd.concat([df, new_df])

In [27]:
res_rc = %timeit -o append_rc()

10 loops, best of 3: 68.4 ms per loop


In [28]:
res_pd = %timeit -o append_pd()

1 loop, best of 3: 218 ms per loop


In [29]:
add_results('append')

In [30]:
print(results)

index                      raccoon       pandas       ratio
----------------------  ----------  -----------  ----------
initialize empty        0.0829567    2.76704     0.0299803
initialize with matrix  0.00155941   0.00977127  0.159591
add rows one column     0.0538965   19.9965      0.00269529
add matrix              0.00805453   0.204356    0.0394142
append                  0.0683559    0.217914    0.313684


Get
---

In [31]:
# Fixtures for the "get" benchmarks: a 1000 row x 100 column matrix, index [0...999].
# Every column 'a0' .. 'a99' receives its own copy of the list 0..999, and both
# frames are built from the same grid with the column names in sorted order.

col = list(range(1000))
grid = {'a' + str(x): list(col) for x in range(100)}

df_rc = rc.DataFrame(data=grid, columns=sorted(grid))
df_pd = pd.DataFrame(data=grid, columns=sorted(grid))

In [32]:
# get cell

def rc_get_cell():
    """Benchmark body: read every cell of ``df_rc`` individually, column by column."""
    for column in df_rc.columns:
        for label in df_rc.index:
            _value = df_rc.get(label, column)
            
def pd_get_cell():
    """Benchmark body: read every cell of ``df_pd`` individually via ``.at``, column by column."""
    for column in df_pd.columns:
        for label in df_pd.index:
            _value = df_pd.at[label, column]

In [33]:
res_rc = %timeit -o rc_get_cell()

1 loop, best of 3: 846 ms per loop


In [34]:
res_pd = %timeit -o pd_get_cell()

1 loop, best of 3: 1.09 s per loop


In [35]:
add_results('get cell')

In [36]:
print(results)

index                      raccoon       pandas       ratio
----------------------  ----------  -----------  ----------
initialize empty        0.0829567    2.76704     0.0299803
initialize with matrix  0.00155941   0.00977127  0.159591
add rows one column     0.0538965   19.9965      0.00269529
add matrix              0.00805453   0.204356    0.0394142
append                  0.0683559    0.217914    0.313684
get cell                0.846118     1.09187     0.774925


In [37]:
# get column all index

def get_column_all_rc():
    """Benchmark body: fetch each column of ``df_rc`` in full, one ``get`` call per column."""
    for name in df_rc.columns:
        _column = df_rc.get(columns=name)
        
def get_column_all_pd():
    """Benchmark body: fetch each column of ``df_pd`` in full, one ``df[name]`` lookup per column."""
    for name in df_pd.columns:
        _column = df_pd[name]

In [38]:
res_rc = %timeit -o get_column_all_rc()

10 loops, best of 3: 43 ms per loop


In [39]:
res_pd = %timeit -o get_column_all_pd()

1000 loops, best of 3: 382 µs per loop


In [40]:
add_results('get column all index')

In [41]:
print(results)

index                      raccoon       pandas         ratio
----------------------  ----------  -----------  ------------
initialize empty        0.0829567    2.76704       0.0299803
initialize with matrix  0.00155941   0.00977127    0.159591
add rows one column     0.0538965   19.9965        0.00269529
add matrix              0.00805453   0.204356      0.0394142
append                  0.0683559    0.217914      0.313684
get cell                0.846118     1.09187       0.774925
get column all index    0.0429724    0.00038188  112.529


In [42]:
# get subset of the index of the column

def get_column_subset_rc():
    """Benchmark body: for every column of ``df_rc``, fetch 100 consecutive 10-row slices.

    Slice ``r`` requests row labels ``r*10`` through ``r*10 + 9`` inclusive, so
    the 100 slices together request every label from 0 through 999 once per
    column. This mirrors the 10-row slices used by ``set_column_subset_rc``.
    """
    for c in df_rc.columns:
        for r in range(100):
            # range() excludes its stop value, so the stop must be r*10 + 10 to
            # obtain ten labels (a stop of r*10 + 9 silently skipped every 10th row).
            rows = list(range(r*10, r*10 + 10))
            x = df_rc.get(indexes=rows, columns=c)

def get_column_subset_pd():
    """Benchmark body: for every column of ``df_pd``, fetch 100 consecutive 10-row slices.

    Uses the same row labels as ``get_column_subset_rc`` (``r*10`` through
    ``r*10 + 9`` inclusive) so both libraries are asked for identical data.
    """
    for c in df_pd.columns:
        for r in range(100):
            # Stop is exclusive: r*10 + 10 yields exactly ten labels per slice.
            rows = list(range(r*10, r*10 + 10))
            x = df_pd.loc[rows, c]

In [43]:
res_rc = %timeit -o get_column_subset_rc()

1 loop, best of 3: 720 ms per loop


In [44]:
res_pd = %timeit -o get_column_subset_pd()

1 loop, best of 3: 6.86 s per loop


In [45]:
add_results('get column subset index')

In [46]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932


In [47]:
# get index all columns

def get_index_all_rc():
    """Benchmark body: fetch each row of ``df_rc`` across all columns, one ``get`` call per index label."""
    for label in df_rc.index:
        _row = df_rc.get(indexes=label)
        
def get_index_all_pd():
    """Benchmark body: fetch each row of ``df_pd`` across all columns, one ``.loc`` lookup per index label."""
    for label in df_pd.index:
        _row = df_pd.loc[label]

In [48]:
res_rc = %timeit -o get_index_all_rc()

1 loop, best of 3: 794 ms per loop


In [49]:
res_pd = %timeit -o get_index_all_pd()

1 loop, best of 3: 131 ms per loop


In [50]:
add_results('get index all columns')

In [51]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932
get index all columns    0.794428     0.131263      6.05217


Set
---

In [52]:
# Fixtures for the "set" benchmarks: a 1000 row x 100 column matrix, index [0...999].
# Rebuilds ``grid`` and both frames so the set tests start from newly created frames;
# each column 'a0' .. 'a99' receives its own copy of the list 0..999.

col = list(range(1000))
grid = {'a' + str(x): list(col) for x in range(100)}

df_rc = rc.DataFrame(data=grid, columns=sorted(grid))
df_pd = pd.DataFrame(data=grid, columns=sorted(grid))

In [53]:
# set cell

def rc_set_cell():
    """Benchmark body: overwrite every cell of ``df_rc`` with 99, one ``set`` call per cell."""
    for column in df_rc.columns:
        for label in df_rc.index:
            df_rc.set(label, column, 99)
            
def pd_set_cell():
    """Benchmark body: overwrite every cell of ``df_pd`` with 99, one ``.at`` assignment per cell."""
    for column in df_pd.columns:
        for label in df_pd.index:
            df_pd.at[label, column] = 99

In [54]:
res_rc = %timeit -o rc_set_cell()

1 loop, best of 3: 663 ms per loop


In [55]:
res_pd = %timeit -o pd_set_cell()

1 loop, best of 3: 1.1 s per loop


In [56]:
add_results('set cell')

In [57]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932
get index all columns    0.794428     0.131263      6.05217
set cell                 0.662594     1.10305       0.600692


In [58]:
# set column all index

def set_column_all_rc():
    """Benchmark body: set each column of ``df_rc`` to the scalar 99, one ``set`` call per column."""
    for name in df_rc.columns:
        # the value returned by set() is bound but never read
        _ignored = df_rc.set(columns=name, values=99)
        
def set_column_all_pd():
    """Benchmark body: set each column of ``df_pd`` to the scalar 99, one assignment per column."""
    for name in df_pd.columns:
        # chained assignment: 99 is bound to the throwaway local and stored into the column
        _ignored = df_pd[name] = 99

In [59]:
res_rc = %timeit -o set_column_all_rc()

100 loops, best of 3: 5.17 ms per loop


In [60]:
res_pd = %timeit -o set_column_all_pd()

100 loops, best of 3: 15.7 ms per loop


In [61]:
add_results('set column all index')

In [62]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932
get index all columns    0.794428     0.131263      6.05217
set cell                 0.662594     1.10305       0.600692
set column all index     0.00516976   0.0156922     0.329448


In [63]:
# set subset of the index of the column

def set_column_subset_rc():
    """Benchmark body: for every column of ``df_rc``, overwrite 100 consecutive 10-row slices.

    Slice ``b`` targets row labels ``b*10`` through ``b*10 + 9`` and writes the
    values 0..9 into them.
    """
    for column in df_rc.columns:
        for block in range(100):
            start = block * 10
            labels = list(range(start, start + 10))
            _ignored = df_rc.set(indexes=labels, columns=column, values=list(range(10)))
        
def set_column_subset_pd():
    """Benchmark body: for every column of ``df_pd``, overwrite 100 consecutive 10-row slices.

    Slice ``b`` targets row labels ``b*10`` through ``b*10 + 9`` and writes the
    values 0..9 into them through ``.loc``.
    """
    for column in df_pd.columns:
        for block in range(100):
            start = block * 10
            labels = list(range(start, start + 10))
            _ignored = df_pd.loc[labels, column] = list(range(10))

In [64]:
res_rc = %timeit -o set_column_subset_rc()

1 loop, best of 3: 515 ms per loop


In [65]:
res_pd = %timeit -o set_column_subset_pd()

1 loop, best of 3: 24.8 s per loop


In [66]:
add_results('set column subset index')

In [67]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932
get index all columns    0.794428     0.131263      6.05217
set cell                 0.662594     1.10305       0.600692
set column all index     0.00516976   0.0156922     0.329448
set column subset index  0.515115    24.7612        0.0208033


In [68]:
# One replacement row for the "set index" benchmarks: each column name maps to itself.
row = {name: name for name in grid}

In [69]:
# set index all columns

def set_index_all_rc():
    """Benchmark body: overwrite each row of ``df_rc`` with the module-level ``row`` mapping."""
    for label in df_rc.index:
        _ignored = df_rc.set(indexes=label, values=row)
        
def set_index_all_pd():
    """Benchmark body: overwrite each row of ``df_pd`` with the module-level ``row`` mapping via ``.loc``."""
    for label in df_pd.index:
        _ignored = df_pd.loc[label] = row

In [70]:
res_rc = %timeit -o set_index_all_rc()

10 loops, best of 3: 60.3 ms per loop


In [71]:
res_pd = %timeit -o set_index_all_pd()

1 loop, best of 3: 546 ms per loop


In [72]:
add_results('set index all columns')

In [73]:
print(results)

index                       raccoon       pandas         ratio
-----------------------  ----------  -----------  ------------
initialize empty         0.0829567    2.76704       0.0299803
initialize with matrix   0.00155941   0.00977127    0.159591
add rows one column      0.0538965   19.9965        0.00269529
add matrix               0.00805453   0.204356      0.0394142
append                   0.0683559    0.217914      0.313684
get cell                 0.846118     1.09187       0.774925
get column all index     0.0429724    0.00038188  112.529
get column subset index  0.719855     6.86022       0.104932
get index all columns    0.794428     0.131263      6.05217
set cell                 0.662594     1.10305       0.600692
set column all index     0.00516976   0.0156922     0.329448
set column subset index  0.515115    24.7612        0.0208033
set index all columns    0.0603223    0.546263      0.110427


Sort
-----

In [74]:
# Fixtures for the sort benchmark: 1000x100 frames built from ``grid`` whose
# index labels run in reverse order, 999 down to 0.

rev = list(range(999, -1, -1))

df_rc = rc.DataFrame(data=grid, index=rev)
df_pd = pd.DataFrame(grid, index=rev)

In [75]:
res_rc = %timeit -o df_rc.sort_index() 

100 loops, best of 3: 13 ms per loop


In [76]:
res_pd = %timeit -o df_pd.sort_index()

The slowest run took 9.80 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 816 µs per loop


In [77]:
add_results('sort index')

In [78]:
print(results)

index                       raccoon        pandas         ratio
-----------------------  ----------  ------------  ------------
initialize empty         0.0829567    2.76704        0.0299803
initialize with matrix   0.00155941   0.00977127     0.159591
add rows one column      0.0538965   19.9965         0.00269529
add matrix               0.00805453   0.204356       0.0394142
append                   0.0683559    0.217914       0.313684
get cell                 0.846118     1.09187        0.774925
get column all index     0.0429724    0.00038188   112.529
get column subset index  0.719855     6.86022        0.104932
get index all columns    0.794428     0.131263       6.05217
set cell                 0.662594     1.10305        0.600692
set column all index     0.00516976   0.0156922      0.329448
set column subset index  0.515115    24.7612         0.0208033
set index all columns    0.0603223    0.546263       0.110427
sort index               0.0129654    0.000815632   15.8961
