In [1]:
# Turn off periodic notebook autosave for this session.
%autosave 0
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs (presumably so edits to arrowloaf show up without a kernel restart).
%load_ext autoreload
%autoreload 2

Autosave disabled


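## ArrowLoaf

A tour of `ArrowLoaf`: build a table, inspect it, read / save / concatenate Parquet files, generate DataFrames in chunks, and filter columns and rows.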

In [2]:
from pathlib import Path
from arrowloaf import ArrowLoaf

In [3]:
# All paths used in this notebook live under one data folder in the user's home.
FOLDER = Path('~/data/Loaf').expanduser()
CHUNKS = FOLDER.joinpath('Chunks')          # directory holding the numbered chunk files
INPATH = FOLDER.joinpath('names.parquet')   # Parquet file read in the "read" section
OUTPATH = FOLDER.joinpath('clone.parquet')  # destination written in the "save" section

In [4]:
print(ArrowLoaf.__doc__)


    Query, read, and save Parquet tables using pandas and pyarrow.

    Inputs
        data    ArrowLoaf, DataFrame, RecordBatch, Series, Table
                OR any valid DataFrame input.
        index   bool: Keep index column? (Ignored for non-pandas input.)

    Magic
        self[key]   pyarrow.Column: Selected column.
        iter(self)  iterator: Column names.
        len(self)   int: Count rows in table.

    Example
        path = '/path/to/data.parquet'
        data = ArrowLoaf.read(path,columns=['spam','eggs'])
        data = data.query('spam > 0 and eggs > 42',chunksize=100_000)
        data.frame().to_csv('path/to/newdata.csv')
    


## build an ArrowLoaf

In [5]:
# from nothing
ArrowLoaf().frame()

In [6]:
# from any valid DataFrame input
# The raw columns get their own name so `data` only ever refers to the ArrowLoaf.
records = {
    'name': ['Arthur','Lancelot','Knight'],
    'quest': ['grail','grail','ni!'],
}
data = ArrowLoaf(records)
data.frame()

Unnamed: 0,name,quest
0,Arthur,grail
1,Lancelot,grail
2,Knight,ni!


In [7]:
# from DataFrame with index
# Move 'name' into the index first; index=True asks ArrowLoaf to keep that index.
indexed_frame = data.frame().set_index('name')
ixdata = ArrowLoaf(indexed_frame,index=True)
ixdata.frame()

Unnamed: 0_level_0,quest
name,Unnamed: 1_level_1
Arthur,grail
Lancelot,grail
Knight,ni!


## inspect data

In [8]:
print(data)

ArrowLoaf
3 x 2
name: string
quest: string


In [9]:
data.head(2)

Unnamed: 0,name,quest
0,Arthur,grail
1,Lancelot,grail


In [10]:
data.columns

['name', 'quest']

In [11]:
data.shape

(3, 2)

## read Parquet file

In [12]:
%time bigdata = ArrowLoaf.read(INPATH)
print(bigdata)

CPU times: user 1.72 s, sys: 640 ms, total: 2.36 s
Wall time: 2.37 s
ArrowLoaf
8,808,095 x 6
nconst: string
primaryName: string
birthYear: double
deathYear: double
primaryProfession: string
knownForTitles: string


In [13]:
bigdata.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0072308,tt0043044,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0038355,tt0117057,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0054452,tt0057345,tt0059956,tt0049189"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0072562,tt0077975"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0060827,tt0083922,tt0050986,tt0050976"


## save Parquet file

In [14]:
# Time writing the table to OUTPATH.
%time bigdata.save(OUTPATH)
# Round-trip check: the file just written must read back equal to the
# in-memory table (equality as defined by ArrowLoaf's own ==).
assert bigdata == ArrowLoaf.read(OUTPATH)

CPU times: user 3.41 s, sys: 500 ms, total: 3.91 s
Wall time: 6.14 s


## concatenate Parquet files

In [15]:
# Chunk files are named 0.parquet, 1.parquet and 2.parquet inside CHUNKS.
paths = [ CHUNKS/"{}.parquet".format(n) for n in range(3) ]

%time catloaf = ArrowLoaf.cat(paths)
print(catloaf)

CPU times: user 640 ms, sys: 100 ms, total: 740 ms
Wall time: 746 ms
ArrowLoaf
3,000,000 x 6
nconst: string
primaryName: string
birthYear: double
deathYear: double
primaryProfession: string
knownForTitles: string


## generate DataFrames

In [16]:
help(ArrowLoaf.chunks)

Help on function chunks in module arrowloaf:

chunks(self, chunksize=1000000)
    Generate Dataframes with limited maximum row count.



In [17]:
%%time
# Bound str.format: fills in the frame's type and a comma-grouped row count.
msg = "{} with {:,} rows".format
# Walk the table as a sequence of DataFrames, requesting a chunksize of 1,000,000 rows.
for df in bigdata.chunks(1_000_000):
    print(msg(type(df),len(df)))

<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 1,000,000 rows
<class 'pandas.core.frame.DataFrame'> with 808,095 rows
CPU times: user 2.81 s, sys: 680 ms, total: 3.49 s
Wall time: 3.45 s


## filter columns

In [18]:
help(ArrowLoaf.select)

Help on function select in module arrowloaf:

select(self, columns)
    ArrowLoaf: Selected columns in selected order.



In [19]:
cols = ['primaryName','birthYear','primaryProfession']
%time bigdata.select(cols).head()

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 1.77 ms


Unnamed: 0,primaryName,birthYear,primaryProfession
0,Fred Astaire,1899.0,"soundtrack,actor,miscellaneous"
1,Lauren Bacall,1924.0,"actress,soundtrack"
2,Brigitte Bardot,1934.0,"actress,soundtrack,producer"
3,John Belushi,1949.0,"actor,writer,soundtrack"
4,Ingmar Bergman,1918.0,"writer,director,actor"


## filter rows

In [20]:
help(ArrowLoaf.loaf)

Help on function loaf in module arrowloaf:

loaf(self, func, chunksize=1000000)
    ArrowLoaf: Generate DataFrames. Apply function to each frame.
    Loaf results together. Function must not change table schema.



In [21]:
def classic(df):
    """Return the rows of ``df`` whose ``birthYear`` is earlier than 1920.

    Rows with a missing ``birthYear`` do not satisfy the comparison and
    are dropped. The column layout of ``df`` is left unchanged.
    """
    born_before_1920 = 'birthYear < 1920'
    return df.query(born_before_1920)

%time bigdata.loaf(classic).head()

CPU times: user 3.35 s, sys: 130 ms, total: 3.48 s
Wall time: 3.4 s


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm9082621,Joe Gill,1919.0,2006.0,writer,"tt1641384,tt2820466,tt6025022,tt6556890"
1,nm9091356,Ersilio Tonini,1914.0,2013.0,,tt4456688
2,nm9101175,Valeriya Gerasimova,1903.0,,writer,tt7084996
3,nm9102000,Lincoln Borglum,1912.0,1986.0,,tt0048907
4,nm9109707,Gianfranco Miglio,1918.0,2001.0,,tt4456688


In [22]:
def baldwins(df):
    """Return the rows of ``df`` whose ``primaryName`` contains 'Baldwin'.

    The match is a literal, case-sensitive substring test. Rows with a
    missing ``primaryName`` count as non-matches (``na=False``); without
    that, the mask would contain NaN and ``df.loc`` would raise ValueError
    when asked to filter with it.
    """
    is_baldwin = df['primaryName'].str.contains('Baldwin',na=False,regex=False)
    return df.loc[is_baldwin]

%time len(bigdata.loaf(baldwins))

CPU times: user 7.33 s, sys: 580 ms, total: 7.91 s
Wall time: 7.9 s


1537