Example file I/O to and from a pandas DataFrame, using the latest Jupyter SciPy notebook image with pyarrow. The aim is to read and write various file formats and in-memory buffers. Any data available as a byte stream can be passed to pypachy to commit to pfs.
Run the Docker container with a volume mount so that the output files are kept after the container (started with `--rm`) is removed.

docker run -it --rm -v /tmp/data:/home/jovyan/work -p 8888:8888 ryanmwhitephd/jupyter-scipy-arrow:73bdef0

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import pyarrow as pa

In [4]:
import pyarrow.parquet as pq

In [25]:
import csv

In [5]:
# Three-row sample frame: a float column containing a NaN, a string column,
# and a boolean column. Used as the payload for every format below.
df = pd.DataFrame(dict(
    one=[-1, np.nan, 2.5],
    two=['foo', 'bar', 'baz'],
    three=[True, False, True],
))

In [27]:
# Time writing the frame as CSV; every timing loop overwrites example.csv
# in the current working directory.
%timeit df.to_csv('example.csv')

4.43 ms ± 410 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# Convert the pandas frame to an Arrow Table; this is the object the
# Parquet writer calls below operate on.
table = pa.Table.from_pandas(df)

In [23]:
# Time writing the Arrow table to Parquet. The file left behind
# (example.parquet) is what the read cells below open.
%timeit pq.write_table(table, 'example.parquet')

3.96 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
%timeit table2 = pq.read_table('example.parquet')

4.8 ms ± 459 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Convert the Arrow table read back from Parquet into a pandas DataFrame for display.
table2.to_pandas()

Unnamed: 0,one,three,two
0,-1.0,True,foo
1,,False,bar
2,2.5,True,baz


In [10]:
# Read only a subset of columns. The returned table holds just 'one' and
# 'three', although the stored pandas metadata still describes every column.
pq.read_table('example.parquet', columns=['one', 'three'])

pyarrow.Table
one: double
three: bool
metadata
--------
{b'pandas': b'{"index_columns": ["__index_level_0__"], "column_indexes": [{"na'
            b'me": null, "field_name": null, "pandas_type": "unicode", "numpy_'
            b'type": "object", "metadata": {"encoding": "UTF-8"}}], "columns":'
            b' [{"name": "one", "field_name": "one", "pandas_type": "float64",'
            b' "numpy_type": "float64", "metadata": null}, {"name": "three", "'
            b'field_name": "three", "pandas_type": "bool", "numpy_type": "bool'
            b'", "metadata": null}, {"name": "two", "field_name": "two", "pand'
            b'as_type": "unicode", "numpy_type": "object", "metadata": null}, '
            b'{"name": null, "field_name": "__index_level_0__", "pandas_type":'
            b' "int64", "numpy_type": "int64", "metadata": null}], "pandas_ver'
            b'sion": "0.19.2"}'}

In [11]:
# Open the file through ParquetFile to inspect its metadata, schema,
# and individual row groups in the cells below.
parquet_file = pq.ParquetFile('example.parquet')

In [12]:
# File-level metadata: writer version, column and row counts, number of row groups.
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7f160f42b278>
  created_by: parquet-cpp version 1.4.0
  num_columns: 4
  num_rows: 3
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 1073

In [13]:
# Parquet-level schema. Besides the three data columns it lists
# __index_level_0__, which the pandas metadata names as the index column.
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x7f160b39d908>
one: DOUBLE
three: BOOLEAN
two: BYTE_ARRAY UTF8
__index_level_0__: INT64
 

In [14]:
# Number of row groups stored in the file.
parquet_file.num_row_groups

1

In [15]:
# Read a single row group (index 0) as an Arrow Table.
parquet_file.read_row_group(0)

pyarrow.Table
one: double
three: bool
two: string
__index_level_0__: int64
metadata
--------
{b'pandas': b'{"index_columns": ["__index_level_0__"], "column_indexes": [{"na'
            b'me": null, "field_name": null, "pandas_type": "unicode", "numpy_'
            b'type": "object", "metadata": {"encoding": "UTF-8"}}], "columns":'
            b' [{"name": "one", "field_name": "one", "pandas_type": "float64",'
            b' "numpy_type": "float64", "metadata": null}, {"name": "three", "'
            b'field_name": "three", "pandas_type": "bool", "numpy_type": "bool'
            b'", "metadata": null}, {"name": "two", "field_name": "two", "pand'
            b'as_type": "unicode", "numpy_type": "object", "metadata": null}, '
            b'{"name": null, "field_name": "__index_level_0__", "pandas_type":'
            b' "int64", "numpy_type": "int64", "metadata": null}], "pandas_ver'
            b'sion": "0.19.2"}'}

In [16]:
# Open an incremental writer for example2.parquet using the table's schema.
# This writer is not used as a context manager; it is closed explicitly in a later cell.
writer = pq.ParquetWriter('example2.parquet', table.schema)

In [18]:
# Append the same table three times through the open writer; the file
# reports three row groups when it is reopened below.
for batch_no in range(3):
    writer.write_table(table)

In [19]:
# Close the writer before the file is reopened for reading below.
writer.close()

In [20]:
# Reopen the file just written to check how the three writes were stored.
pf2 = pq.ParquetFile('example2.parquet')

In [21]:
# Row-group count of example2.parquet (three write_table calls were made).
pf2.num_row_groups

3

In [22]:
# Same three-write pattern into example3.parquet, this time using the
# writer as a context manager instead of calling close() by hand.
with pq.ParquetWriter('example3.parquet', table.schema) as writer:
    for batch_no in range(3):
        writer.write_table(table)

Serialization

In [28]:
# Serialization context used by the serialize / deserialize cells below.
# NOTE(review): pyarrow's custom serialization API (default_serialization_context,
# serialize, deserialize) was deprecated in later pyarrow releases — confirm the
# pyarrow version in the pinned image before upgrading.
context = pa.default_serialization_context()

In [33]:
%timeit serialized_df = context.serialize(df)

352 µs ± 19.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [30]:
# Break the serialized object into its components, the form that
# deserialize_components() consumes below.
df_components = serialized_df.to_components()

In [34]:
%timeit original_df = context.deserialize_components(df_components)

474 µs ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# Display the frame rebuilt from the serialized components.
# NOTE(review): this cell has no execution count or output in the saved
# notebook — re-run it to confirm the result matches `df`.
original_df

Arbitrary object serialization

In [35]:
# Large test payload: 100 random 500x500 matrices keyed by the integers 0..99.
data = {key: np.random.randn(500, 500) for key in range(100)}

In [36]:
# Serialize the whole dict and materialize the result as a single Arrow buffer.
buf = pa.serialize(data).to_buffer()

In [37]:
# Confirm the result is a pyarrow Buffer.
type(buf)

pyarrow.lib.Buffer

In [38]:
# Buffer size in bytes. Roughly 200 MB: 100 arrays x 500 x 500 values x 8 bytes
# is 200,000,000, and the remainder is serialization overhead.
buf.size

200029088

In [39]:
# Rebuild the Python object from the buffer.
restored_data = pa.deserialize(buf)

In [40]:
# Spot-check the array stored under key 0 (numpy abbreviates the printout).
restored_data[0]

array([[-0.34624114,  0.08653156,  1.08320732, ..., -0.28030783,
         0.16234606, -2.53999073],
       [-0.36150757, -1.88545953, -0.46077857, ..., -1.62094584,
        -0.76619533, -0.75660042],
       [ 0.47508163,  0.4879743 ,  1.11682785, ..., -0.69589778,
         1.1129186 , -0.04163293],
       ..., 
       [ 0.92065657, -1.62624437,  1.31386466, ...,  0.3664918 ,
         1.27692376,  1.06485014],
       [-0.74487828,  0.69882226, -0.26923223, ..., -0.04297438,
         0.09006656, -0.76505319],
       [-1.38031997,  1.12459625,  0.79685035, ...,  0.21678532,
         0.02705408, -0.24268212]])

Feather format

In [41]:
import pyarrow.feather as feather

# Write the frame to a Feather file by path, then read it straight back.
feather.write_feather(df, 'example.feather')
read_df = feather.read_feather('example.feather')

In [42]:
# Same Feather round trip, but through open binary file objects rather than paths.
with open('example2.feather', 'wb') as sink:
    feather.write_feather(df, sink)

with open('example2.feather', 'rb') as source:
    read_df = feather.read_feather(source)