In [1]:
import pyarrow.plasma as plasma
# Connect to the Plasma store through the socket at /tmp/plasma.
# NOTE(review): assumes a store is already listening on this path — confirm setup.
client = plasma.connect("/tmp/plasma")

In [2]:
# Build an ObjectID from an explicit 20-byte string (twenty b"a" bytes) and
# echo it as the cell output. Note: this rebinds the builtin name `id` for the
# rest of the notebook session.
id = plasma.ObjectID(b"a" * 20)
id

ObjectID(6161616161616161616161616161616161616161)

In [4]:
# The random generation of Object IDs is often good enough to ensure unique IDs. You can easily create a helper function that randomly generates object IDs as follows:
import numpy as np

def random_object_id():
    """Return a new plasma.ObjectID built from 20 random bytes.

    The bytes are drawn with ``np.random.bytes``, so the result depends on
    NumPy's global random state.
    """
    raw_id = np.random.bytes(20)
    return plasma.ObjectID(raw_id)

In [6]:
# Store a Python object in Plasma; put() returns the ID that is used to
# retrieve it afterwards.
object_id = client.put("hello, world")

# Get the object back by its ID. As the last expression of the cell, the
# result is displayed as the cell output.
client.get(object_id)

'hello, world'

In [7]:
import pyarrow.plasma as plasma
import time

# Reconnect to the store; this rebinds `client`, which the first cell already
# connected through the same socket path.
client = plasma.connect("/tmp/plasma")

# Store a first object.
client.put("hello, world")
# Sleep a little so we get different creation times
time.sleep(2)
# Store a second object after the pause.
client.put("another object")
# Create an object that is not sealed yet
object_id = plasma.ObjectID.from_random()
client.create(object_id, 100)
# list() maps each ObjectID to a dict of per-object info (data_size,
# metadata_size, ref_count, create_time, construct_duration, state), as the
# printed output of this cell shows.
print(client.list())

{ObjectID(19b2b9d8f32e6c93ba6582f6613006da6ed4397e): {'data_size': 512, 'metadata_size': 0, 'ref_count': 0, 'create_time': 1549222662, 'construct_duration': 0, 'state': 'sealed'}, ObjectID(57ea70c4a54470126d38ce22e117b08332b7911c): {'data_size': 512, 'metadata_size': 0, 'ref_count': 0, 'create_time': 1549222738, 'construct_duration': 0, 'state': 'sealed'}, ObjectID(243466a67be9870020311598d1301e62fc66c719): {'data_size': 100, 'metadata_size': 0, 'ref_count': 1, 'create_time': 1549222740, 'construct_duration': -1, 'state': 'created'}, ObjectID(9c4eb639cb6652a88163184a25e3ae60d7aa77b1): {'data_size': 512, 'metadata_size': 0, 'ref_count': 0, 'create_time': 1549222665, 'construct_duration': 0, 'state': 'sealed'}, ObjectID(19a6f8ba099828c7fc57c0babeb85b3c9e476f3f): {'data_size': 512, 'metadata_size': 0, 'ref_count': 0, 'create_time': 1549222740, 'construct_duration': 0, 'state': 'sealed'}}


In [8]:
import pyarrow as pa
import pandas as pd

# Build a small pandas DataFrame from two Series whose indexes differ:
# 'one' has no value for index label 'd'.
d = dict(
    one=pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    two=pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(d)

# Convert the pandas DataFrame into a PyArrow RecordBatch. Later cells pass
# this batch and its .schema to the stream writers.
record_batch = pa.RecordBatch.from_pandas(df)

Creating the Plasma object requires an ObjectID and the size of the data. Now that we have converted the Pandas DataFrame into a PyArrow RecordBatch, use the MockOutputStream to determine the size of the Plasma object.



In [9]:
# Create the Plasma object from the PyArrow RecordBatch. Most of the work here
# is done to determine the size of buffer to request from the object store.

# Use the notebook's random_object_id() helper instead of repeating its body
# inline; it produces the same kind of ID from 20 random bytes.
object_id = random_object_id()

# Dry run: stream the batch into a MockOutputStream and read back the sink's
# size afterwards to learn how much space the serialized batch needs.
mock_sink = pa.MockOutputStream()
stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
stream_writer.write_batch(record_batch)
stream_writer.close()
data_size = mock_sink.size()

# Request a buffer of exactly that size from the store. The object is only
# created here; it is written and sealed in later cells.
buf = client.create(object_id, data_size)


In [12]:
# Print the size of the buffer that client.create returned for the object.
print(buf.size)

1196


In [13]:
# Write the PyArrow RecordBatch to Plasma: wrap the buffer obtained from
# client.create in a FixedSizeBufferWriter and stream the batch into it.
stream = pa.FixedSizeBufferWriter(buf)
# Rebinds stream_writer; the earlier writer targeted the mock sink.
stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
stream_writer.write_batch(record_batch)
# Close the writer once the batch is written; the object is sealed afterwards.
stream_writer.close()

In [14]:
# Finally, seal the finished object for use by all clients. The object was
# created with client.create and filled through the stream writer.

# Seal the Plasma object
client.seal(object_id)

## Getting Pandas DataFrames from Plasma
* Reference: https://arrow.apache.org/docs/python/plasma.html#getting-pandas-dataframes-from-plasma

Since we stored the pandas DataFrame as a PyArrow RecordBatch object, getting the object back from the Plasma store follows steps similar to those described in "Getting Arrow Objects from Plasma".

We first have to convert the PlasmaBuffer returned from client.get_buffers into an Arrow BufferReader object.

In [15]:
# Fetch the Plasma object
# get_buffers takes a list of IDs; the single-element unpacking on the left
# requires exactly one buffer to come back.
[data] = client.get_buffers([object_id])  # Get PlasmaBuffer from ObjectID
# Wrap the buffer in a BufferReader so it can be handed to a stream reader.
buffer = pa.BufferReader(data)

In [16]:
# From the BufferReader, we can create a specific RecordBatchStreamReader in Arrow to reconstruct the stored PyArrow RecordBatch object.

# Convert object back into an Arrow RecordBatch
reader = pa.RecordBatchStreamReader(buffer)
# Note: this rebinds record_batch, which previously held the batch built
# directly from the DataFrame.
record_batch = reader.read_next_batch()

In [17]:
# The last step is to convert the PyArrow RecordBatch object back into the original Pandas DataFrame structure.

# Convert back into Pandas. At this point record_batch is the batch read back
# from the Plasma buffer by the stream reader.
result = record_batch.to_pandas()