In [60]:
# Dictionary-encoded array: every entry of `indices` points at a position in
# `dictionary`; a None index produces a null slot.
dictionary = pa.array(['foo', 'bar', 'baz'])
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dict_array = pa.DictionaryArray.from_arrays(
    indices=indices,
    dictionary=dictionary,
)
dict_array
 


<pyarrow.lib.DictionaryArray object at 0x7f9a2e5e9f50>

-- dictionary:
  [
    "foo",
    "bar",
    "baz"
  ]
-- indices:
  [
    0,
    1,
    0,
    1,
    2,
    0,
    null,
    2
  ]

In [61]:
# Three equal-length columns; None marks a missing value in a column.
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True]),
]

In [62]:
# Bundle the columns in `data` into one RecordBatch, naming them f0, f1, f2,
# then print its summary.
batch = pa.RecordBatch.from_arrays(
    data,
    names=['f0', 'f1', 'f2'],
)
print(batch)

pyarrow.RecordBatch
f0: int64
f1: string
f2: bool


Generating Schema

In [63]:
import pyarrow as pa

# Describe a single column named "number" of type int64 that is not nullable.
number_field = pa.field('number', pa.int64(), nullable=False)

# pa.schema() expects a list of fields.
schema_fields = [number_field]

# Build the schema from the fields.
schema = pa.schema(schema_fields)

# Metadata that tells Fletchgen the FPGA kernel is allowed to read from
# this schema.
metadata = {b'arrow_mode': b'read',
            b'arrow_name': b'ExampleBatch'}

# Attach the metadata. with_metadata() is used instead of the deprecated
# add_metadata(); the schema carries no metadata yet, so the result is the
# same.
schema = schema.with_metadata(metadata)

# Show the schema.
print(schema)

# Serialize the schema itself into an Arrow buffer.
serialized_schema = schema.serialize()

# Write the buffer to disk. The context manager flushes and closes the stream
# right away, so the file is complete before a later cell reopens it.
with pa.output_stream('schema.as') as sink:
    sink.write(serialized_schema)

number: int64 not null
-- schema metadata --
arrow_mode: 'read'
arrow_name: 'ExampleBatch'


  exec(code_obj, self.user_global_ns, self.user_ns)


Creating RecordBatch

In [64]:
import pyarrow as pa

# Read back the schema written in the previous example. The `with` block
# closes the input stream as soon as the schema has been parsed.
with pa.input_stream('schema.as') as stream:
    schema = pa.ipc.read_schema(stream)

# Show contents.
print(schema)

# Create a list of PyArrow Arrays. Every Array can be seen
# as a 'Column' of the RecordBatch we will create.
data = [pa.array([1, -3, 3, -7])]

# Create a RecordBatch from the Arrays. The Schema goes in through the
# `schema` keyword; the second positional slot is meant for column names.
recordbatch = pa.RecordBatch.from_arrays(data, schema=schema)

# Print the first (and only) column of this RecordBatch.
print(recordbatch.column(0))

# Create an Arrow RecordBatchFileWriter for the output file.
writer = pa.ipc.RecordBatchFileWriter('recordbatch.rb', schema)
try:
    # Write the RecordBatch.
    writer.write(recordbatch)
finally:
    # Always close the writer, even if the write fails, so the file handle
    # is released.
    writer.close()

number: int64 not null
-- schema metadata --
arrow_mode: 'read'
arrow_name: 'ExampleBatch'
[
  1,
  -3,
  3,
  -7
]




In [65]:
# Sparse union: `types` says, slot by slot, which child array supplies the
# value, and every child has the same length as `types`.
types = pa.array([0, 1, 1], type=pa.int8())
# Child 0: integer values.
xs = pa.array([5, 6, 7])
# Child 1: boolean values.
ys = pa.array([False, False, True])
# Assemble the union from the type ids and the two children.
union_arr = pa.UnionArray.from_sparse(types, [xs, ys])

In [66]:
# Dense union: `types` picks the child for each slot and `offsets` gives the
# position to read inside that child, so the children may have different
# lengths (3 and 2 here).
types = pa.array([0, 1, 1, 0, 0], type=pa.int8())
offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32())
# Child 0 (integers) and child 1 (booleans).
xs = pa.array([5, 6, 7])
ys = pa.array([False, True])
union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys])

In [67]:
# Build a dictionary-encoded array: the values live in `dictionary` and
# `indices` refers to them by position (None gives a null entry).
dictionary = pa.array(['foo', 'bar', 'baz'])
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dict_array = pa.DictionaryArray.from_arrays(
    indices=indices,
    dictionary=dictionary,
)

In [68]:
# Print the plain dictionary values first, then the dictionary-encoded array
# that refers to them.
print(dictionary)
print(dict_array)

[
  "foo",
  "bar",
  "baz"
]

-- dictionary:
  [
    "foo",
    "bar",
    "baz"
  ]
-- indices:
  [
    0,
    1,
    0,
    1,
    2,
    0,
    null,
    2
  ]


In [69]:
# Three equal-length columns; None marks a missing value in a column.
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True]),
]
# Combine the columns into a RecordBatch with the names f0, f1 and f2.
batch = pa.RecordBatch.from_arrays(
    data,
    names=['f0', 'f1', 'f2'],
)

In [70]:
# Print the RecordBatch summary.
print(batch)

pyarrow.RecordBatch
f0: int64
f1: string
f2: bool


In [71]:
# Build a Table from five references to the same RecordBatch.
batches = [batch for _ in range(5)]
table = pa.Table.from_batches(batches)

In [72]:
import pyarrow as pa
from pyarrow import csv

# Load the taxi-trip CSV into an Arrow Table and report how many rows it has.
nyc = csv.read_csv("yellow_tripdata_2015-01.csv")
print(nyc.num_rows)

999


In [73]:
# Display the column names and types of the table loaded from the CSV file.
nyc.schema

VendorID: int64
tpep_pickup_datetime: timestamp[s]
tpep_dropoff_datetime: timestamp[s]
passenger_count: int64
trip_distance: double
pickup_longitude: double
pickup_latitude: double
RateCodeID: int64
store_and_fwd_flag: string
dropoff_longitude: double
dropoff_latitude: double
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double

In [74]:
# Memory-map the CSV file read-only.
# NOTE(review): no visible cell closes this mapping, and the name `mmap` is
# also the name of a standard-library module; confirm neither matters here.
mmap = pa.memory_map(
    "yellow_tripdata_2015-01.csv",
    mode="r",
)

In [75]:
# Display the repr of the memory-mapped file object.
mmap

<pyarrow.lib.MemoryMappedFile at 0x7f9a33a06230>

In [76]:
# Size of `batch` once serialized. Despite its name, `record_batches` holds a
# single integer size, not a collection of batches; the name is kept because
# a later cell prints it. The function is taken from the pa.ipc namespace
# rather than the top-level pyarrow alias.
record_batches = pa.ipc.get_record_batch_size(batch)



In [77]:
# Print the value returned by get_record_batch_size for `batch`.
print(record_batches)

352
