## Testing Streaming Output into Parquet with PyArrow

In [21]:
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import torch
from datasets import load_dataset

### Testing out basic table creation

In [10]:
# Build a tiny two-row reference table: a string column plus a column that
# holds one 256-element float vector per row. No schema is passed, so pyarrow
# infers the column types from the data.
table = pa.table({
    "text": pa.array([
        "The quick brown fox jumped over the log.",
        "Color dreams sleep furiously.",
    ]),
    "embeddings": pa.array([torch.zeros(256).numpy() for _ in range(2)]),
})

In [11]:
# Display the schema pyarrow inferred for `table` (column names and types).
table.schema

text: string
embeddings: list<item: float>
  child 0, item: float

### Testing streaming out into Parquet file
**TODO:** What are the best practices for splitting the output into multiple files / shards?

#### Write random data

In [12]:
# Stream three small batches into a single Parquet file.
#
# The writer is opened with the schema of the reference `table` built earlier,
# and every batch written must match that schema.
#
# A dedicated, seeded generator drives the random embeddings so that
# Restart & Run All rewrites the same data, without touching torch's global
# RNG state.
embedding_rng = torch.Generator().manual_seed(0)

with pq.ParquetWriter('testing.tmp.parquet', table.schema) as writer:
    for i in range(3):
        # Bind each chunk to `batch` rather than reassigning `table`, so the
        # reference table whose schema opened the writer is left intact.
        batch = pa.table(
            [
                pa.array([
                    f"{i}: The quick brown fox jumped over the log.",
                    f"{i}: Color dreams sleep furiously.",
                ]),
                pa.array([
                    torch.randn(256, generator=embedding_rng).numpy(),
                    torch.randn(256, generator=embedding_rng).numpy(),
                ]),
            ],
            names=["text", "embeddings"],
        )

        # Append this batch to the open file; the file is finalized when the
        # `with` block exits.
        writer.write_table(batch)

In [18]:
# Shell out to report the on-disk size of every .parquet file in the working directory.
!du -h *.parquet

12K	testing.tmp.parquet


#### Read it back via a Hugging Face iterable dataset

In [20]:
# Open the file written by the writer loop and print its file-level metadata
# (column count, row count, row groups, format version) as a sanity check
# before reading the rows back.
parquet_file = pq.ParquetFile('testing.tmp.parquet')
metadata = parquet_file.metadata
print(metadata)

<pyarrow._parquet.FileMetaData object at 0x7f31b8ac38d0>
  created_by: parquet-cpp-arrow version 17.0.0
  num_columns: 2
  num_rows: 6
  num_row_groups: 3
  format_version: 2.6
  serialized_size: 1202


In [35]:
# Load the Parquet file as the 'train' split in streaming mode, so rows are
# iterated rather than materialized up front.
# NOTE(review): `batch_size` is forwarded to the parquet loader; presumably it
# sets how many rows are read per batch. Confirm against the datasets docs.
dataset = load_dataset(
    "parquet",
    data_files={'train': 'testing.tmp.parquet'},
    streaming=True,
    batch_size=1024,
)

In [38]:
# Walk the streaming 'train' split in batches of up to 1024 rows; each batch is
# keyed by column name.
for batch in dataset['train'].iter(batch_size=1024):
    column_names = list(batch.keys())
    print(column_names, len(batch['text']), len(batch['embeddings']))

    # Convert the first row's embedding to a numpy array and report its type.
    first_embedding = np.array(batch['embeddings'][0])
    print(type(first_embedding))

['text', 'embeddings'] 6 6
<class 'numpy.ndarray'>
