In [None]:
from pathlib import Path as PythonPath

import numpy as np
import pyarrow as pa
import pyarrow.fs as fs
import pyarrow.dataset as ds
import pyarrow.parquet as pq

from object_store import ArrowFileSystemHandler

# Small demo table: ten rows, with `c` alternating 1/2 so it can later be used
# as a two-valued filter / partition key.
rng = np.random.default_rng(42)  # seeded so every re-run writes identical data
table = pa.table({"a": range(10), "b": rng.standard_normal(10), "c": [1, 2] * 5})

# Both filesystems are rooted at the current working directory.
base = PythonPath.cwd()
root = str(base.absolute())
store = fs.PyFileSystem(ArrowFileSystemHandler(root))
# Native Arrow filesystem over the same root; not referenced by the cells
# visible here (presumably kept for comparison with `store`).
arrow_fs = fs.SubTreeFileSystem(root, fs.LocalFileSystem())

# Table.slice takes (offset, length), not (start, stop): rows 0-4 go to the
# first file and rows 5-9 to the second.
pq.write_table(table.slice(0, 5), "data/data1.parquet", filesystem=store)
pq.write_table(table.slice(5, 5), "data/data2.parquet", filesystem=store)

# Open both files as a single dataset, again going through the object store.
dataset = ds.dataset("data", format="parquet", filesystem=store)


In [None]:
import duckdb

# The SQL refers to `dataset` by name; DuckDB's replacement scan resolves it
# to the PyArrow dataset variable defined in the setup cell.
query = "SELECT * FROM dataset WHERE c = 2"
con = duckdb.connect()
results = con.execute(query).arrow()

# (rows, columns) of the filtered result; half of the ten demo rows have
# c == 2, so a clean `data/` directory should give (5, 3).
results.shape

In [None]:
# Reset on every run of this cell so re-execution does not accumulate stale
# entries from a previous write.
visited_paths = []


def file_visitor(written_file):
    """Record the path of a file produced by ``ds.write_dataset``.

    Parameters
    ----------
    written_file
        Object handed to the ``file_visitor`` callback by PyArrow; its
        ``path`` attribute is the location of the file that was written.
    """
    # Store only the path so `visited_paths` really is a list of paths,
    # rather than a list of the callback's wrapper objects.
    visited_paths.append(written_file.path)


# Hive-style layout: one `c=<value>` directory per distinct value of `c`.
partitioning = ds.partitioning(pa.schema([("c", pa.int64())]), flavor="hive")
ds.write_dataset(
    table,
    "partitioned",
    partitioning=partitioning,
    format="parquet",
    filesystem=store,
    file_visitor=file_visitor,
    # The default ("error") refuses to write into a non-empty directory, so a
    # second "Run All" would fail once `partitioned/` exists. Overwriting
    # same-named files keeps this cell re-runnable.
    existing_data_behavior="overwrite_or_ignore",
)

# Number of files the writer reported through `file_visitor`.
len(visited_paths)


In [None]:
# Read back from the same store-relative path the write cell used
# ("partitioned", no leading slash), reusing the `partitioning` object defined
# there so the read and write schemas cannot drift apart.
dataset_part = ds.dataset(
    "partitioned",
    format="parquet",
    filesystem=store,
    partitioning=partitioning,
)
dataset_part.schema
