In [None]:
from pathlib import Path as PythonPath

import numpy as np
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.fs as fs
import pyarrow.parquet as pq

from object_store.arrow import ArrowFileSystemHandler

# Ten-row demo table: integer ids, random floats, and an alternating 1/2 column.
table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5})

# Root both filesystems at the current working directory.
base = PythonPath.cwd()
# object_store handler wrapped so that pyarrow can use it as a filesystem
store = fs.PyFileSystem(ArrowFileSystemHandler(str(base.absolute())))
# pyarrow's own local filesystem over the same root; not referenced again in this cell
arrow_fs = fs.SubTreeFileSystem(str(base.absolute()), fs.LocalFileSystem())

# Table.slice takes (offset, length), not (start, stop):
# the first file gets rows 0-4, the second rows 5-9.
pq.write_table(table.slice(0, 5), "data/data1.parquet", filesystem=store)
pq.write_table(table.slice(5, 5), "data/data2.parquet", filesystem=store)

# Open both parquet files under "data" as a single dataset through the same filesystem.
dataset = ds.dataset("data", format="parquet", filesystem=store)

In [None]:
from pathlib import Path as PythonPath

import numpy as np
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.fs as fs
import pyarrow.parquet as pq

from object_store.arrow import ArrowFileSystemHandler

import pickle

table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5})

# Handler rooted at the current working directory.
base = PythonPath.cwd()
store = ArrowFileSystemHandler(str(base.absolute()))

# Round-trip the handler through pickle via a file on disk,
# then exercise the restored copy.
pickle_path = PythonPath("asd.pkl")
pickle_path.write_bytes(pickle.dumps(store))
store_pkl = pickle.loads(pickle_path.read_bytes())

store_pkl.get_file_info(["asd.pkl"])

In [None]:
from object_store import ObjectMeta, ObjectStore

# An in-memory store keeps the demonstration self-contained: data is not
# persisted and is not shared across store instances.
store = ObjectStore("memory://")

# write a blob and read it back in full
store.put("data", b"some data")

data = store.get("data")
assert data == b"some data"

# list the stored objects and fetch the metadata of one of them
blobs = store.list()

meta: ObjectMeta = store.head("data")

# Read only the first four bytes. The result is bound to `data_range` rather
# than `range` so the builtin stays usable (other cells call `range(10)`).
data_range = store.get_range("data", start=0, length=4)
assert data_range == b"some"

# copy the blob under a new key and check that the contents match
store.copy("data", "copied")
copied = store.get("copied")
assert copied == data

In [None]:
import duckdb

# NOTE(review): `dataset` in the SQL text presumably refers to the pyarrow
# dataset variable created in an earlier cell — confirm.
filter_query = "SELECT * FROM dataset WHERE c = 2"

con = duckdb.connect()
results = con.execute(filter_query).arrow()

# show the result's shape as the cell output
results.shape

In [None]:
# Paths of the files written by `ds.write_dataset`, filled in by `file_visitor`.
visited_paths = []


def file_visitor(written_file):
    """Record the path of one file written by ``ds.write_dataset``.

    Args:
        written_file: object handed over by pyarrow for each file it writes;
            its ``path`` attribute holds the location of that file.
    """
    visited_paths.append(written_file.path)


# Hive-style partitioning on the integer column "c".
partitioning = ds.partitioning(pa.schema([("c", pa.int64())]), flavor="hive")
ds.write_dataset(
    table,
    "partitioned",
    partitioning=partitioning,
    format="parquet",
    filesystem=store,
    file_visitor=file_visitor,
    # The default ("error") refuses to write once "partitioned" already holds
    # data, which makes every re-run of this cell fail.
    existing_data_behavior="overwrite_or_ignore",
)

# number of files the visitor was called for
len(visited_paths)

In [None]:
# Hive-style partitioning on the integer column "c".
partition_schema = pa.schema([pa.field("c", pa.int64())])
partitioning = ds.partitioning(partition_schema, flavor="hive")

# Open the "/partitioned" directory as a parquet dataset using that partitioning.
dataset_part = ds.dataset(
    "/partitioned",
    filesystem=store,
    format="parquet",
    partitioning=partitioning,
)
dataset_part.schema

In [None]:
from object_store import ObjectStore

# Options passed to the store for the "az://delta-rs" location.
azure_options = {
    "account_name": "mlfusiondev",
    "use_azure_cli": "true",
}
store = ObjectStore("az://delta-rs", options=azure_options)

# list the store's contents as the cell output
store.list()

In [None]:
import os

import pyarrow.fs as pa_fs

from object_store import ClientOptions
from object_store.arrow import ArrowFileSystemHandler

# Account name and key are read from the environment; a missing variable
# raises KeyError.
storage_options = dict(
    account_name=os.environ["AZURE_STORAGE_ACCOUNT_NAME"],
    account_key=os.environ["AZURE_STORAGE_ACCOUNT_KEY"],
)

# Build the handler for "adl://simple" first, then wrap it for pyarrow.
handler = ArrowFileSystemHandler("adl://simple", storage_options, ClientOptions())
filesystem = pa_fs.PyFileSystem(handler)
filesystem.get_file_info(["part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet"])