DigitalOcean Spaces is an S3-compatible object storage service.


In [None]:
import os

import boto3

# Credentials are read from the environment; an unset (or empty) variable
# results in an empty string rather than None.
DO_ACCESS_KEY_ID = os.environ.get("DO_ACCESS_KEY_ID", "")
DO_SECRET_ACCESS_KEY = os.environ.get("DO_SECRET_ACCESS_KEY", "")

# Region slug; it is also the first label of the endpoint host name below.
DO_REGION = "fra1"

# boto3 resource for the S3 API, pointed at the regional Spaces endpoint
# instead of boto3's default endpoint.
spaces = boto3.resource(
    "s3",
    region_name=DO_REGION,
    endpoint_url="https://" + DO_REGION + ".digitaloceanspaces.com",
    aws_secret_access_key=DO_SECRET_ACCESS_KEY,
    aws_access_key_id=DO_ACCESS_KEY_ID,
)

### List Buckets

At DigitalOcean, every Space is exactly one bucket, i.e. to create a new bucket, you'll have to buy another Space.


In [None]:
import duckdb

# In-memory DuckDB connection used to query Parquet files stored in Spaces.
duck = duckdb.connect()

# S3 settings for DuckDB. The endpoint is derived from DO_REGION so that it
# stays consistent with the region setting (it is a bare host name, no scheme).
s3_settings = {
    "s3_access_key_id": DO_ACCESS_KEY_ID,
    "s3_secret_access_key": DO_SECRET_ACCESS_KEY,
    "s3_region": DO_REGION,
    "s3_endpoint": f"{DO_REGION}.digitaloceanspaces.com",
}
for setting_name, setting_value in s3_settings.items():
    # Double any single quote so the value cannot terminate the SQL string
    # literal early.
    escaped_value = setting_value.replace("'", "''")
    duck.execute(f"SET {setting_name}='{escaped_value}'")

# Last expression of the cell: the relation is displayed by the notebook.
duck.read_parquet("s3://uniquestocks/datalake/raw/part-0.parquet")

In [None]:
import polars as pl

# Connection settings handed to Polars' cloud reader (path-style addressing:
# the bucket name is taken from the s3:// URL, not from the endpoint host).
storage_options = {
    "aws_endpoint_url": f"https://{DO_REGION}.digitaloceanspaces.com",
    "aws_access_key_id": DO_ACCESS_KEY_ID,
    "aws_secret_access_key": DO_SECRET_ACCESS_KEY,
}

# The target is a single CSV object, so it is scanned with scan_csv.
# scan_delta expects the root of a Delta table and cannot read a plain
# CSV file.
df = pl.scan_csv(
    "s3://uniquestocks/datalake/raw/test.csv",
    storage_options=storage_options,
)
df.collect()

In [None]:
import polars as pl

# Virtual-hosted-style variant: the bucket name ("uniquestocks") is part of
# the endpoint host name instead of the request path.
storage_options = {
    "aws_endpoint_url": f"https://uniquestocks.{DO_REGION}.digitaloceanspaces.com",
    "aws_access_key_id": DO_ACCESS_KEY_ID,
    "aws_secret_access_key": DO_SECRET_ACCESS_KEY,
    "aws_virtual_hosted_style_request": "true",
    # "aws_bucket_name": "uniquestocks",
}

# pl.scan_pyarrow_dataset only accepts an existing PyArrow Dataset object and
# has no storage_options parameter, so a URL plus storage_options has to go
# through one of Polars' native cloud scanners instead.
# NOTE(review): assumes the objects under this prefix are Parquet files, like
# the rest of the data lake read in this notebook - confirm.
df = pl.scan_parquet(
    "s3://raw/e/",
    storage_options=storage_options,
)
df

### PyArrow dataset


In [None]:
import pyarrow.dataset as ds
from pyarrow import fs

# PyArrow S3 filesystem bound to the regional Spaces host; the override is a
# bare host name without a scheme.
filesystem = fs.S3FileSystem(
    endpoint_override=f"{DO_REGION}.digitaloceanspaces.com",
    secret_key=DO_SECRET_ACCESS_KEY,
    access_key=DO_ACCESS_KEY_ID,
)

# Treat everything below the prefix as one Parquet dataset. With an explicit
# filesystem the path is written as "<bucket>/<key prefix>".
dataset = ds.dataset(
    "uniquestocks/datalake/raw/",
    filesystem=filesystem,
    format="parquet",
)

# Show the first ten rows as the cell output.
dataset.head(10)

In [None]:
# Wrap the PyArrow dataset in a Polars LazyFrame, then materialise it as the
# cell output.
lazy_frame = pl.scan_pyarrow_dataset(dataset)
lazy_frame.collect()

In [None]:
# Print the name of every bucket returned for the configured credentials,
# one name per line.
all_buckets = spaces.buckets.all()
for space in all_buckets:
    print(space.name)

### List Objects in Bucket


In [None]:
# Print the key of every object in the "uniquestocks" bucket, one per line.
uniquestocks_bucket = spaces.Bucket("uniquestocks")
for stored_object in uniquestocks_bucket.objects.all():
    print(stored_object.key)