# Create a geoparquet file 

In [3]:
import pystac_client
import json
from pathlib import Path
from stac_geoparquet.arrow import parse_stac_ndjson_to_arrow, to_parquet
from pyarrow.parquet import read_table

## Query a STAC API endpoint

In [4]:
# Root of the STAC API served from planetarycomputer.microsoft.com
stac_url = "https://planetarycomputer.microsoft.com/api/stac/v1"

# Temporal filter, expressed as a single "start/end" interval string
start_date, end_date = "2023-01-01", "2023-12-31"
date_range = "/".join((start_date, end_date))

# Area of interest: a rectangle described by its edge longitudes / latitudes
west, east = -9.775114442649567, 1.302203384529662
south, north = 35.48272079491679, 44.141939637656606
geometry = {
    "type": "Polygon",
    "coordinates": [
        [
            [west, north],
            [west, south],
            [east, south],
            [east, north],
            [west, north],  # repeat the first vertex to close the ring
        ]
    ],
}

# Open the catalog, then describe the item search against it
catalog = pystac_client.Client.open(stac_url)
search_filters = {
    "collections": ["sentinel-2-l2a"],
    "intersects": geometry,
    "datetime": date_range,
    "max_items": 1000,
}
search = catalog.search(**search_filters)

## Create a JSONL file

The JSON Lines text format, also called newline-delimited JSON, is a convenient format for storing structured data that may be processed one record at a time. 

Create a JSONL file with one feature (STAC Item) per line.

In [5]:
items_iter = search.items()

# Upper bound on the number of STAC Items written to disk
max_items = 1000
s2_json_path = Path("sentinel-2-l2a.jsonl")

# The JSONL file doubles as a cache: items are only fetched when it is missing.
if not s2_json_path.exists():
    # Write to a scratch file and rename it only after every item is on disk.
    # Writing straight to s2_json_path would leave a truncated file behind if
    # the loop is interrupted, and the exists() check above would then treat
    # that partial file as a complete cache on the next run.
    partial_path = s2_json_path.with_name(s2_json_path.name + ".part")
    with open(partial_path, "w", encoding="utf-8") as f:
        # Putting range() first makes zip() stop after max_items items
        # without pulling one extra item from the iterator.
        for _, item in zip(range(max_items), items_iter):
            # Compact separators keep each item on a single short line
            json.dump(item.to_dict(), f, separators=(",", ":"))
            f.write("\n")
    partial_path.replace(s2_json_path)

## Create a record batch reader

Use `stac_geoparquet.arrow.parse_stac_ndjson_to_arrow` to create an Apache Arrow record batch reader

See:

* `stac_geoparquet.arrow.parse_stac_ndjson_to_arrow` [https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow](https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow)
* `RecordBatchReader` [https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html#pyarrow.RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html#pyarrow.RecordBatchReader)

In [6]:
# Parse the newline-delimited STAC Items stored at s2_json_path into Arrow record batches.
record_batch_reader = parse_stac_ndjson_to_arrow(s2_json_path)

## Create a `pyarrow.lib.Table`

See  [https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow-table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow-table)

In [7]:
# Collect every batch from the reader into one in-memory table.
# NOTE(review): record batch readers are presumably single-pass, so re-running
# this cell alone would need record_batch_reader to be recreated first — confirm.
table = record_batch_reader.read_all()

# Last expression of the cell, so the notebook displays the table's schema.
table.schema

assets: struct<AOT: struct<gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B01: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width_half_max: double, name: string>>, gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B02: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width_half_max: double, name: string>>, gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B03: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width

Inspect the first five rows

In [8]:
# Slice the Arrow table before converting: only the five displayed rows are
# turned into pandas objects instead of the whole table.
table.slice(0, 5).to_pandas()

Unnamed: 0,assets,bbox,collection,geometry,id,links,stac_extensions,stac_version,type,constellation,...,s2:product_uri,s2:reflectance_conversion_factor,s2:saturated_defective_pixel_percentage,s2:snow_ice_percentage,s2:thin_cirrus_percentage,s2:unclassified_percentage,s2:vegetation_percentage,s2:water_percentage,sat:orbit_state,sat:relative_orbit
0,"{'AOT': {'gsd': 10.0, 'href': 'https://sentine...","{'xmin': -10.25298175, 'ymin': 43.25815001, 'x...",sentinel-2-l2a,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0e\x00...,S2B_MSIL2A_20231231T114409_R123_T29TMJ_2023123...,[{'href': 'https://planetarycomputer.microsoft...,[https://stac-extensions.github.io/eo/v1.1.0/s...,1.1.0,Feature,Sentinel 2,...,S2B_MSIL2A_20231231T114409_N0510_R123_T29TMJ_2...,1.03404,0.0,0.0,40.464744,0.0,0.0,0.708269,descending,123
1,"{'AOT': {'gsd': 10.0, 'href': 'https://sentine...","{'xmin': 0.45686052, 'ymin': 44.13799675, 'xma...",sentinel-2-l2a,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,S2A_MSIL2A_20231231T105441_R051_T31TCK_2023123...,[{'href': 'https://planetarycomputer.microsoft...,[https://stac-extensions.github.io/eo/v1.1.0/s...,1.1.0,Feature,Sentinel 2,...,S2A_MSIL2A_20231231T105441_N0510_R051_T31TCK_2...,1.034038,0.0,0.129087,13.664892,0.0,0.0,0.001739,descending,51
2,"{'AOT': {'gsd': 10.0, 'href': 'https://sentine...","{'xmin': 0.4959286, 'ymin': 43.2382626, 'xmax'...",sentinel-2-l2a,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,S2A_MSIL2A_20231231T105441_R051_T31TCJ_2023123...,[{'href': 'https://planetarycomputer.microsoft...,[https://stac-extensions.github.io/eo/v1.1.0/s...,1.1.0,Feature,Sentinel 2,...,S2A_MSIL2A_20231231T105441_N0510_R051_T31TCJ_2...,1.034038,0.0,1e-05,18.065368,8.6e-05,0.0,0.010481,descending,51
3,"{'AOT': {'gsd': 10.0, 'href': 'https://sentine...","{'xmin': 0.53321795, 'ymin': 42.33836183, 'xma...",sentinel-2-l2a,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,S2A_MSIL2A_20231231T105441_R051_T31TCH_2023123...,[{'href': 'https://planetarycomputer.microsoft...,[https://stac-extensions.github.io/eo/v1.1.0/s...,1.1.0,Feature,Sentinel 2,...,S2A_MSIL2A_20231231T105441_N0510_R051_T31TCH_2...,1.034038,0.0,0.055783,4.123205,0.047372,0.522049,0.014681,descending,51
4,"{'AOT': {'gsd': 10.0, 'href': 'https://sentine...","{'xmin': 0.5688037, 'ymin': 41.4388363, 'xmax'...",sentinel-2-l2a,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,S2A_MSIL2A_20231231T105441_R051_T31TCG_2023123...,[{'href': 'https://planetarycomputer.microsoft...,[https://stac-extensions.github.io/eo/v1.1.0/s...,1.1.0,Feature,Sentinel 2,...,S2A_MSIL2A_20231231T105441_N0510_R051_T31TCG_2...,1.034038,0.0,0.0,4.116742,0.269664,0.144867,0.097903,descending,51


## Serialize as GeoParquet

Use `stac_geoparquet.arrow.to_parquet` to serialize as geoparquet.

See [https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.to_parquet](https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.to_parquet)

In [9]:
# Output file name; a relative path, so it is resolved against the current working directory.
s2_parquet_path = "s2-stac-api.parquet"
# Write the in-memory table to that path with stac-geoparquet's writer.
to_parquet(table, s2_parquet_path)

## Verify serialized geoparquet

Use `pyarrow.parquet.read_table` to read the file back and compare it with the in-memory table; see [https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html#pyarrow-parquet-read-table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html#pyarrow-parquet-read-table)


In [10]:
# Load the file that was just written and compare it with the source table.
# `==` on a pyarrow Table delegates to Table.equals, so the cell still
# displays a single boolean.
roundtrip_table = read_table(s2_parquet_path)
roundtrip_table.equals(table)

True