In [None]:
import pandas as pd
from pyiceberg.catalog import load_catalog
import os
import sys

# Make the pipeline package importable. The checkout location can be overridden
# with the WRM_PIPELINE_PATH environment variable; the original absolute path is
# kept as the fallback so existing setups behave exactly as before.
WRM_PIPELINE_PATH = os.environ.get(
    "WRM_PIPELINE_PATH",
    '/home/tr1x/programming/bike-data-flow/wrm_pipeline',
)
# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if WRM_PIPELINE_PATH not in sys.path:
    sys.path.append(WRM_PIPELINE_PATH)

# S3 connection settings and bucket layout come from the pipeline's config module.
from wrm_pipeline.config import (
    S3_ENDPOINT_URL, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY,
    BUCKET_NAME, WRM_STATIONS_S3_PREFIX
)

# Build the Iceberg catalog handle. It is a SQL catalog whose metadata lives in
# a SQLite file in the current user's home directory; table data is addressed
# under the S3 warehouse location below. The settings are intended to match the
# ones used by the Dagster resources.
catalog_uri = f"sqlite:///{os.path.expanduser('~')}/iceberg_catalog.db"

# NOTE(review): no separator is inserted before "iceberg/", so this presumably
# relies on WRM_STATIONS_S3_PREFIX already ending with "/" — confirm in config.
warehouse_uri = f"s3://{BUCKET_NAME}/{WRM_STATIONS_S3_PREFIX}iceberg/"

# Dotted property names are not valid Python keyword identifiers, so they are
# collected in a dict and splatted into the call.
s3_properties = {
    "s3.endpoint": S3_ENDPOINT_URL,
    "s3.access-key-id": S3_ACCESS_KEY_ID,
    "s3.secret-access-key": S3_SECRET_ACCESS_KEY,
    "s3.path-style-access": "true",
}

catalog = load_catalog(
    "default",
    type="sql",
    uri=catalog_uri,
    warehouse=warehouse_uri,
    **s3_properties,
)

print("Catalog loaded successfully with S3 configuration!")

def _announce_and_load(notice, identifier):
    """Print ``notice``, then return ``catalog.load_table(identifier)``."""
    print(notice)
    return catalog.load_table(identifier)


# Both tables are looked up by "<namespace>.<table>" in the "default" namespace.
sample_table = _announce_and_load(
    "\nLoading sample bike stations table...",
    "default.sample_bike_stations_iceberg",
)
processed_table = _announce_and_load(
    "Loading processed bike stations table...",
    "default.processed_bike_stations_iceberg",
)

Catalog loaded successfully with S3 configuration!

Loading sample bike stations table...
Loading processed bike stations table...


In [36]:
# Quick visual preview of the sample table. The row limit is passed to the scan
# itself so only a handful of rows are read and rendered, instead of
# materializing the whole table just to look at it. Which rows come back is
# decided by the scan; this is a peek, not a deterministic sample.
sample_table.scan(limit=10).to_pandas()

Unnamed: 0,station_id,station_name,latitude,longitude,available_bikes,available_docks,last_updated
0,1,Station_001,51.192798,16.939933,5,18,2025-06-16 00:42:08
1,2,Station_002,51.037155,16.945371,18,24,2025-06-16 00:42:08
2,3,Station_003,51.013434,17.05558,2,21,2025-06-16 00:42:08
3,4,Station_004,51.073556,16.971973,18,15,2025-06-16 00:42:08
4,5,Station_005,51.070285,17.022264,15,10,2025-06-16 00:42:08
5,6,Station_006,51.009132,16.921601,8,21,2025-06-16 00:42:08
6,7,Station_007,51.156915,17.050512,18,20,2025-06-16 00:42:08
7,8,Station_008,51.127558,17.046979,11,20,2025-06-16 00:42:08
8,9,Station_009,51.061532,17.031355,17,15,2025-06-16 00:42:08
9,10,Station_010,51.145213,16.923004,0,14,2025-06-16 00:42:08


In [38]:
def _print_banner(title):
    """Print a blank line, then ``title`` framed by two 50-character '=' rules."""
    rule = "=" * 50
    print("\n" + rule)
    print(title)
    print(rule)


def _print_overview(frame):
    """Print the first rows, shape, column names and dtypes of ``frame``."""
    print(frame.head())
    print(f"\nShape: {frame.shape}")
    print(f"Columns: {list(frame.columns)}")
    print(f"Data types:\n{frame.dtypes}")


# Materialize both Iceberg tables as pandas DataFrames (full scans).
print("Converting to pandas DataFrames...")
sample_df = sample_table.scan().to_pandas()
processed_df = processed_table.scan().to_pandas()

# Per-table overview: head, shape, columns, dtypes.
_print_banner("SAMPLE BIKE STATIONS DATA:")
_print_overview(sample_df)

_print_banner("PROCESSED BIKE STATIONS DATA:")
_print_overview(processed_df)

# Descriptive statistics from DataFrame.describe() for each frame.
_print_banner("BASIC STATISTICS:")
print("Sample data stats:")
print(sample_df.describe())

print("\nProcessed data stats:")
print(processed_df.describe())

Converting to pandas DataFrames...

SAMPLE BIKE STATIONS DATA:
   station_id station_name   latitude  longitude  available_bikes  \
0           1  Station_001  51.192798  16.939933                5   
1           2  Station_002  51.037155  16.945371               18   
2           3  Station_003  51.013434  17.055580                2   
3           4  Station_004  51.073556  16.971973               18   
4           5  Station_005  51.070285  17.022264               15   

   available_docks        last_updated  
0               18 2025-06-16 00:42:08  
1               24 2025-06-16 00:42:08  
2               21 2025-06-16 00:42:08  
3               15 2025-06-16 00:42:08  
4               10 2025-06-16 00:42:08  

Shape: (50, 7)
Columns: ['station_id', 'station_name', 'latitude', 'longitude', 'available_bikes', 'available_docks', 'last_updated']
Data types:
station_id                  int64
station_name               object
latitude                  float64
longitude                 f