In [None]:
import os
import posixpath
import shutil

import duckdb
import gcsfs
import pyarrow as pa
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError

In [None]:
# Months to download: April 2023 (start of Q2) through April 2024 (latest available data).
months = [f"2023-{m:02d}" for m in range(4, 13)] + [f"2024-{m:02d}" for m in range(1, 5)]

# Read each monthly parquet file over HTTPS with DuckDB and keep it as an Arrow table.
trips_ls = []
for month in months:
    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'
    table = duckdb.sql(f"SELECT * FROM '{table_path}'").arrow()
    trips_ls.append(table)

# Stack the monthly tables into one Arrow table.
trips = pa.concat_tables(trips_ls)
print("Rows in trips: ", trips.num_rows)

# Taxi-zone lookup CSV; the queries further down join on its LocationID and Borough columns.
zones = duckdb.sql("SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'").arrow()
print("Rows in zones: ", zones.num_rows)

In [None]:
# create iceberg catalog using postgres database in gcp
catalog_name = "demo_iceberg"

# The Postgres connection string embeds a password, so it is read from the
# environment instead of being stored in the notebook, e.g.
#   export ICEBERG_CATALOG_URI="postgresql://USER:URL_ENCODED_PASSWORD@HOST:5432/postgres"
catalog_uri = os.environ.get("ICEBERG_CATALOG_URI")
if not catalog_uri:
    raise RuntimeError(
        "Set the ICEBERG_CATALOG_URI environment variable to the Postgres "
        "connection string of the Iceberg catalog before running this cell."
    )

# Location passed to the catalog as its warehouse; override with
# ICEBERG_WAREHOUSE_PATH to target a different bucket or prefix.
warehouse_path = os.environ.get(
    "ICEBERG_WAREHOUSE_PATH", "gs://def-blog-bucket/duck-iceberg-blog"
)

# create iceberg catalog using gcp hosted postgres database
catalog = SqlCatalog(
    catalog_name,
    **{
        "uri": catalog_uri,
        "warehouse": warehouse_path,
    },
)

# create a namespace for Iceberg
name_space = 'taxi'
try:
    catalog.create_namespace(name_space)
except NamespaceAlreadyExistsError as e:
    # Expected when the cell is re-run. Any other failure (bad credentials,
    # unreachable database, ...) is left to propagate instead of being hidden.
    print(e)

In [None]:
def add_version_hint(iceberg_table):
    """Copy a table's metadata file to ``v1.metadata.json`` and write ``version-hint.text``.

    Both files are created in the directory that holds
    ``iceberg_table.metadata_location``; ``version-hint.text`` contains the
    single character ``1``.

    NOTE(review): presumably these fixed names let path-based readers (such as
    the ``iceberg_scan`` views created later in this notebook) locate the
    metadata without going through the SQL catalog -- confirm against the
    DuckDB iceberg extension documentation.

    Args:
        iceberg_table: Object exposing a ``metadata_location`` string that
            starts with ``file://`` or ``gs://``.

    Returns:
        None. For any other scheme a message is printed and nothing is written.
    """
    metadata_location = iceberg_table.metadata_location
    protocol = metadata_location.split(":")[0]

    if protocol == "file":
        metadata_location = metadata_location[len("file://"):]
        # Local files follow the host operating system's path rules.
        path_module = os.path
    elif protocol == "gs":
        metadata_location = metadata_location[len("gs://"):]
        # Object keys always use "/" regardless of the host OS, so os.path
        # (which joins with "\\" on Windows) must not be used here.
        path_module = posixpath
    else:
        print(f"Unsupported metadata location: {metadata_location}")
        return

    metadata_dir = path_module.dirname(metadata_location)
    new_metadata_file = path_module.join(metadata_dir, "v1.metadata.json")
    version_hint_file = path_module.join(metadata_dir, "version-hint.text")

    if protocol == "file":
        shutil.copy(metadata_location, new_metadata_file)
        with open(version_hint_file, "w") as f:
            f.write("1")
    elif protocol == "gs":
        fs = gcsfs.GCSFileSystem()
        fs.copy(metadata_location, new_metadata_file)
        with fs.open(version_hint_file, "w") as f:
            f.write("1")

    print(f"Copied metadata file to {new_metadata_file}")
    print(f"Created {version_hint_file} with content '1'")

In [None]:
# Register each Arrow table in the Iceberg catalog and load its rows.
for table_name, table in (("trips", trips), ("zones", zones)):
    # create the Iceberg table using the Arrow table's schema
    iceberg_table = catalog.create_table(f"{name_space}.{table_name}", schema=table.schema)

    # write the Arrow data into the new table
    iceberg_table.append(table)

    # publish v1.metadata.json / version-hint.text next to the table's metadata file
    add_version_hint(iceberg_table)

    print(f"Created {table_name}, {table.num_rows} rows")

In [None]:
# Statements run once on the new connection: install and load the iceberg
# extension, then register a GCS secret. Replace the two placeholder values
# with your own HMAC key id and secret before running.
setup_sql = """
INSTALL iceberg;
LOAD iceberg;

CREATE SECRET (
    TYPE GCS,
    KEY_ID 'YOUR_HMAC_KEY',
    SECRET 'YOUR_HMAC_SECRET'
);
"""

# In-memory DuckDB connection that serves as the query engine for the Iceberg tables.
con = duckdb.connect(read_only=False, database=":memory:")
res = con.execute(setup_sql)

In [None]:
# Settings for the query-side cells below. Values come from the environment
# when set, so running the whole notebook top to bottom does not replace a
# real warehouse location with a placeholder; otherwise edit the fallbacks.
catalog_name = "demo_iceberg"
catalog_uri = os.environ.get("ICEBERG_CATALOG_URI", "YOUR_POSTGRES_URI")
warehouse_path = os.environ.get("ICEBERG_WAREHOUSE_PATH", "gs://YOUR_BUCKET")
name_space = 'taxi'

In [None]:
# create the schema and views of iceberg tables in duckdb
# Tables are read from <warehouse_path>/<name_space>.db/<table>.
database_path = f'{warehouse_path}/{name_space}.db'

# CREATE OR REPLACE keeps this cell re-runnable on the same connection; a plain
# CREATE VIEW raises once the views already exist.
create_view_sql = f'''
CREATE SCHEMA IF NOT EXISTS taxi;

CREATE OR REPLACE VIEW taxi.trips AS
SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);

CREATE OR REPLACE VIEW taxi.zones AS
SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);
'''

con.execute(create_view_sql)

In [None]:
sql = f'''
select 
    count(*)
from taxi.trips
'''

%time res = con.execute(sql)
res.fetchdf()

In [None]:
sql = f'''
select 
    date_trunc('month', tpep_pickup_datetime) as month,
    avg(passenger_count) as avg_passenger_count,
    avg(trip_distance) as avg_trip_distance,
    sum(trip_distance) as total_trip_distance,
    avg(total_amount) as avg_total_amount,
    sum(total_amount) as total_amount,
    count(*) as total_trips
from taxi.trips
-- some data pre and post our target date range is in the dataset, so we filter it out
where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'
group by 1
order by 1
'''

%time res = con.execute(sql)
res.fetchdf()

In [None]:
sql = f'''
select 
    zones.Borough,
    count(*) as total_trips,
    sum(total_amount) as total_amount
from taxi.zones as zones
left join taxi.trips as trips
    on zones.LocationID = trips.DOLocationID
group by 1 
order by 2 desc
'''

%time res = con.execute(sql)
res.fetchdf()

In [None]:
sql = f'''
select 
    starting_zone.Borough as pickup_borough,
    ending_zone.Borough as dropoff_borough,
    count(*) as trip_count
from
taxi.trips as trips
left join taxi.zones as starting_zone
    on trips.PULocationID = starting_zone.LocationID
left join taxi.zones as ending_zone
    on trips.DOLocationID = ending_zone.LocationID
group by 1, 2
order by 1 asc, 3 desc
'''

%time res = con.execute(sql)
res.fetchdf().head(20)