In [15]:
# Build the DuckDB DDL that exposes the Iceberg tables as views.
# Both the schema name and the warehouse sub-directory are derived from
# `name_space`, so changing it in one place keeps the two in sync.
warehouse_path = "s3://iceberg-duckdb-demo/iceberg-catalog/"
name_space = 'taxi'

# Path of the namespace inside the warehouse: <warehouse_path><name_space>.db
database_path = f'{warehouse_path}{name_space}.db'

# One schema plus one view per Iceberg table (trips, zones); each view is a
# plain `SELECT *` over iceberg_scan() on the table's directory.
create_view_sql = f'''
CREATE SCHEMA IF NOT EXISTS {name_space};

CREATE VIEW {name_space}.trips AS
SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);

CREATE VIEW {name_space}.zones AS
SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);
'''

In [16]:
# Display the rendered DDL so the interpolated S3 paths can be checked by eye.
print(create_view_sql)


CREATE SCHEMA IF NOT EXISTS taxi;

CREATE VIEW taxi.trips AS
SELECT * FROM iceberg_scan('s3://iceberg-duckdb-demo/iceberg-catalog/taxi.db/trips', allow_moved_paths = true);

CREATE VIEW taxi.zones AS
SELECT * FROM iceberg_scan('s3://iceberg-duckdb-demo/iceberg-catalog/taxi.db/zones', allow_moved_paths = true);



In [44]:
import config
import boto3 
import duckdb
from botocore.exceptions import ClientError
import json


# gets secrets from aws secret manager
def get_secret(secret_name, region_name):
    """Fetch a secret from AWS Secrets Manager and return its ``secret_key`` field.

    The secret's ``SecretString`` is parsed as a JSON object.

    Args:
        secret_name: Value passed to Secrets Manager as ``SecretId``.
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The value stored under ``'secret_key'`` in the decoded JSON, or
        ``None`` when that key is absent.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged when the
            ``get_secret_value`` call fails.
        KeyError: If the response contains no ``SecretString`` entry.
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    session = boto3.session.Session()
    client = session.client(service_name='secretsmanager', region_name=region_name)

    # No try/except around the call: the previous handler only re-raised the
    # caught ClientError, so letting it propagate directly is equivalent.
    response = client.get_secret_value(SecretId=secret_name)

    secret = json.loads(response['SecretString'])
    return secret.get('secret_key')


# def init_duckdb_connection():
#     access_key = config.ACCESS_KEY
#     secret_key = get_secret(config.SECRET_KEY_NAME, config.SECRET_REGION)
#     con = duckdb.connect()
#     setup_sql = f"""
#         INSTALL iceberg;
#         LOAD iceberg;

#         INSTALL httpfs;
#         LOAD httpfs;

#         CREATE SECRET (
#             TYPE S3,
#             KEY_ID '{access_key}',
#             SECRET '{secret_key}',
#             REGION '{config.S3_BUCKET_REGION}'
#         );
#     """
#     con.execute(setup_sql)
#     return con

# Resolve the S3 credentials: the access key id is read from the local
# `config` module, the secret key is fetched through get_secret().
access_key = config.ACCESS_KEY
secret_key = get_secret(config.SECRET_KEY_NAME, config.SECRET_REGION)
# duckdb.connect() with no arguments opens an in-memory database.
con = duckdb.connect()
# Install/load the iceberg and httpfs extensions and register an S3 secret.
# NOTE: after interpolation `setup_sql` holds the plaintext secret key, so
# avoid printing or displaying this variable.
setup_sql = f"""
    INSTALL iceberg;
    LOAD iceberg;

    INSTALL httpfs;
    LOAD httpfs;

    CREATE SECRET (
        TYPE S3,
        KEY_ID '{access_key}',
        SECRET '{secret_key}',
        REGION '{config.S3_BUCKET_REGION}'
    );
"""
# Runs the statements above; as the cell's last expression it also echoes the
# connection object in the cell output.
con.execute(setup_sql)


<duckdb.duckdb.DuckDBPyConnection at 0x1121743b0>

In [61]:
import requests

# Endpoint of the query service; later cells reuse `url`.
# Local alternative: 'http://127.0.0.1:5000/query'
url = 'http://18.216.146.210:5000/query'

# The DDL is sent as the `query` URL parameter of a POST request.
query = dict(query=create_view_sql)
response = requests.post(url, params=query)
response.text

'{\n  "result": []\n}\n'

In [62]:
import requests

# Trip counts per (pickup borough, drop-off borough) pair. taxi.zones is
# joined twice: once on the pickup location id, once on the drop-off one.
# Plain string literal: the SQL has no placeholders to interpolate.
sql = '''
select 
    starting_zone.Borough as pickup_borough,
    ending_zone.Borough as dropoff_borough,
    count(*) as trip_count
from
taxi.trips as trips
left join taxi.zones as starting_zone
    on trips.PULocationID = starting_zone.LocationID
left join taxi.zones as ending_zone
    on trips.DOLocationID = ending_zone.LocationID
group by 1, 2
order by 1 asc, 3 desc
limit 20
'''
query = dict(query=sql)
response = requests.post(url, params=query)
response.json()

{'result': [['Bronx', 'Bronx', 38900],
  ['Bronx', 'Manhattan', 33779],
  ['Bronx', 'Queens', 7179],
  ['Bronx', 'Brooklyn', 6943],
  ['Bronx', 'N/A', 522],
  ['Bronx', 'Unknown', 231],
  ['Bronx', 'Staten Island', 163],
  ['Bronx', 'EWR', 35],
  ['Brooklyn', 'Brooklyn', 181514],
  ['Brooklyn', 'Manhattan', 130742],
  ['Brooklyn', 'Queens', 43224],
  ['Brooklyn', 'Bronx', 6967],
  ['Brooklyn', 'N/A', 1002],
  ['Brooklyn', 'EWR', 922],
  ['Brooklyn', 'Unknown', 896],
  ['Brooklyn', 'Staten Island', 643],
  ['EWR', 'EWR', 4710],
  ['EWR', 'Unknown', 237],
  ['EWR', 'N/A', 205],
  ['EWR', 'Manhattan', 129]]}

In [70]:
import requests

# Same borough-to-borough trip count as the earlier cell, executed again:
# the first 20 (pickup, drop-off) rows, ordered by pickup borough and then by
# descending trip count.
sql = '''
select 
    starting_zone.Borough as pickup_borough,
    ending_zone.Borough as dropoff_borough,
    count(*) as trip_count
from
taxi.trips as trips
left join taxi.zones as starting_zone
    on trips.PULocationID = starting_zone.LocationID
left join taxi.zones as ending_zone
    on trips.DOLocationID = ending_zone.LocationID
group by 1, 2
order by 1 asc, 3 desc
limit 20
'''

query = dict(query=sql)
response = requests.post(url, params=query)
response.json()

{'result': [['Bronx', 'Bronx', 38900],
  ['Bronx', 'Manhattan', 33779],
  ['Bronx', 'Queens', 7179],
  ['Bronx', 'Brooklyn', 6943],
  ['Bronx', 'N/A', 522],
  ['Bronx', 'Unknown', 231],
  ['Bronx', 'Staten Island', 163],
  ['Bronx', 'EWR', 35],
  ['Brooklyn', 'Brooklyn', 181514],
  ['Brooklyn', 'Manhattan', 130742],
  ['Brooklyn', 'Queens', 43224],
  ['Brooklyn', 'Bronx', 6967],
  ['Brooklyn', 'N/A', 1002],
  ['Brooklyn', 'EWR', 922],
  ['Brooklyn', 'Unknown', 896],
  ['Brooklyn', 'Staten Island', 643],
  ['EWR', 'EWR', 4710],
  ['EWR', 'Unknown', 237],
  ['EWR', 'N/A', 205],
  ['EWR', 'Manhattan', 129]]}

In [64]:
import requests

# Total number of rows visible through the taxi.trips view.
sql = '''
select 
    count(*)
from taxi.trips
'''

query = dict(query=sql)
response = requests.post(url, params=query)
response.json()

{'result': [[41994806]]}

In [65]:
import requests

# Same row count, but scanning the Iceberg table directly with iceberg_scan()
# rather than going through the taxi.trips view.
sql = f'''
select 
    count(*)
from iceberg_scan('{database_path}/trips', allow_moved_paths = true);
'''

query = dict(query=sql)
response = requests.post(url, params=query)
response.json()

{'result': [[41994806]]}

In [66]:
import requests

# Monthly aggregates (passengers, distance, fare totals, trip count) over the
# target date range.
#
# The range is half-open: [2023-04-01, 2024-05-01). The previous BETWEEN was
# inclusive on both ends, so trips stamped exactly 2024-05-01 00:00:00 passed
# the filter and produced a spurious May 2024 bucket after date_trunc().
sql = '''
select 
    date_trunc('month', tpep_pickup_datetime) as month,
    avg(passenger_count) as avg_passenger_count,
    avg(trip_distance) as avg_trip_distance,
    sum(trip_distance) as total_trip_distance,
    avg(total_amount) as avg_total_amount,
    sum(total_amount) as total_amount,
    count(*) as total_trips
from taxi.trips
-- some data pre and post our target date range is in the dataset, so we filter it out
where tpep_pickup_datetime >= '2023-04-01'
  and tpep_pickup_datetime < '2024-05-01'
group by 1
order by 1
'''

query = {
    "query": sql
}
response = requests.post(url, params=query)
response.json()

{'result': [['Sat, 01 Apr 2023 00:00:00 GMT',
   1.3828223099929224,
   4.096190325722789,
   13468941.469999624,
   28.26947759882149,
   92954650.26977366,
   3288163],
  ['Mon, 01 May 2023 00:00:00 GMT',
   1.3588013955991263,
   4.345793046802145,
   15269656.579999011,
   28.96293478253187,
   101766021.27973007,
   3513664],
  ['Thu, 01 Jun 2023 00:00:00 GMT',
   1.3690120759300761,
   4.36875410120578,
   14448601.319999726,
   29.068591310734654,
   96137360.22974898,
   3307259],
  ['Sat, 01 Jul 2023 00:00:00 GMT',
   1.4019610865600263,
   4.489436884199477,
   13051210.539998109,
   28.568067928964304,
   83050030.29981662,
   2907093],
  ['Tue, 01 Aug 2023 00:00:00 GMT',
   1.38697920378319,
   4.782777107578069,
   13507523.889999092,
   28.628029520532756,
   80851309.59991813,
   2824201],
  ['Fri, 01 Sep 2023 00:00:00 GMT',
   1.3564035830565937,
   4.274258174522569,
   12167705.989998553,
   29.781914452319462,
   84781396.92991036,
   2846741],
  ['Sun, 01 Oct 2023 0

In [71]:
# Number of rows in the current `response`; the value depends on whichever
# query cell most recently assigned `response`.
len(response.json()['result'])

20

In [69]:
import requests

# Trips and revenue per drop-off borough, starting from taxi.zones so that
# boroughs are taken from the zone table.
#
# total_trips uses count(trips.DOLocationID), not count(*): the LEFT JOIN
# still emits one row (with NULL trip columns) for a zone that has no trips,
# and count(*) would count that row as a trip. Counting a trip-side column
# skips those NULL rows.
sql = '''
select 
    zones.Borough,
    count(trips.DOLocationID) as total_trips,
    sum(total_amount) as total_amount
from taxi.zones as zones
left join taxi.trips as trips
    on zones.LocationID = trips.DOLocationID
group by 1 
order by 2 desc
limit 20
'''

# url = 'https://duck-iceberg-demo-rn5klrouba-ue.a.run.app/query'
query = {
    "query": sql
}
response = requests.post(url, params=query)
response.json()

{'result': [['Manhattan', 37176949, 922731898.1879705],
  ['Queens', 2254654, 121335533.93970689],
  ['Brooklyn', 1623715, 81354331.42998819],
  ['Unknown', 367291, 10740173.809998704],
  ['Bronx', 253341, 13218865.089996865],
  ['N/A', 182458, 21679919.79999616],
  ['EWR', 125164, 15625457.64000111],
  ['Staten Island', 11237, 1127515.209999987]]}

In [68]:
# Number of rows in the current `response`; the value depends on whichever
# query cell most recently assigned `response`.
len(response.json()['result'])

8