In [1]:
!pip install -q trino

[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: pip install --upgrade pip


In [2]:
import os

# Connection settings for the notebook.
# Every value can be overridden through an environment variable; when the
# variable is unset the previous hard-coded value is used, so existing setups
# keep working unchanged.
CATALOG_URL = os.environ.get("CATALOG_URL", "http://lakekeeper:8181/catalog")
TRINO_URI = os.environ.get("TRINO_URI", "http://trino:8080")
KEYCLOAK_TOKEN_ENDPOINT = os.environ.get(
    "KEYCLOAK_TOKEN_ENDPOINT",
    "http://keycloak:8080/realms/iceberg/protocol/openid-connect/token",
)
WAREHOUSE = os.environ.get("WAREHOUSE", "irisa-ot")  # Changed from "demo" to match DuckDB

# OAuth2 client credentials (the "trino" client, as configured in bootstrap).
CLIENT_ID = os.environ.get("TRINO_CLIENT_ID", "trino")
# SECURITY: the fallback below is a secret committed to the notebook.
# Set TRINO_CLIENT_SECRET in the environment for anything that is not a
# throwaway local stack, and rotate this value if it was ever used elsewhere.
CLIENT_SECRET = os.environ.get("TRINO_CLIENT_SECRET", "AK48QgaKsqdEpP9PomRJw7l2T7qWGHdZ")

# Create Trino Catalog

In [3]:
from trino.dbapi import connect

# Bootstrap connection with no default catalog selected; the next cell uses it
# to issue the CREATE CATALOG statement.
conn = connect(
    host=TRINO_URI,
    user="trino",
)

In [4]:
# Register the Lakekeeper Iceberg REST catalog in Trino under the name
# "lakekeeper", authenticating with the OAuth2 client credentials from the
# configuration cell.
#
# IF NOT EXISTS makes the cell safe to re-run (Restart & Run All): without it
# the statement fails once the catalog has been created. This mirrors the
# CREATE SCHEMA IF NOT EXISTS used later in the notebook.
#
# NOTE: the configuration values are interpolated directly into the SQL text,
# so a value containing a single quote would break the statement.
cur = conn.cursor()
cur.execute(
    f"""
    CREATE CATALOG IF NOT EXISTS lakekeeper USING iceberg
    WITH (
        "iceberg.catalog.type" = 'rest',
        "iceberg.rest-catalog.uri" = '{CATALOG_URL}',
        "iceberg.rest-catalog.warehouse" = '{WAREHOUSE}',
        "iceberg.rest-catalog.security" = 'OAUTH2',
        "iceberg.rest-catalog.oauth2.credential" = '{CLIENT_ID}:{CLIENT_SECRET}',
        "iceberg.rest-catalog.vended-credentials-enabled" = 'true',
        "iceberg.rest-catalog.oauth2.scope" = 'lakekeeper',
        "iceberg.rest-catalog.oauth2.server-uri" = '{KEYCLOAK_TOKEN_ENDPOINT}',
        "s3.region"= 'dummy',
        "s3.path-style-access" = 'true',
        "s3.endpoint" = 'http://minio:9000',
        "fs.native-s3.enabled" = 'true'
    )
"""
)

<trino.dbapi.Cursor at 0x7521b6d09520>

## Read and Write Tables

In [5]:
# Reconnect with "lakekeeper" as the default catalog so that tables can be
# referenced as schema.table without a catalog prefix.
conn = connect(
    host=TRINO_URI,
    user="trino",
    catalog="lakekeeper",
)
cur = conn.cursor()

In [6]:
# Ensure the "irisa" schema exists (named to match the DuckDB namespace).
schema_ddl = "CREATE SCHEMA IF NOT EXISTS irisa"
cur.execute(schema_ddl)

<trino.dbapi.Cursor at 0x7521b6d0a630>

In [None]:
# cur.execute(
#     "CREATE TABLE trino_namespace.my_table (my_ints INT, my_floats DOUBLE, strings VARCHAR) WITH (format='PARQUET')"
# )
# cur.execute(
#     "INSERT INTO trino_namespace.my_table VALUES (1, 1.0, 'a'), (2, 2.0, 'b')"
# )

In [7]:
# Preview five rows of irisa.fake_seclink, the existing table referred to as
# coming from the DuckDB notebook.
preview_sql = "SELECT * FROM irisa.fake_seclink LIMIT 5"
cur.execute(preview_sql).fetchall()

[[1,
  6826,
  1,
  1,
  datetime.datetime(2024, 6, 25, 20, 39, 44),
  datetime.datetime(2024, 6, 25, 21, 21, 44),
  'Message body for record 1 from source 2 to destination 5'],
 [28,
  8797,
  2,
  5,
  datetime.datetime(2024, 6, 14, 16, 46, 43),
  datetime.datetime(2024, 6, 14, 16, 50, 43),
  'Message body for record 28 from source 5 to destination 4'],
 [29,
  5441,
  3,
  4,
  datetime.datetime(2024, 6, 30, 15, 46, 44),
  datetime.datetime(2024, 6, 30, 16, 15, 44),
  'Message body for record 29 from source 3 to destination 5'],
 [10,
  1531,
  5,
  5,
  datetime.datetime(2024, 6, 1, 19, 56, 54),
  datetime.datetime(2024, 6, 1, 20, 35, 54),
  'Message body for record 10 from source 3 to destination 2'],
 [15,
  8321,
  2,
  2,
  datetime.datetime(2024, 6, 20, 4, 38),
  datetime.datetime(2024, 6, 20, 5, 33),
  'Message body for record 15 from source 4 to destination 5']]

In [8]:
# Count all rows in irisa.fake_seclink and report the total.
print("📊 Query 1: Total record count")
count_row = cur.execute("SELECT COUNT(*) FROM irisa.fake_seclink").fetchone()
total_count = count_row[0]  # single-column row: the COUNT(*) value
print(f"Total records: {total_count}")

📊 Query 1: Total record count
Total records: 10000


In [9]:
# Records by month (partitioning test): one COUNT(*) query per month for
# January through June 2024, filtering on the month and year of DateIn.
print("📅 Query 2: Records by month (partitioning test)")
for month in range(1, 7):
    monthly_sql = f"""
        SELECT COUNT(*) FROM irisa.fake_seclink
        WHERE EXTRACT(MONTH FROM DateIn) = {month} AND EXTRACT(YEAR FROM DateIn) = 2024
    """
    monthly_row = cur.execute(monthly_sql).fetchone()
    result = monthly_row[0]
    print(f"Month {month} (2024-{month:02d}): {result} records")

📅 Query 2: Records by month (partitioning test)
Month 1 (2024-01): 1709 records
Month 2 (2024-02): 1572 records
Month 3 (2024-03): 1696 records
Month 4 (2024-04): 1622 records
Month 5 (2024-05): 1754 records
Month 6 (2024-06): 1647 records


In [10]:
# Top sources by record count: group rows by Source, order by the group size
# (largest first) and keep the first five groups.
print("🏢 Query 3: Top sources by record count")
top_sources_sql = """
    SELECT Source, COUNT(*) as count
    FROM irisa.fake_seclink
    GROUP BY Source
    ORDER BY count DESC
    LIMIT 5
"""
top_sources = cur.execute(top_sources_sql).fetchall()
for source, count in top_sources:
    print(f"Source {source}: {count} records")

🏢 Query 3: Top sources by record count
Source 5: 2035 records
Source 4: 2015 records
Source 3: 2014 records
Source 2: 1973 records
Source 1: 1963 records


In [13]:
# Processing-time analysis: average, minimum and maximum number of minutes
# between DateIn and DateOut, over rows where both timestamps are present.
print("⏱️ Query 4: Average processing time analysis")
processing_time_sql = """
    SELECT 
        AVG(date_diff('minute', DateIn, DateOut)) as avg_min,
        MIN(date_diff('minute', DateIn, DateOut)) as min_min,
        MAX(date_diff('minute', DateIn, DateOut)) as max_min
    FROM irisa.fake_seclink
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
"""
avg_stats = cur.execute(processing_time_sql).fetchone()
# The row holds the three aggregates in SELECT order.
avg_minutes, min_minutes, max_minutes = avg_stats
print(f"Average processing time: {avg_minutes:.2f} minutes")
print(f"Min processing time: {min_minutes:.2f} minutes")
print(f"Max processing time: {max_minutes:.2f} minutes")

⏱️ Query 4: Average processing time analysis
Average processing time: 30.62 minutes
Min processing time: 1.00 minutes
Max processing time: 60.00 minutes


In [14]:
# Busiest hour of the day: group rows by the hour component of DateIn and keep
# the hour with the most records.
print("🕐 Query 5: Busiest hour of the day")
hour_stats = cur.execute("""
    SELECT EXTRACT(HOUR FROM DateIn) as hour_of_day, COUNT(*) as count
    FROM irisa.fake_seclink
    GROUP BY EXTRACT(HOUR FROM DateIn)
    ORDER BY count DESC, hour_of_day ASC
    LIMIT 1
""").fetchone()
# The secondary sort key makes the answer deterministic when several hours
# share the highest count (the earliest such hour is reported); without it the
# engine may return any of the tied hours, so re-runs could disagree.
if hour_stats is None:
    # No groups at all means the table is empty; fetchone() then returns None
    # and indexing it would raise a TypeError.
    print("Busiest hour: n/a (no records)")
else:
    print(f"Busiest hour: {int(hour_stats[0])}:00 with {hour_stats[1]} records")

🕐 Query 5: Busiest hour of the day
Busiest hour: 6:00 with 465 records
