In [1]:
!pip install -q starrocks "sqlalchemy<3.0"

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# This CATALOG_URL works for the "docker compose" testing and development environment
# Change 'lakekeeper' if you are not running on "docker compose" (f. ex. 'localhost' if Lakekeeper is running locally).
CATALOG_URL = "http://lakekeeper:8181/catalog"
STARROCKS_URI = "starrocks://root@starrocks:9030"
KEYCLOAK_TOKEN_ENDPOINT = "http://keycloak:8080/realms/iceberg/protocol/openid-connect/token"
WAREHOUSE = "irisa-ot"  # Changed to match other notebooks

CLIENT_ID = "starrocks"
CLIENT_SECRET = "X5IWbfDJBTcU1F3PGZWgxDJwLyuFQmSf"

# Create Starrocks Catalog

In [3]:
from sqlalchemy import create_engine, text

engine = create_engine(STARROCKS_URI)

In [4]:
# Create StarRocks Catalog
with engine.connect() as connection:
    connection.execute(text("DROP CATALOG IF EXISTS lakekeeper"))
    connection.execute(
        text(f"""
        CREATE EXTERNAL CATALOG lakekeeper
        PROPERTIES
        (
            "type" = "iceberg",
            "iceberg.catalog.type" = "rest",
            "iceberg.catalog.uri" = "{CATALOG_URL}",
            "iceberg.catalog.warehouse" = "{WAREHOUSE}",
            "iceberg.catalog.oauth2-server-uri" = "{KEYCLOAK_TOKEN_ENDPOINT}",
            "iceberg.catalog.credential" = "{CLIENT_ID}:{CLIENT_SECRET}",
            "iceberg.catalog.scope" = "lakekeeper offline_access",
            "aws.s3.region" = "local",
            "aws.s3.enable_path_style_access" = "true",
            "aws.s3.endpoint" = "http://minio:9000",
            "aws.s3.access_key" = "minio-root-user",
            "aws.s3.secret_key" = "minio-root-password"
        )
        """)
    )
    connection.execute(text("SET CATALOG lakekeeper"))

## Read and Write Tables

In [5]:
# Create schema to match other notebooks
with engine.connect() as connection:
    connection.execute(text("CREATE SCHEMA IF NOT EXISTS irisa"))

In [6]:
# Query 1: Total record count
print("📊 Query 1: Total record count")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    total_count = connection.execute(
        text("SELECT COUNT(*) FROM irisa.fake_seclink")
    ).fetchone()[0]
    print(f"Total records: {total_count}")

📊 Query 1: Total record count
Total records: 10000


In [7]:
# Query 2: Records by month (partitioning test)
print("📅 Query 2: Records by month (partitioning test)")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    for month in range(1, 7):
        result = connection.execute(
            text(f"""
                SELECT COUNT(*) FROM irisa.fake_seclink
                WHERE EXTRACT(MONTH FROM DateIn) = {month} AND EXTRACT(YEAR FROM DateIn) = 2024
            """)
        ).fetchone()[0]
        print(f"Month {month} (2024-{month:02d}): {result} records")

📅 Query 2: Records by month (partitioning test)
Month 1 (2024-01): 1711 records
Month 2 (2024-02): 1616 records
Month 3 (2024-03): 1685 records
Month 4 (2024-04): 1641 records
Month 5 (2024-05): 1660 records
Month 6 (2024-06): 1687 records


In [11]:
# Query 3: Top sources by record count
print("🏢 Query 3: Top sources by record count")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    top_sources = connection.execute(
        text("""
            SELECT Source, COUNT(*) as count
            FROM irisa.fake_seclink
            GROUP BY Source
            ORDER BY count DESC
            LIMIT 5
        """)
    ).fetchall()
    for source, count in top_sources:
        print(f"Source {source}: {count} records")

🏢 Query 3: Top sources by record count
Source 5: 2035 records
Source 4: 2015 records
Source 3: 2014 records
Source 2: 1973 records
Source 1: 1963 records


In [12]:
# Query 4: Average processing time analysis
print("⏱️ Query 4: Average processing time analysis")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    avg_stats = connection.execute(
        text("""
            SELECT 
                AVG(TIMESTAMPDIFF(MINUTE, DateIn, DateOut)) as avg_min,
                MIN(TIMESTAMPDIFF(MINUTE, DateIn, DateOut)) as min_min,
                MAX(TIMESTAMPDIFF(MINUTE, DateIn, DateOut)) as max_min
            FROM irisa.fake_seclink
            WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
        """)
    ).fetchone()
    print(f"Average processing time: {avg_stats[0]:.2f} minutes")
    print(f"Min processing time: {avg_stats[1]:.2f} minutes")
    print(f"Max processing time: {avg_stats[2]:.2f} minutes")

⏱️ Query 4: Average processing time analysis
Average processing time: 30.62 minutes
Min processing time: 1.00 minutes
Max processing time: 60.00 minutes


In [13]:
# Query 5: Busiest hour of the day
print("🕐 Query 5: Busiest hour of the day")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    hour_stats = connection.execute(
        text("""
            SELECT EXTRACT(HOUR FROM DateIn) as hour_of_day, COUNT(*) as count
            FROM irisa.fake_seclink
            GROUP BY EXTRACT(HOUR FROM DateIn)
            ORDER BY count DESC
            LIMIT 1
        """)
    ).fetchone()
    print(f"Busiest hour: {int(hour_stats[0])}:00 with {hour_stats[1]} records")

🕐 Query 5: Busiest hour of the day
Busiest hour: 6:00 with 465 records


In [14]:
# Sample data query
print("📋 Sample of recent records")
with engine.connect() as connection:
    connection.execute(text("SET CATALOG lakekeeper"))
    recent_records = connection.execute(
        text("SELECT * FROM irisa.fake_seclink LIMIT 5")
    ).fetchall()
    for record in recent_records:
        print(record)

📋 Sample of recent records
(3, 4904, 5, 3, datetime.datetime(2024, 2, 17, 20, 10, 29), datetime.datetime(2024, 2, 17, 20, 40, 29), 'Message body for record 3 from source 4 to destination 4')
(4, 3079, 2, 5, datetime.datetime(2024, 2, 27, 7, 1, 34), datetime.datetime(2024, 2, 27, 7, 30, 34), 'Message body for record 4 from source 3 to destination 3')
(9, 3701, 1, 4, datetime.datetime(2024, 2, 29, 8, 53, 19), datetime.datetime(2024, 2, 29, 8, 56, 19), 'Message body for record 9 from source 4 to destination 3')
(14, 8168, 3, 1, datetime.datetime(2024, 2, 9, 14, 50, 19), datetime.datetime(2024, 2, 9, 15, 22, 19), 'Message body for record 14 from source 1 to destination 5')
(17, 1113, 2, 4, datetime.datetime(2024, 2, 12, 6, 6, 55), datetime.datetime(2024, 2, 12, 6, 16, 55), 'Message body for record 17 from source 5 to destination 4')
