In [1]:
!pip install -q pyjwt

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests, jwt
from IPython.display import JSON
# from pyiceberg.catalog.rest import RestCatalog
import pandas as pd
# from pyiceberg.schema import Schema
# from pyiceberg.partitioning import PartitionSpec, PartitionField
# from pyiceberg.types import NestedField, StringType, IntegerType, TimestampType
# from pyiceberg.transforms import MonthTransform
import random
from datetime import datetime, timedelta
import pyarrow as pa
import duckdb

# Endpoints: catalog API, management API and the Keycloak token service
CATALOG_URL = "http://lakekeeper:8181/catalog"
MANAGEMENT_URL = "http://lakekeeper:8181/management"
KEYCLOAK_TOKEN_URL = "http://keycloak:8080/realms/iceberg/protocol/openid-connect/token"

# Coordinates of the table used throughout the notebook
WAREHOUSE = "irisa-ot"
NAMESPACE = "irisa"
TABLE_NAME = "fake_seclink"

# Echo the effective settings so the cell output records what the run used.
print("\n".join([
    "🔧 Configuration:",
    f"   - Catalog URL: {CATALOG_URL}",
    f"   - Management URL: {MANAGEMENT_URL}",
    f"   - Warehouse: {WAREHOUSE}",
    f"   - Namespace: {NAMESPACE}",
    f"   - Table: {TABLE_NAME}",
]))

🔧 Configuration:
   - Catalog URL: http://lakekeeper:8181/catalog
   - Management URL: http://lakekeeper:8181/management
   - Warehouse: irisa-ot
   - Namespace: irisa
   - Table: fake_seclink


# Sign in

In [3]:
# Log in to Keycloak using the OAuth2 client-credentials grant.
import getpass
import os

CLIENT_ID = "spark"
# The client secret must not be stored in the notebook. It is taken from the
# KEYCLOAK_CLIENT_SECRET environment variable; if that is unset or empty the
# user is prompted for it (input is not echoed).
# NOTE(review): a secret was previously committed in this cell - treat it as
# leaked and rotate it in Keycloak.
CLIENT_SECRET = os.environ.get("KEYCLOAK_CLIENT_SECRET") or getpass.getpass(
    "Keycloak client secret: "
)

print("🔐 Authenticating with Keycloak...")

response = requests.post(
    url=KEYCLOAK_TOKEN_URL,
    data={
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": "lakekeeper"
    },
    headers={"Content-type": "application/x-www-form-urlencoded"},
    # Without a timeout an unreachable Keycloak would block the kernel indefinitely.
    timeout=30,
)
# Fail loudly on any non-2xx answer (bad credentials, wrong realm, ...).
response.raise_for_status()
access_token = response.json()['access_token']

# Decode the claims for display only. The signature is deliberately NOT checked
# here (verify_signature=False), so nothing in token_data may be trusted for
# authorization decisions.
token_data = jwt.decode(access_token, options={"verify_signature": False})
print("✅ Authentication successful")
print(f"   - Client: {token_data.get('preferred_username', 'Unknown')}")
# `exp` is a Unix timestamp; fromtimestamp() renders it in the kernel's local timezone.
print(f"   - Expires: {datetime.fromtimestamp(token_data.get('exp', 0))}")

🔐 Authenticating with Keycloak...
✅ Authentication successful
   - Client: service-account-spark
   - Expires: 2025-08-04 15:29:40


In [8]:
# Connect to DuckDB, load the Iceberg/httpfs extensions and attach the Iceberg catalog.
import time
import random
import os

SECRET_NAME = "lakekeeper_secret"   # DuckDB secret holding the OAuth2 client credentials
CATALOG_ALIAS = "irisa_datalake"    # name under which the catalog is attached in DuckDB
REQUIRED_EXTENSIONS = ("iceberg", "httpfs")


def _sql_literal(value):
    """Return `value` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


print("🔗 Setting up DuckDB connection with authentication...")

# Step 1: open (or create) the local database file
print("📡 Step 1: Initializing DuckDB connection...")
con = duckdb.connect("local.duckdb")
print("✅ DuckDB connection established")

# Step 2: install every extension first, then load them (same order as before)
print("\n📂 Step 2: Loading DuckDB extensions...")
for ext_name in REQUIRED_EXTENSIONS:
    con.sql(f"INSTALL {ext_name}")
for ext_name in REQUIRED_EXTENSIONS:
    con.sql(f"LOAD {ext_name}")

# Step 3: confirm the extensions are actually *loaded*, not merely known to DuckDB
print("\n🔍 Step 3: Verifying extensions...")
loaded_rows = con.sql(
    "SELECT extension_name FROM duckdb_extensions() WHERE loaded"
).fetchall()
installed_extensions = [name for (name,) in loaded_rows if name in REQUIRED_EXTENSIONS]
for ext_name in installed_extensions:
    print(f"   🔍 {ext_name} extension verified")
for ext_name in REQUIRED_EXTENSIONS:
    if ext_name not in installed_extensions:
        print(f"   ❌ {ext_name} extension is not loaded")

# Step 4: attach the Iceberg catalog with authentication
print(f"\n🔗 Step 4: Attaching Iceberg catalog with warehouse: {WAREHOUSE}")

if "iceberg" in installed_extensions:
    try:
        print("   🔗 Attempting to attach Iceberg catalog...")
        print(f"   📡 Catalog URL: {CATALOG_URL}")
        print(f"   >> Warehouse: {WAREHOUSE}")
        # NOTE: the token is only echoed here. DuckDB authenticates on its own
        # through the secret created below (client id/secret + token endpoint).
        print(f"   🔐 Using authenticated token: {access_token[:20]}...")

        # Create the secret only when it is missing. Checking explicitly (instead
        # of swallowing every exception) lets real failures reach the handler below.
        secret_exists = con.sql(
            f"SELECT count(*) FROM duckdb_secrets() WHERE name = {_sql_literal(SECRET_NAME)}"
        ).fetchone()[0] > 0
        if secret_exists:
            print(f" ⚠️ Secret `{SECRET_NAME}` already Exists!")
        else:
            con.sql(f"""
                CREATE SECRET {SECRET_NAME} (
                    TYPE ICEBERG,
                    CLIENT_ID {_sql_literal(CLIENT_ID)},
                    CLIENT_SECRET {_sql_literal(CLIENT_SECRET)},
                    OAUTH2_SCOPE 'lakekeeper',
                    OAUTH2_SERVER_URI {_sql_literal(KEYCLOAK_TOKEN_URL)}
                )
            """)

        # Attach the warehouse only when the alias is not attached yet.
        catalog_attached = con.sql(
            "SELECT count(*) FROM duckdb_databases() "
            f"WHERE database_name = {_sql_literal(CATALOG_ALIAS)}"
        ).fetchone()[0] > 0
        if catalog_attached:
            print(f" ⚠️ Warehouse `{CATALOG_ALIAS}` already Exists!")
        else:
            con.sql(f"""
                ATTACH {_sql_literal(WAREHOUSE)} AS {CATALOG_ALIAS} (
                    TYPE ICEBERG,
                    ENDPOINT {_sql_literal(CATALOG_URL)},
                    SECRET {SECRET_NAME}
                )
            """)

        print("   ✅ Iceberg catalog attached successfully")

        # Smoke test: read a single row from the configured table. A failure here
        # is reported but not fatal, because the table may not have been created yet.
        print("   🔍 Verifying catalog attachment...")
        try:
            tables = con.sql(
                f"SELECT * FROM {CATALOG_ALIAS}.{NAMESPACE}.{TABLE_NAME} LIMIT 1"
            ).fetchall()
            print(f"   ✅ Catalog verification successful - found {len(tables)} test records")
        except Exception as verify_error:
            print(f"   ⚠️ Catalog verification failed: {verify_error}")
            print("   ℹ️ This might be normal if the table doesn't exist yet")

    except Exception as e:
        print(f"   ❌ Failed to attach Iceberg catalog: {e}")
        print("   🔧 This might be due to network issues or authentication problems")
        # Chain the original error so the traceback still shows the root cause.
        raise RuntimeError(f"Failed to attach Iceberg catalog: {e}") from e
else:
    print("   ⚠️ Skipping Iceberg catalog attachment - iceberg extension not loaded")

print("\n✅ DuckDB setup completed!")
print(">> Summary:")
print(f"   - Extensions installed: {installed_extensions}")
print("   - Authentication: Using Keycloak token")
print(f"   - Warehouse: {WAREHOUSE}")

# Additional debugging information
print("\n🔍 Debug Information:")
print(f"   - DuckDB version: {con.sql('SELECT version()').fetchone()[0]}")
print(f"   - Working directory: {os.getcwd()}")
# NOTE(review): this line is a fixed message; nothing in this cell probes connectivity.
print("   - Environment: Containerized (no internet access detected)")

🔗 Setting up DuckDB connection with authentication...
📡 Step 1: Initializing DuckDB connection...
✅ DuckDB connection established

📂 Step 2: Loading DuckDB extensions...

🔍 Step 3: Verifying extensions...
   🔍 httpfs extension verified
   🔍 iceberg extension verified

🔗 Step 4: Attaching Iceberg catalog with warehouse: irisa-ot
   🔗 Attempting to attach Iceberg catalog...
   📡 Catalog URL: http://lakekeeper:8181/catalog
   >> Warehouse: irisa-ot
   🔐 Using authenticated token: eyJhbGciOiJSUzI1NiIs...
   ✅ Iceberg catalog attached successfully
   🔍 Verifying catalog attachment...
   ✅ Catalog verification successful - found 1 test records

✅ DuckDB setup completed!
>> Summary:
   - Extensions installed: ['httpfs', 'iceberg']
   - Authentication: Using Keycloak token
   - Warehouse: irisa-ot

🔍 Debug Information:
   - DuckDB version: v1.3.2
   - Working directory: /opt/jupyter/notebooks
   - Environment: Containerized (no internet access detected)


In [9]:
# Run a set of read-only sanity queries against the attached Iceberg table.
print(">> Running queries to test the data and partitioning...")
print("   - Authentication: Using Keycloak token")
print("   - Warehouse: " + WAREHOUSE)

# Iceberg table reference: catalog alias used at ATTACH time + configured namespace/table
table_ref = f"irisa_datalake.{NAMESPACE}.{TABLE_NAME}"

# Query 1: Total record count
print("\n📊 Query 1: Total record count")
total_count = con.sql(f"SELECT COUNT(*) FROM {table_ref}").fetchone()[0]
print(f"Total records: {total_count}")

# Query 2: Records by month (demonstrating partitioning benefits)
# The filter is a half-open range on the raw DateIn column. Wrapping the column
# in EXTRACT() hides it from the scan's filter pushdown, which works against the
# pruning this query is meant to show; the counts are the same either way.
print("\n📅 Query 2: Records by month (partitioning test)")
for month in range(1, 7):
    month_start = datetime(2024, month, 1).isoformat(sep=" ")
    # month never exceeds 6 here, so month + 1 is always a valid month number
    next_month_start = datetime(2024, month + 1, 1).isoformat(sep=" ")
    result = con.sql(f"""
        SELECT COUNT(*) FROM {table_ref}
        WHERE DateIn >= TIMESTAMP '{month_start}'
          AND DateIn <  TIMESTAMP '{next_month_start}'
    """).fetchone()[0]
    print(f"Month {month} (2024-{month:02d}): {result} records")

# Query 3: Top sources by record count (Source breaks ties deterministically)
print("\n🏢 Query 3: Top sources by record count")
top_sources = con.sql(f"""
    SELECT Source, COUNT(*) as count
    FROM {table_ref}
    GROUP BY Source
    ORDER BY count DESC, Source
    LIMIT 5
""").fetchall()
for source, count in top_sources:
    print(f"Source {source}: {count} records")

# Query 4: Average processing time analysis (DateOut - DateIn, in minutes)
print("\n⏱️ Query 4: Average processing time analysis")
avg_stats = con.sql(f"""
    SELECT 
        AVG(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as avg_min,
        MIN(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as min_min,
        MAX(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as max_min
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
""").fetchone()
# The aggregates come back as NULL/None when no row qualifies; formatting None
# with ':.2f' would raise, so report that case explicitly.
if avg_stats is None or avg_stats[0] is None:
    print("No records with both DateIn and DateOut - processing time unavailable")
else:
    print(f"Average processing time: {avg_stats[0]:.2f} minutes")
    print(f"Min processing time: {avg_stats[1]:.2f} minutes")
    print(f"Max processing time: {avg_stats[2]:.2f} minutes")

# Query 5: Busiest hour of the day (earliest hour wins a tie)
print("\n🕐 Query 5: Busiest hour of the day")
hour_stats = con.sql(f"""
    SELECT EXTRACT(HOUR FROM DateIn) as hour, COUNT(*) as count
    FROM {table_ref}
    GROUP BY hour
    ORDER BY count DESC, hour
    LIMIT 1
""").fetchone()
# fetchone() returns None for an empty table; the hour itself is None when DateIn is NULL.
if hour_stats is None or hour_stats[0] is None:
    print("No records with a DateIn value - busiest hour unavailable")
else:
    print(f"Busiest hour: {int(hour_stats[0])}:00 with {hour_stats[1]} records")

# Query 6: Sample of recent records
print("\n>> Query 6: Sample of recent records")
recent_records = con.sql(f"""
    SELECT Id, DateIn, Source, Destination, 
           ROUND(EXTRACT(EPOCH FROM DateOut - DateIn) / 60, 2) AS ProcessingTime
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
    ORDER BY DateIn DESC, Id
    LIMIT 10
""").df()
print(recent_records.to_string(index=False))

# Query 7: Performance by source system
print("\n📈 Query 7: Performance analysis by source system")
performance_stats = con.sql(f"""
    SELECT 
        Source,
        ROUND(AVG(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as avg_min,
        ROUND(MIN(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as min_min,
        ROUND(MAX(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as max_min,
        COUNT(*) as count
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
    GROUP BY Source
    ORDER BY count DESC, Source
""").df()
print(performance_stats.to_string(index=False))

print("\n✅ All queries completed successfully!")
# NOTE(review): nothing in this cell measures scan cost, so the statement below is
# not backed by evidence from this run - consider EXPLAIN ANALYZE to confirm pruning.
print("🎯 The month partitioning is working efficiently for time-based queries!")
print("🔐 All operations performed with proper Keycloak authentication")

>> Running queries to test the data and partitioning...
   - Authentication: Using Keycloak token
   - Warehouse: irisa-ot

📊 Query 1: Total record count
Total records: 10000

📅 Query 2: Records by month (partitioning test)
Month 1 (2024-01): 1718 records
Month 2 (2024-02): 1606 records
Month 3 (2024-03): 1753 records
Month 4 (2024-04): 1585 records
Month 5 (2024-05): 1652 records
Month 6 (2024-06): 1686 records

🏢 Query 3: Top sources by record count
Source 2: 2024 records
Source 5: 2008 records
Source 4: 1998 records
Source 1: 1988 records
Source 3: 1982 records

⏱️ Query 4: Average processing time analysis
Average processing time: 30.50 minutes
Min processing time: 1.00 minutes
Max processing time: 60.00 minutes

🕐 Query 5: Busiest hour of the day
Busiest hour: 11:00 with 447 records

>> Query 6: Sample of recent records
  Id              DateIn  Source  Destination  ProcessingTime
7897 2024-06-30 23:17:35       3            4            35.0
4609 2024-06-30 22:56:02       4        

In [10]:
# Close the DuckDB connection opened on "local.duckdb" earlier in the notebook;
# `con` cannot be used for further queries after this point.
con.close()