In [1]:
!pip install -q pyjwt

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests, jwt
from IPython.display import JSON
# from pyiceberg.catalog.rest import RestCatalog
import pandas as pd
# from pyiceberg.schema import Schema
# from pyiceberg.partitioning import PartitionSpec, PartitionField
# from pyiceberg.types import NestedField, StringType, IntegerType, TimestampType
# from pyiceberg.transforms import MonthTransform
import random
from datetime import datetime, timedelta
import pyarrow as pa
import duckdb

# Authentication and API endpoints
CATALOG_URL = "http://lakekeeper:8181/catalog"
MANAGEMENT_URL = "http://lakekeeper:8181/management"
KEYCLOAK_TOKEN_URL = "http://keycloak:8080/realms/iceberg/protocol/openid-connect/token"

# Table configuration
WAREHOUSE = "irisa-ot"
NAMESPACE = "irisa"
TABLE_NAME = "fake_seclink"

print(f"üîß Configuration:")
print(f"   - Catalog URL: {CATALOG_URL}")
print(f"   - Management URL: {MANAGEMENT_URL}")
print(f"   - Warehouse: {WAREHOUSE}")
print(f"   - Namespace: {NAMESPACE}")
print(f"   - Table: {TABLE_NAME}")

üîß Configuration:
   - Catalog URL: http://lakekeeper:8181/catalog
   - Management URL: http://lakekeeper:8181/management
   - Warehouse: irisa-ot
   - Namespace: irisa
   - Table: fake_seclink


# Sign in

In [19]:
# Login to Keycloak for authentication
CLIENT_ID = "spark"
CLIENT_SECRET = "2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52"

# CLIENT_ID = "jane-client"
# CLIENT_SECRET = "0Er7OKDvE2C0q9OnfhDp87BwEexKpthZ"  # Replace with actual secret

print("üîê Authenticating with Keycloak...")

response = requests.post(
    url=KEYCLOAK_TOKEN_URL,
    data={
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET    
    },
    headers={"Content-type": "application/x-www-form-urlencoded"},
)
response.raise_for_status()
access_token = response.json()['access_token']

# Verify the token
token_data = jwt.decode(access_token, options={"verify_signature": False})
print(f"‚úÖ Authentication successful")
print(f"   - Client: {token_data.get('preferred_username', 'Unknown')}")
print(f"   - Expires: {datetime.fromtimestamp(token_data.get('exp', 0))}")

üîê Authenticating with Keycloak...
‚úÖ Authentication successful
   - Client: service-account-spark
   - Expires: 2025-08-05 00:57:11


In [20]:
# Connect to DuckDB with authentication context and robust extension handling
import time
import random
import os

print("üîó Setting up DuckDB connection with authentication...")

# Initialize DuckDB connection
print("üì° Step 1: Initializing DuckDB connection...")
con = duckdb.connect("local2.duckdb")
# con = duckdb.connect()
print("‚úÖ DuckDB connection established")

con.sql("INSTALL ICEBERG;");
con.sql("INSTALL httpfs;");
# con.sql("UPDATE EXTENSIONS;");
con.sql("LOAD ICEBERG;");
con.sql("LOAD httpfs;");

# Load extensions separately with detailed logging
print("\nüìÇ Step 2: Loading DuckDB extensions...")

installed_extensions = []

# Verify extensions are working
print("\nüîç Step 3: Verifying extensions...")
result = con.sql("SELECT * FROM duckdb_extensions()").fetchall()
for ext in result:
    try:
        if ext[0] in ['iceberg', 'httpfs']:
            print(f"   üîç {ext[0]} extension verified")
            installed_extensions.append(ext[0])
    except Exception as e:
        print(f"   ‚ùå Failed to verify {ext}: {e}")

# Step 5: Attach the Iceberg catalog with authentication
print(f"\nüîó Step 4: Attaching Iceberg catalog with warehouse: {WAREHOUSE}")

if "iceberg" in installed_extensions:
    try:
        print("   üîó Attempting to attach Iceberg catalog...")
        print(f"   üì° Catalog URL: {CATALOG_URL}")
        print(f"   >> Warehouse: {WAREHOUSE}")
        print(f"   üîê Using authenticated token: {access_token[:20]}...")
        
        try : 
            # First create the secret
            con.sql(f"""
                CREATE SECRET lakekeeper_secret (
                    TYPE ICEBERG,
                    CLIENT_ID '{CLIENT_ID}',
                    CLIENT_SECRET '{CLIENT_SECRET}',
                    OAUTH2_SCOPE 'lakekeeper',
                    OAUTH2_SERVER_URI '{KEYCLOAK_TOKEN_URL}'
                )
            """)
        except:
            print(f" ‚ö†Ô∏è Secret `lakekeeper_secret` already Exists!")
        try:
            # Then attach using the secret
            con.sql(f"""
                ATTACH '{WAREHOUSE}' AS irisa_datalake (
                    TYPE ICEBERG,
                    ENDPOINT '{CATALOG_URL}',
                    SECRET lakekeeper_secret
                )
            """)
        except:
            print(f" ‚ö†Ô∏è Warehouse `irisa_datalake` already Exists!")
        
        print("   ‚úÖ Iceberg catalog attached successfully")
        
        # Verify the attachment by checking if we can query the catalog
        print("   üîç Verifying catalog attachment...")
        try:
            # Try to list tables in the catalog to verify it's working
            tables = con.sql(f"SELECT * FROM irisa_datalake.irisa.fake_seclink LIMIT 1").fetchall()
            print(f"   ‚úÖ Catalog verification successful - found {len(tables)} test records")
        except Exception as verify_error:
            print(f"   ‚ö†Ô∏è Catalog verification failed: {verify_error}")
            print("   ‚ÑπÔ∏è This might be normal if the table doesn't exist yet")
                
        
            
    except Exception as e:
        print(f"   ‚ùå Failed to attach Iceberg catalog: {e}")
        print("   üîß This might be due to network issues or authentication problems")
        raise Exception(f"Failed to attach Iceberg catalog: {e}")
else:
    print("   ‚ö†Ô∏è Skipping Iceberg catalog attachment - iceberg extension not loaded")

print("\n‚úÖ DuckDB setup completed!")
print(f">> Summary:")
print(f"   - Extensions installed: {installed_extensions}")
print(f"   - Authentication: Using Keycloak token")
print(f"   - Warehouse: {WAREHOUSE}")

# Additional debugging information
print(f"\nüîç Debug Information:")
print(f"   - DuckDB version: {con.sql('SELECT version()').fetchone()[0]}")
print(f"   - Working directory: {os.getcwd()}")
print(f"   - Environment: Containerized (no internet access detected)")

üîó Setting up DuckDB connection with authentication...
üì° Step 1: Initializing DuckDB connection...
‚úÖ DuckDB connection established

üìÇ Step 2: Loading DuckDB extensions...

üîç Step 3: Verifying extensions...
   üîç httpfs extension verified
   üîç iceberg extension verified

üîó Step 4: Attaching Iceberg catalog with warehouse: irisa-ot
   üîó Attempting to attach Iceberg catalog...
   üì° Catalog URL: http://lakekeeper:8181/catalog
   >> Warehouse: irisa-ot
   üîê Using authenticated token: eyJhbGciOiJSUzI1NiIs...
 ‚ö†Ô∏è Secret `lakekeeper_secret` already Exists!
 ‚ö†Ô∏è Warehouse `irisa_datalake` already Exists!
   ‚úÖ Iceberg catalog attached successfully
   üîç Verifying catalog attachment...
   ‚úÖ Catalog verification successful - found 1 test records

‚úÖ DuckDB setup completed!
>> Summary:
   - Extensions installed: ['httpfs', 'iceberg']
   - Authentication: Using Keycloak token
   - Warehouse: irisa-ot

üîç Debug Information:
   - DuckDB version: v1.3.2
   

In [21]:
print(">> Running queries to test the data and partitioning...")
print("   - Authentication: Using Keycloak token")
print("   - Warehouse: " + WAREHOUSE)

# Iceberg table reference
table_ref = "irisa_datalake.irisa.fake_seclink"


con.sql(f"SELECT Id, Destination FROM {table_ref}").fetchone()[0]
# Query 1: Total record count
print("\nüìä Query 1: Total record count")
total_count = con.sql(f"SELECT COUNT(*) FROM {table_ref}").fetchone()[0]
print(f"Total records: {total_count}")

# Query 2: Records by month (demonstrating partitioning benefits)
print("\nüìÖ Query 2: Records by month (partitioning test)")
for month in range(1, 7):
    result = con.sql(f"""
        SELECT COUNT(*) FROM {table_ref}
        WHERE EXTRACT(MONTH FROM DateIn) = {month} AND EXTRACT(YEAR FROM DateIn) = 2024
    """).fetchone()[0]
    print(f"Month {month} (2024-{month:02d}): {result} records")

# Query 3: Top sources by record count
print("\nüè¢ Query 3: Top sources by record count")
top_sources = con.sql(f"""
    SELECT Source, COUNT(*) as count
    FROM {table_ref}
    GROUP BY Source
    ORDER BY count DESC
    LIMIT 5
""").fetchall()
for source, count in top_sources:
    print(f"Source {source}: {count} records")

# Query 4: Average processing time analysis
print("\n‚è±Ô∏è Query 4: Average processing time analysis")
avg_stats = con.sql(f"""
    SELECT 
        AVG(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as avg_min,
        MIN(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as min_min,
        MAX(EXTRACT(EPOCH FROM DateOut - DateIn) / 60) as max_min
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
""").fetchone()
print(f"Average processing time: {avg_stats[0]:.2f} minutes")
print(f"Min processing time: {avg_stats[1]:.2f} minutes")
print(f"Max processing time: {avg_stats[2]:.2f} minutes")

# Query 5: Busiest hour of the day
print("\nüïê Query 5: Busiest hour of the day")
hour_stats = con.sql(f"""
    SELECT EXTRACT(HOUR FROM DateIn) as hour, COUNT(*) as count
    FROM {table_ref}
    GROUP BY hour
    ORDER BY count DESC
    LIMIT 1
""").fetchone()
print(f"Busiest hour: {int(hour_stats[0])}:00 with {hour_stats[1]} records")

# Query 6: Sample of recent records
print("\n>> Query 6: Sample of recent records")
recent_records = con.sql(f"""
    SELECT Id, DateIn, Source, Destination, 
           ROUND(EXTRACT(EPOCH FROM DateOut - DateIn) / 60, 2) AS ProcessingTime
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
    ORDER BY DateIn DESC
    LIMIT 10
""").df()
print(recent_records.to_string(index=False))

# Query 7: Performance by source system
print("\nüìà Query 7: Performance analysis by source system")
performance_stats = con.sql(f"""
    SELECT 
        Source,
        ROUND(AVG(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as avg_min,
        ROUND(MIN(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as min_min,
        ROUND(MAX(EXTRACT(EPOCH FROM DateOut - DateIn) / 60), 2) as max_min,
        COUNT(*) as count
    FROM {table_ref}
    WHERE DateOut IS NOT NULL AND DateIn IS NOT NULL
    GROUP BY Source
    ORDER BY count DESC
""").df()
print(performance_stats.to_string(index=False))

print("\n‚úÖ All queries completed successfully!")
print("üéØ The month partitioning is working efficiently for time-based queries!")
print("üîê All operations performed with proper Keycloak authentication")

>> Running queries to test the data and partitioning...
   - Authentication: Using Keycloak token
   - Warehouse: irisa-ot

üìä Query 1: Total record count
Total records: 10000

üìÖ Query 2: Records by month (partitioning test)
Month 1 (2024-01): 1709 records
Month 2 (2024-02): 1572 records
Month 3 (2024-03): 1696 records
Month 4 (2024-04): 1622 records
Month 5 (2024-05): 1754 records
Month 6 (2024-06): 1647 records

üè¢ Query 3: Top sources by record count
Source 5: 2035 records
Source 4: 2015 records
Source 3: 2014 records
Source 2: 1973 records
Source 1: 1963 records

‚è±Ô∏è Query 4: Average processing time analysis
Average processing time: 30.62 minutes
Min processing time: 1.00 minutes
Max processing time: 60.00 minutes

üïê Query 5: Busiest hour of the day
Busiest hour: 6:00 with 465 records

>> Query 6: Sample of recent records
  Id              DateIn  Source  Destination  ProcessingTime
9385 2024-06-30 23:23:43       1            2            42.0
1362 2024-06-30 23:18:22 

In [6]:
con.close()