In [1]:
!pip install -q pyjwt

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests, jwt
from IPython.display import JSON
from pyiceberg.catalog.rest import RestCatalog
import pandas as pd
from pyiceberg.schema import Schema
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.types import NestedField, StringType, IntegerType, TimestampType
from pyiceberg.transforms import MonthTransform
import random
from datetime import datetime, timedelta
import pyarrow as pa
import duckdb

# Authentication and API endpoints
CATALOG_URL = "http://lakekeeper:8181/catalog"
MANAGEMENT_URL = "http://lakekeeper:8181/management"
KEYCLOAK_TOKEN_URL = "http://keycloak:8080/realms/iceberg/protocol/openid-connect/token"

# Table configuration
WAREHOUSE = "irisa-ot"
NAMESPACE = "irisa"
TABLE_NAME = "fake_seclink"

print(f"🔧 Configuration:")
print(f"   - Catalog URL: {CATALOG_URL}")
print(f"   - Management URL: {MANAGEMENT_URL}")
print(f"   - Warehouse: {WAREHOUSE}")
print(f"   - Namespace: {NAMESPACE}")
print(f"   - Table: {TABLE_NAME}")

🔧 Configuration:
   - Catalog URL: http://lakekeeper:8181/catalog
   - Management URL: http://lakekeeper:8181/management
   - Warehouse: irisa-ot
   - Namespace: irisa
   - Table: fake_seclink


# Sign in

In [3]:
# Login to Keycloak for authentication
CLIENT_ID = "spark"
CLIENT_SECRET = "2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52"

print("🔐 Authenticating with Keycloak...")

response = requests.post(
    url=KEYCLOAK_TOKEN_URL,
    data={
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": "lakekeeper"
    },
    headers={"Content-type": "application/x-www-form-urlencoded"},
)
response.raise_for_status()
access_token = response.json()['access_token']

# Verify the token
token_data = jwt.decode(access_token, options={"verify_signature": False})
print(f"✅ Authentication successful")
print(f"   - Client: {token_data.get('preferred_username', 'Unknown')}")
print(f"   - Expires: {datetime.fromtimestamp(token_data.get('exp', 0))}")

🔐 Authenticating with Keycloak...
✅ Authentication successful
   - Client: service-account-spark
   - Expires: 2025-08-04 15:40:10


# Creating a Warehouse

In [4]:
# Create or verify warehouse exists
print("🏗️ Setting up warehouse...")

warehouse_config = {
    "warehouse-name": WAREHOUSE,
    "storage-profile": {
        "type": "s3",
        "bucket": "irisa-warehouse",
        "key-prefix": "ot",
        "endpoint": "http://minio:9000",
        "region": "local-01",
        "path-style-access": True,
        "flavor": "minio",
        "sts-enabled": True
    },
    "storage-credential": {
        "type": "s3",
        "credential-type": "access-key",
        "aws-access-key-id": "minio-root-user",
        "aws-secret-access-key": "minio-root-password"
    }
}

try:
    response = requests.post(
        url=f"{MANAGEMENT_URL}/v1/warehouse",
        headers={"Authorization": f"Bearer {access_token}"},
        json=warehouse_config
    )
    response.raise_for_status()
    print(f"✅ Warehouse '{WAREHOUSE}' created successfully")
    JSON(response.json())
except requests.exceptions.HTTPError as e:
    if e.response.status_code == 409 or e.response.status_code==400 :  # Already exists
        print(f"ℹ️ Warehouse '{WAREHOUSE}' already exists")
    else:
        raise

🏗️ Setting up warehouse...
ℹ️ Warehouse 'irisa-ot' already exists


In [5]:
# Initialize the catalog with authenticated access
catalog = RestCatalog(
    name="irisa_catalog",
    warehouse=WAREHOUSE,
    uri=CATALOG_URL,
    token=access_token,  # Use the real access token
)

print("✓ Catalog initialized successfully with authentication")
print(f"Available namespaces: {list(catalog.list_namespaces())}")

✓ Catalog initialized successfully with authentication
Available namespaces: [('irisa',)]


In [6]:
# Create the irisa namespace if it doesn't exist
irisa_namespace = (NAMESPACE,)

if irisa_namespace not in catalog.list_namespaces():
    catalog.create_namespace(irisa_namespace)
    print(f"✓ Created namespace: {NAMESPACE}")
else:
    print(f"ℹ Namespace '{NAMESPACE}' already exists")

print(f"📋 Available namespaces: {list(catalog.list_namespaces())}")

ℹ Namespace 'irisa' already exists
📋 Available namespaces: [('irisa',)]


In [7]:
# Define the schema for fake_seclink with proper timestamp handling
schema = Schema(
    NestedField(field_id=1, name="Id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="TelegramCode", field_type=IntegerType(), required=False),
    NestedField(field_id=3, name="Source", field_type=IntegerType(), required=False),
    NestedField(field_id=4, name="Destination", field_type=IntegerType(), required=False),
    NestedField(field_id=5, name="DateIn", field_type=TimestampType(), required=False),
    NestedField(field_id=6, name="DateOut", field_type=TimestampType(), required=False),
    NestedField(field_id=7, name="Body", field_type=StringType(), required=False),
)

print(" Schema Defined:")
print(f"   - Total fields: {len(schema.fields)}")
for field in schema.fields:
   print(f"   - {field.name}: {field.field_type}")
print("   - Note: Timestamps use microsecond precision for Iceberg compatibility")

 Schema Defined:
   - Total fields: 7
   - Id: int
   - TelegramCode: int
   - Source: int
   - Destination: int
   - DateIn: timestamp
   - DateOut: timestamp
   - Body: string
   - Note: Timestamps use microsecond precision for Iceberg compatibility


In [8]:
# Define partitioning strategy (by month of DateIn)
partition_spec = PartitionSpec(
    PartitionField(
        source_id=5,  # DateIn field ID
        field_id=1000,
        name="DateIn_month",
        transform=MonthTransform()
    )
)

print("🔧 Partitioning Strategy:")
print(f"   - Partition by: month(DateIn)")
print(f"   - Total partition fields: {len(partition_spec.fields)}")
print("   - Benefits: Efficient time-based queries and data organization")
print("   - Storage optimization: Data automatically organized by month")

🔧 Partitioning Strategy:
   - Partition by: month(DateIn)
   - Total partition fields: 1
   - Benefits: Efficient time-based queries and data organization
   - Storage optimization: Data automatically organized by month


In [9]:
# Create the table with proper authentication
table_identifier = (NAMESPACE, TABLE_NAME)

# Check if table already exists
if table_identifier in catalog.list_tables(namespace=irisa_namespace):
    print(f"⚠ Table '{TABLE_NAME}' already exists in namespace '{NAMESPACE}'")
    print("   Dropping existing table...")
    catalog.drop_table(table_identifier)
    print("   ✓ Existing table dropped")

# Create the new table
try:
    table = catalog.create_table(
        identifier=table_identifier,
        schema=schema,
        partition_spec=partition_spec
    )

    print(f"✅ Table created successfully!")
    print(f"   - Table: {NAMESPACE}.{TABLE_NAME}")
    print(f"   - Location: {table.location()}")
    print(f"   - Format: {table.format_version}")
    print(f"   - Partitioning: {len(partition_spec.fields)} fields")
    print(f"   - Authentication: Using Keycloak token")

except Exception as e:
    print(f"❌ Error creating table: {str(e)}")
    print("   - Check warehouse configuration and permissions")
    raise

⚠ Table 'fake_seclink' already exists in namespace 'irisa'
   Dropping existing table...
   ✓ Existing table dropped
✅ Table created successfully!
   - Table: irisa.fake_seclink
   - Location: s3://irisa-warehouse/ot/01987563-e0b6-7f10-9b7e-f2fbe82c4cc0/01987586-9b4e-7ac2-8252-c532da3126cf
   - Format: 2
   - Partitioning: 1 fields
   - Authentication: Using Keycloak token


In [10]:
# Generate fake data for 6 months (10,000 records)
print("🎲 Generating fake data...")

# Set up date range for 6 months
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 6, 30)
total_records = 10000

# Generate random data
data = []
for i in range(total_records):
    # Random date within the 6-month period
    random_days = random.randint(0, (end_date - start_date).days)
    date_in = start_date + timedelta(days=random_days)

    # Random time within the day
    random_hours = random.randint(0, 23)
    random_minutes = random.randint(0, 59)
    random_seconds = random.randint(0, 59)
    date_in = date_in.replace(hour=random_hours, minute=random_minutes, second=random_seconds)

    # DateOut is typically 1-60 minutes after DateIn
    random_duration = random.randint(1, 60)
    date_out = date_in + timedelta(minutes=random_duration)

    record = {
        "Id": i + 1,
        "TelegramCode": random.randint(1000, 9999),
        "Source": random.randint(1, 5),
        "Destination": random.randint(1, 5),
        "DateIn": date_in,
        "DateOut": date_out,
        "Body": f"Message body for record {i + 1} from source {random.randint(1, 5)} to destination {random.randint(1, 5)}"
    }
    data.append(record)

# Convert to DataFrame
df = pd.DataFrame(data)

# 🔧 CRITICAL FIX: Convert timestamps to microsecond precision
print("🔧 Converting timestamps to microsecond precision...")
df['DateIn'] = df['DateIn'].dt.floor('us')
df['DateOut'] = df['DateOut'].dt.floor('us')
print("✅ Timestamps converted to microsecond precision")

print(f"✅ Generated {len(df)} fake records")
print(f">> Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
print(f"📊 Sample data:")
print(df.head())

# Show distribution by month
monthly_distribution = df['DateIn'].dt.to_period('M').value_counts().sort_index()
print(f"\n📈 Monthly distribution:")
for month, count in monthly_distribution.items():
    print(f"   - {month}: {count} records")

🎲 Generating fake data...
🔧 Converting timestamps to microsecond precision...
✅ Timestamps converted to microsecond precision
✅ Generated 10000 fake records
>> Date range: 2024-01-01 to 2024-06-30
📊 Sample data:
   Id  TelegramCode  Source  Destination              DateIn  \
0   1          6826       1            1 2024-06-25 20:39:44   
1   2          3193       1            5 2024-05-06 01:23:58   
2   3          4904       5            3 2024-02-17 20:10:29   
3   4          3079       2            5 2024-02-27 07:01:34   
4   5          3257       4            2 2024-05-17 06:58:27   

              DateOut                                               Body  
0 2024-06-25 21:21:44  Message body for record 1 from source 2 to des...  
1 2024-05-06 02:00:58  Message body for record 2 from source 5 to des...  
2 2024-02-17 20:40:29  Message body for record 3 from source 4 to des...  
3 2024-02-27 07:30:34  Message body for record 4 from source 3 to des...  
4 2024-05-17 07:13:27  Messa

In [11]:
# Insert data into the Iceberg table with proper authentication
print("📤 Inserting data into Iceberg table...")

try:
    # Load the table (now with authenticated access)
    table = catalog.load_table(table_identifier)

    # 🔧 CRITICAL FIX: Convert DataFrame to PyArrow table with explicit microsecond precision schema
    arrow_table = pa.Table.from_pandas(df, schema=pa.schema([
        pa.field("Id", pa.int32(), nullable=False),  # Required field
        pa.field("TelegramCode", pa.int32(), nullable=True),
        pa.field("Source", pa.int32(), nullable=True),
        pa.field("Destination", pa.int32(), nullable=True),
        pa.field("DateIn", pa.timestamp('us'), nullable=True),  # Microsecond precision
        pa.field("DateOut", pa.timestamp('us'), nullable=True),  # Microsecond precision
        pa.field("Body", pa.string(), nullable=True),
    ]))

    print("✅ PyArrow table created with microsecond precision timestamps")

    # Append data to the table
    table.append(arrow_table)

    print(f"✅ Successfully inserted {len(df)} records into {NAMESPACE}.{TABLE_NAME}")
    print(f"   - Authentication: Using Keycloak token")
    print(f"   - Warehouse: {WAREHOUSE}")

    # Simple verification
    print(f"\n🔍 Data insertion completed successfully!")
    print(f"📊 Expected records: {len(df)}")
    print(f"✅ Table: {NAMESPACE}.{TABLE_NAME} is ready for queries!")

except Exception as e:
    print(f"❌ Error inserting data: {str(e)}")
    print("   - Check authentication and warehouse permissions")
    raise

📤 Inserting data into Iceberg table...
✅ PyArrow table created with microsecond precision timestamps
✅ Successfully inserted 10000 records into irisa.fake_seclink
   - Authentication: Using Keycloak token
   - Warehouse: irisa-ot

🔍 Data insertion completed successfully!
📊 Expected records: 10000
✅ Table: irisa.fake_seclink is ready for queries!
