In [9]:

# Querying Parquet Files from S3
import duckdb
import pandas as pd
import boto3
import os
from pathlib import Path

# Method 1: Using DuckDB to query parquet files directly from S3
# DuckDB has built-in S3 support and can query parquet files directly from S3 URLs

# Connect to DuckDB
con = duckdb.connect()

# Install and load httpfs extension for S3 support
print("Installing and loading httpfs extension...")
con.execute("INSTALL httpfs")
con.execute("LOAD httpfs")

# Configure DuckDB with AWS credentials and region
# Get credentials from environment variables
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_DEFAULT_REGION', 'il-central-1')

if aws_access_key and aws_secret_key:
    con.execute(f"SET s3_region='{aws_region}'")
    con.execute(f"SET s3_access_key_id='{aws_access_key}'")
    con.execute(f"SET s3_secret_access_key='{aws_secret_key}'")
    print(f"✅ DuckDB configured for AWS S3 access in region: {aws_region}")
    print(f"✅ Using credentials: {aws_access_key[:10]}...")
else:
    print("⚠️ AWS credentials not found in environment variables")
    print(" Run: source .env")

# Presigned URL for the parquet file
s3_presigned_url = "https://ais-research-data.s3.il-central-1.amazonaws.com/parquet/year%3D2017/month%3D3/day%3D1/01_exactEarth_historical_data_2017-03-01.parquet?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAQ67NRVWQK3FPSR5P%2F20250913%2Fil-central-1%2Fs3%2Faws4_request&X-Amz-Date=20250913T230937Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=289a0459fc168aa5fd16d5518818f3d924adb64bb0ef435476df99cb9080eda7"

# Query the parquet file from S3
try:
    print(f"Querying: {s3_presigned_url}")
    # Fixed: Use the correct variable name
    result = con.execute(f"SELECT * FROM '{s3_presigned_url}' LIMIT 10").fetchdf()
    print("Sample data from S3 parquet file:")
    print(result)
    print(f"\nDataFrame shape: {result.shape}")
    print(f"Columns: {list(result.columns)}")
except Exception as e:
    print(f"Error querying S3 parquet file: {e}")
    print("Make sure your AWS credentials are configured and the S3 URL is correct")

# Alternative: If you don't need AWS credentials (since you have a presigned URL)
# You can try without setting credentials
try:
    print("\n--- Trying without explicit credentials (presigned URL should work) ---")
    con_simple = duckdb.connect()
    con_simple.execute("INSTALL httpfs")
    con_simple.execute("LOAD httpfs")
    
    result2 = con_simple.execute(f"SELECT * FROM '{s3_presigned_url}' LIMIT 5").fetchdf()
    print("Success with presigned URL:")
    print(result2)
except Exception as e:
    print(f"Error with presigned URL approach: {e}")

# Close connections
con.close()

Installing and loading httpfs extension...
✅ DuckDB configured for AWS S3 access in region: il-central-1
✅ Using credentials: AKIAQ67NRV...
Querying: https://ais-research-data.s3.il-central-1.amazonaws.com/parquet/year%3D2017/month%3D3/day%3D1/01_exactEarth_historical_data_2017-03-01.parquet?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAQ67NRVWQK3FPSR5P%2F20250913%2Fil-central-1%2Fs3%2Faws4_request&X-Amz-Date=20250913T230937Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=289a0459fc168aa5fd16d5518818f3d924adb64bb0ef435476df99cb9080eda7
Sample data from S3 parquet file:
      base_date_time  year  month  day  hour  minute  second        lon  \
0  2017/3/1 00:00:00   NaN    NaN  NaN   NaN     NaN    13.0 -40.782283   
1  2017/3/1 00:00:00   NaN    NaN  NaN   NaN     NaN    54.0 -46.323187   
2  2017/3/1 00:00:00   NaN    NaN  NaN   NaN     NaN    51.0 -18.380190   
3  2017/3/1 00:00:00   NaN    NaN  NaN   NaN     NaN    42.0 -18.953433   
4  2017/3/1 00:00:00   NaN  

In [None]:

# Querying Parquet Files from S3
import duckdb
import pandas as pd
import boto3
import os
from pathlib import Path

# Method 1: Using DuckDB to query parquet files directly from S3
# DuckDB has built-in S3 support and can query parquet files directly from S3 URLs

# Connect to DuckDB
con = duckdb.connect()

# Install and load httpfs extension for S3 support
print("Installing and loading httpfs extension...")
con.execute("INSTALL httpfs")
con.execute("LOAD httpfs")

# Configure DuckDB with AWS credentials and region
# Get credentials from environment variables
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_DEFAULT_REGION', 'il-central-1')

if aws_access_key and aws_secret_key:
    con.execute(f"SET s3_region='{aws_region}'")
    con.execute(f"SET s3_access_key_id='{aws_access_key}'")
    con.execute(f"SET s3_secret_access_key='{aws_secret_key}'")
    print(f"✅ DuckDB configured for AWS S3 access in region: {aws_region}")
    print(f"✅ Using credentials: {aws_access_key[:10]}...")
else:
    print("⚠️ AWS credentials not found in environment variables")
    print(" Run: source .env")

# Presigned URL for the parquet file
s3_presigned_url = "https://ais-research-data.s3.il-central-1.amazonaws.com/parquet/year%3D2017/month%3D3/day%3D1/01_exactEarth_historical_data_2017-03-01.parquet?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAQ67NRVWQK3FPSR5P%2F20250913%2Fil-central-1%2Fs3%2Faws4_request&X-Amz-Date=20250913T230937Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=289a0459fc168aa5fd16d5518818f3d924adb64bb0ef435476df99cb9080eda7"

# Query the parquet file from S3
try:
    print(f"Querying: {s3_presigned_url}")
    # Fixed: Use the correct variable name
    result = con.execute(f"SELECT * FROM '{s3_presigned_url}' LIMIT 10").fetchdf()
    print("Sample data from S3 parquet file:")
    print(result)
    print(f"\nDataFrame shape: {result.shape}")
    print(f"Columns: {list(result.columns)}")
except Exception as e:
    print(f"Error querying S3 parquet file: {e}")
    print("Make sure your AWS credentials are configured and the S3 URL is correct")

# Alternative: If you don't need AWS credentials (since you have a presigned URL)
# You can try without setting credentials
try:
    print("\n--- Trying without explicit credentials (presigned URL should work) ---")
    con_simple = duckdb.connect()
    con_simple.execute("INSTALL httpfs")
    con_simple.execute("LOAD httpfs")
    
    result2 = con_simple.execute(f"SELECT * FROM '{s3_presigned_url}' LIMIT 5").fetchdf()
    print("Success with presigned URL:")
    print(result2)
except Exception as e:
    print(f"Error with presigned URL approach: {e}")

# Close connections
con.close()

Installing and loading httpfs extension...
✅ DuckDB configured for AWS S3 access in region: il-central-1
✅ Using credentials: AKIAQ67NRV...
Querying: s3://ais-research-data/parquet/year=2017/month=3/day=1/01_exactEarth_historical_data_2017-03-01.parquet
Error querying S3 parquet file: HTTP Error: HTTP GET error on 'https://ais-research-data.s3.amazonaws.com/parquet/year%3D2017/month%3D3/day%3D1/01_exactEarth_historical_data_2017-03-01.parquet' (HTTP 400)

Bad Request - this can be caused by the S3 region being set incorrectly.
* Provided region is "il-central-1"
Make sure your AWS credentials are configured and the S3 URL is correct
