# Fix Spark Session Configuration

This notebook fixes the Spark session configuration to properly work with Apache Iceberg and MinIO.

In [1]:
# Import required libraries
from pyspark.sql import SparkSession
import boto3
from botocore.client import Config
import time

## Step 1: Stop any existing Spark session

In [2]:
# Stop any existing Spark session
try:
    SparkSession.builder.getOrCreate().stop()
    print("Stopped existing Spark session")
except:
    print("No existing Spark session to stop")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/09 20:24:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/09 20:25:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/09 20:25:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
Stopped existing Spark session


## Step 2: Create a new Spark session with the correct configuration

In [3]:
# Create a new Spark session with the correct configuration
import os

# Create warehouse directory if it doesn't exist
warehouse_dir = "/opt/bitnami/spark/warehouse"
os.makedirs(warehouse_dir, exist_ok=True)

spark = SparkSession.builder \
    .appName("Banking Reconciliation Demo") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", f"file://{warehouse_dir}") \
    .config("spark.sql.defaultCatalog", "local") \
    .getOrCreate()

print("Spark session created successfully")

25/05/09 20:25:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/09 20:25:02 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
Spark session created successfully


## Step 3: Initialize MinIO buckets

In [4]:
# Initialize MinIO client
s3_client = boto3.client(
    's3',
    endpoint_url='http://minio:9000',
    aws_access_key_id='minio',
    aws_secret_access_key='minio123',
    config=Config(signature_version='s3v4'),
    region_name='us-east-1'
)

# List existing buckets
try:
    existing_buckets = [bucket['Name'] for bucket in s3_client.list_buckets()['Buckets']]
    print(f"Existing buckets: {existing_buckets}")
except Exception as e:
    print(f"Error listing buckets: {str(e)}")
    print("Waiting for MinIO to be ready...")
    time.sleep(10)  # Wait for MinIO to be ready
    existing_buckets = [bucket['Name'] for bucket in s3_client.list_buckets()['Buckets']]

# Create buckets if they don't exist
buckets = ['warehouse', 'raw-data', 'stage-data', 'reconciled-data']
for bucket in buckets:
    if bucket not in existing_buckets:
        try:
            s3_client.create_bucket(Bucket=bucket)
            print(f"Created bucket: {bucket}")
        except Exception as e:
            print(f"Error creating bucket {bucket}: {str(e)}")
    else:
        print(f"Bucket already exists: {bucket}")

Existing buckets: ['test']
Created bucket: warehouse
Created bucket: raw-data
Created bucket: stage-data
Created bucket: reconciled-data


## Step 4: Create Iceberg tables

In [5]:
# Create the banking namespace
spark.sql("CREATE NAMESPACE IF NOT EXISTS local.banking")
print("Created namespace: local.banking")

25/05/07 01:01:45 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: s3a://warehousebanking

In [None]:
# Create source_transactions table
spark.sql("""
CREATE TABLE IF NOT EXISTS local.banking.source_transactions (
  transaction_id STRING,
  source_system STRING,
  transaction_date TIMESTAMP,
  amount DECIMAL(18,2),
  account_id STRING,
  transaction_type STRING,
  reference_id STRING,
  status STRING,
  payload STRING,
  created_at TIMESTAMP,
  processing_timestamp TIMESTAMP
)
USING iceberg
PARTITIONED BY (days(transaction_date), source_system)
""")
print("Created table: local.banking.source_transactions")

In [None]:
# Create reconciliation_results table
spark.sql("""
CREATE TABLE IF NOT EXISTS local.banking.reconciliation_results (
  reconciliation_id STRING,
  batch_id STRING,
  primary_transaction_id STRING,
  secondary_transaction_id STRING,
  match_status STRING,
  discrepancy_type STRING,
  discrepancy_amount DECIMAL(18,2),
  reconciliation_timestamp TIMESTAMP,
  notes STRING
)
USING iceberg
PARTITIONED BY (days(reconciliation_timestamp), match_status)
""")
print("Created table: local.banking.reconciliation_results")

In [None]:
# Create reconciliation_batches table
spark.sql("""
CREATE TABLE IF NOT EXISTS local.banking.reconciliation_batches (
  batch_id STRING,
  reconciliation_date TIMESTAMP,
  source_systems ARRAY<STRING>,
  start_date TIMESTAMP,
  end_date TIMESTAMP,
  status STRING,
  total_transactions BIGINT,
  matched_count BIGINT,
  unmatched_count BIGINT,
  created_at TIMESTAMP,
  completed_at TIMESTAMP
)
USING iceberg
""")
print("Created table: local.banking.reconciliation_batches")

## Step 5: Verify tables

In [None]:
# List tables to verify
print("Verifying tables...")
spark.sql("SHOW TABLES IN local.banking").show()

## Step 6: Test a simple query

In [None]:
# Test a simple query
print("Testing a simple query...")
spark.sql("SELECT * FROM local.banking.source_transactions LIMIT 10").show()