In [1]:
from pyspark.sql import SparkSession

# Initialize Spark with Iceberg catalog configurations
spark = SparkSession.builder \
    .appName("Iceberg-MinIO-Demo") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "s3a://local-datalake/warehouse") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl.disable.cache", "true") \
    .config("spark.hadoop.fs.s3a.fast.upload", "true") \
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000") \
    .config("spark.hadoop.fs.s3a.attempts.maximum", "20") \
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000") \
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2") \
    .getOrCreate()

spark.conf.set("spark.hadoop.fs.s3a.access.key", "fakesecret")
spark.conf.set("spark.hadoop.fs.s3a.secret.key", "fakesecret")
spark.conf.set("spark.hadoop.fs.s3a.endpoint", "http://minio-service:9000")

# Use AnonymousAWSCredentialsProvider for read operations
spark.conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# Add ACL permissions
spark.conf.set("spark.hadoop.fs.s3a.acl.default", "PublicReadWrite")
spark.conf.set("spark.hadoop.fs.s3a.multipart.size", "5242880")

# Print Spark configurations to verify they are set correctly
print("Spark Iceberg Configurations:")
print(f"Catalog type: {spark.conf.get('spark.sql.catalog.local.type')}")
print(f"Warehouse location: {spark.conf.get('spark.sql.catalog.local.warehouse')}")
print(f"S3A endpoint: {spark.conf.get('spark.hadoop.fs.s3a.endpoint')}")

# Create namespace for our tables
print("\nCreating namespace...")
spark.sql("CREATE NAMESPACE IF NOT EXISTS local.demo")
print("Namespace created: local.demo")

Spark Iceberg Configurations:
Catalog type: hadoop
Warehouse location: s3a://local-datalake/warehouse
S3A endpoint: http://minio-service:9000

Creating namespace...
Namespace created: local.demo


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import datetime

# Define schema for our data
schema = StructType([
    StructField("user_id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("signup_date", TimestampType(), True)
])

# Create sample data
current_time = datetime.datetime.now()
data = [
    ("user1", "Alice", 34, current_time - datetime.timedelta(days=30)),
    ("user2", "Bob", 45, current_time - datetime.timedelta(days=25)),
    ("user3", "Carol", 27, current_time - datetime.timedelta(days=20))
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Write DataFrame to Iceberg table
table_name = "local.demo.users"
df.writeTo(table_name) \
    .tableProperty("write.format.default", "parquet") \
    .tableProperty("write.parquet.compression-codec", "snappy") \
    .createOrReplace()

print(f"Successfully created Iceberg table: {table_name}")

Successfully created Iceberg table: local.demo.users


In [4]:
schema = spark.table(table_name).schema
print(schema)

StructType([StructField('user_id', StringType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('signup_date', TimestampType(), True)])


In [5]:
spark.table(table_name).show()

+-------+-----+---+--------------------+
|user_id| name|age|         signup_date|
+-------+-----+---+--------------------+
|  user1|Alice| 34|2025-02-20 18:28:...|
|  user2|  Bob| 45|2025-02-25 18:28:...|
|  user3|Carol| 27|2025-03-02 18:28:...|
+-------+-----+---+--------------------+



In [6]:
spark.sql("SHOW TABLES IN local.demo").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|     demo|    users|      false|
+---------+---------+-----------+



In [10]:
history_df = spark.sql(f"SELECT * FROM {table_name}.history")
history_df.show()

+--------------------+-------------------+---------+-------------------+
|     made_current_at|        snapshot_id|parent_id|is_current_ancestor|
+--------------------+-------------------+---------+-------------------+
|2025-03-22 18:28:...|5519466210033624142|     NULL|               true|
+--------------------+-------------------+---------+-------------------+



In [11]:
spark.sql(f"DESCRIBE TABLE {table_name}").show()

+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|    user_id|   string|   NULL|
|       name|   string|   NULL|
|        age|      int|   NULL|
|signup_date|timestamp|   NULL|
+-----------+---------+-------+



In [13]:
# Add new data to the table
new_data = [
    ("user6", "Frank", 42, datetime.datetime.now()),
    ("user7", "Grace", 31, datetime.datetime.now())
]

new_df = spark.createDataFrame(new_data, schema)

# Append new rows to the table
try:
    new_df.writeTo(table_name).append()
    print(f"Successfully appended new data to {table_name}")
except Exception as e:
    print(f"Error appending data: {str(e)}")

try:
    updated_df = spark.table(table_name)
    updated_df.show()
except Exception as e:
    print(f"Error reading updated data: {str(e)}")

Successfully appended new data to local.demo.users
+-------+-----+---+--------------------+
|user_id| name|age|         signup_date|
+-------+-----+---+--------------------+
|  user6|Frank| 42|2025-03-22 18:32:...|
|  user6|Frank| 42|2025-03-22 18:31:...|
|  user1|Alice| 34|2025-02-20 18:28:...|
|  user7|Grace| 31|2025-03-22 18:32:...|
|  user2|  Bob| 45|2025-02-25 18:28:...|
|  user7|Grace| 31|2025-03-22 18:31:...|
|  user3|Carol| 27|2025-03-02 18:28:...|
+-------+-----+---+--------------------+



In [16]:
# Add a new column to the schema
try:
    spark.sql(f"ALTER TABLE {table_name} ADD COLUMN email STRING")
    print(f"Successfully added 'email' column to {table_name}")
except Exception as e:
    print(f"Error adding column: {str(e)}")

# View the updated schema
print("\nUpdated table schema:")
try:
    spark.sql(f"DESCRIBE TABLE {table_name}").show()
except Exception as e:
    print(f"Error describing table: {str(e)}")

# View the updated data
print("\nTable data with new column:")
try:
    evolved_df = spark.table(table_name)
    evolved_df.show()
except Exception as e:
    print(f"Error reading evolved data: {str(e)}")

Error adding column: [FIELDS_ALREADY_EXISTS] Cannot add column, because `email` already exists in "STRUCT<user_id: STRING, name: STRING, age: INT, signup_date: TIMESTAMP, email: STRING>".; line 1 pos 0;
AddColumns [QualifiedColType(None,email,StringType,true,None,None,None)]
+- ResolvedTable org.apache.iceberg.spark.SparkCatalog@2abc31bd, demo.users, local.demo.users, [user_id#435, name#436, age#437, signup_date#438, email#439]


Updated table schema:
+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|    user_id|   string|   NULL|
|       name|   string|   NULL|
|        age|      int|   NULL|
|signup_date|timestamp|   NULL|
|      email|   string|   NULL|
+-----------+---------+-------+


Table data with new column:
+-------+-----+---+--------------------+-----+
|user_id| name|age|         signup_date|email|
+-------+-----+---+--------------------+-----+
|  user6|Frank| 42|2025-03-22 18:31:...| NULL|
|  user6|Frank| 42|2025-03-22 18:32:..

In [17]:
# First, view the table history to get snapshot IDs
print("Table history:")
try:
    history_df = spark.sql(f"SELECT * FROM {table_name}.history")
    history_df.show()
except Exception as e:
    print(f"Error getting history: {str(e)}")

# Get snapshot IDs dynamically
try:
    snapshots = history_df.select("snapshot_id").collect()
    if len(snapshots) >= 2:
        # Get the second most recent snapshot (before our latest changes)
        previous_snapshot_id = snapshots[1][0]
        
        print(f"\nTime travel to snapshot: {previous_snapshot_id}")
        
        # Read table at the previous snapshot
        previous_df = spark.read.format("iceberg").option("snapshot-id", previous_snapshot_id).load(table_name)
        print("Table data at previous snapshot:")
        previous_df.show()
    else:
        print("Not enough snapshots available for time travel demonstration")
except Exception as e:
    print(f"Error during time travel: {str(e)}")

Table history:
+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-03-22 18:28:...|5519466210033624142|               NULL|               true|
|2025-03-22 18:31:...|6069084531131085546|5519466210033624142|               true|
|2025-03-22 18:32:...|3818681792534402167|6069084531131085546|               true|
+--------------------+-------------------+-------------------+-------------------+


Time travel to snapshot: 6069084531131085546
Table data at previous snapshot:
+-------+-----+---+--------------------+
|user_id| name|age|         signup_date|
+-------+-----+---+--------------------+
|  user1|Alice| 34|2025-02-20 18:28:...|
|  user6|Frank| 42|2025-03-22 18:31:...|
|  user2|  Bob| 45|2025-02-25 18:28:...|
|  user7|Grace| 31|2025-03-22 18:31:...|
|  user3|Carol| 27|2025-03-02 18:28:.