In [None]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

# Lakekeeper catalog endpoint. The value below works for the "docker compose"
# testing and development environment; replace 'server' when running elsewhere
# (for example 'localhost' if Lakekeeper is running locally).
CATALOG_URL = "http://server:8181/catalog"
WAREHOUSE = "demo"
MY_NAMESPACE = "my_db"

# Version of the installed PySpark package
SPARK_VERSION = pyspark.__version__
# Keep only the first two dot-separated components, e.g. "3.5.1" -> "3.5"
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".", 2)[:2])
ICEBERG_VERSION = "1.8.1"

# Connect with Spark

In [None]:
# Spark settings that register an Iceberg REST catalog named "lakekeeper"
# (pointing at CATALOG_URL / WAREHOUSE) and load the Iceberg libraries.
# NOTE: no credentials are passed here; the catalog is expected to assign
# temporary credentials per session.
config = {
    "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakekeeper.type": "rest",
    "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
    "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE,
    "spark.sql.catalog.lakekeeper.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": "lakekeeper",
    # Comma-separated package coordinates, all pinned to ICEBERG_VERSION.
    # NOTE(review): the "_2.12" suffix is hard-coded; confirm it matches the
    # Scala build of the Spark distribution in use.
    "spark.jars.packages": ",".join(
        f"org.apache.iceberg:{artifact}:{ICEBERG_VERSION}"
        for artifact in (
            f"iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12",
            "iceberg-aws-bundle",
        )
    ),
}


In [None]:
# Assemble the Spark configuration in one chain: local master, application
# name, then every key/value pair from `config`.
spark_config = (
    SparkConf()
    .setMaster('local')
    .setAppName("Qlik-Connect-Iceberg-Workshop")
    .setAll(list(config.items()))
)

# Get the existing session or create one with the configuration above
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

# Make "lakekeeper" the current catalog for subsequent SQL operations
spark.sql("USE lakekeeper")

## Create your first Iceberg table
In this section, we'll create an Iceberg table, load a few rows and query it

In [None]:
# Namespaces group catalog resources logically, much like a database.
# IF NOT EXISTS means the statement does not fail when the namespace is already there.
spark.sql(
    f"CREATE NAMESPACE IF NOT EXISTS {MY_NAMESPACE}"
)

# List the namespaces to confirm ours has been created
spark.sql("SHOW NAMESPACES").toPandas()

In [None]:
# Create a simple Iceberg table to represent users, including an ID and a name.
# IF NOT EXISTS (as in the namespace cell) lets this cell be re-run against a
# catalog that already holds the table instead of failing because it exists.
# Note: re-running the INSERT cells afterwards appends their rows again.
spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {MY_NAMESPACE}.users (
                id INT,
                name STRING
            ) USING ICEBERG
          """).toPandas()

In [None]:
# Insert some rows into the table.
# Six rows (ids 1-6) are written by a single INSERT statement.

spark.sql(f"""
            INSERT INTO {MY_NAMESPACE}.users VALUES
            (1, 'roy'),
            (2, 'ori'),
            (3, 'john'),
            (4, 'jason'),
            (5, 'david'),
            (6, 'ajay')
          """).toPandas()

In [None]:
# Read back every row of the users table to see what was inserted
spark.sql(
    f"SELECT * FROM {MY_NAMESPACE}.users"
).toPandas()

# Iceberg tables under the hood

In the following sections we'll look at the structure of an Iceberg table.

## Snapshots
When you query the snapshots information table you'll be able to see the current and previous snapshots of your table.
Pay attention to the `summary` column: `added-records` equals the number of rows we inserted in the previous statement.

In [None]:
# Query the `snapshots` metadata table of the users table
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.snapshots"
).toPandas()

### Inserting a new value
Insert a new value to the table. A new snapshot is created and the row is added into a new data file

In [None]:
# Append a single row (id 7) to the users table
spark.sql(
    f"INSERT INTO {MY_NAMESPACE}.users VALUES (7, 'bob');"
).toPandas()

A second snapshot was created representing the new row we added above.
In the `summary` column, note that `added-records` shows 1 and `total-records` shows 7.

In [None]:
# Query the `snapshots` metadata table again after the insert
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.snapshots"
).toPandas()

### Updating existing values
To update an individual value in a table, use the `UPDATE` keyword with an appropriate `WHERE` to identify the row you want to update

In [None]:
# Change the name of the row with id 5 to 'dave'
spark.sql(f"""
          UPDATE {MY_NAMESPACE}.users
          SET name = 'dave'
          WHERE id = 5
          """).toPandas()

Inspect the snapshots table and note that the new snapshot represents an `overwrite` operation. In this operation, Iceberg deleted one row (`deleted-records=1`), which was the row we updated with its original values, and added one row (`added-records=1`), which is the same row with the new values.

In [None]:
# Query the `snapshots` metadata table again after the update
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.snapshots"
).toPandas()

Inspecting the `manifests` table is another way to understand the changes performed on a specific Iceberg table.

In [None]:
# Query the `manifests` metadata table of the users table
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.manifests"
).toPandas()

### Deleting values

You can delete values from a table using the `DELETE` keyword and a `WHERE` clause to identify which rows to delete

In [None]:
# Remove the row whose id is 5
spark.sql(
    f"DELETE FROM {MY_NAMESPACE}.users WHERE id = 5"
).toPandas()

In [None]:
# Query the `snapshots` metadata table and confirm a new snapshot has been created
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.snapshots"
).toPandas()

Notice that when you execute the following query, no results are returned. This means that no delete files were created when rows were updated or deleted.
Kind of strange, no? Well, in fact that's because the table is by default configured to Copy On Write. In this mode of operation, changes are merged into rewritten data files at write time, so no separate delete files are left behind. This is ideal for batch workloads with flexible latency, but is far less ideal for streaming, near-real-time use cases.

In [None]:
# Query the `all_delete_files` metadata table of the users table
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.all_delete_files"
).toPandas()

### Working with MoR and CoW tables

MoR tables require the query engine to merge data and delete files on read. You can periodically compact these files to make the query engine's life easier.
CoW tables merge the data and delete files when the rows are written. This requires more IO on write, but far less on read.

Spark allows you to configure MoR or CoW for either `delete`, `update` or `merge` operations. This gives you flexibility to control how your tables should be updated.

Start by changing the default mode of operation that Spark uses to write and update Iceberg tables

In [None]:
# Set the table property 'write.update.mode' to 'merge-on-read' on the users table
spark.sql(
    f"ALTER TABLE {MY_NAMESPACE}.users SET TBLPROPERTIES ('write.update.mode'='merge-on-read')"
).toPandas()

Next, let's update a row and see how our table reacts

In [None]:
# Change the name of the row with id 6 to 'bobby'
spark.sql(f"""
            UPDATE {MY_NAMESPACE}.users
            SET name='bobby'
            WHERE id = 6
          """).toPandas()

Check to see if any delete files were created. Remember, previously no delete files were created.

In [None]:
# Query the `all_delete_files` metadata table again after the update
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.all_delete_files"
).toPandas()

Inspecting the `manifests` table also shows that Iceberg created a specific manifest file to track the delete file, along with manifests to track the data files.
You can tell by looking at the `content` column. `0` means manifest tracking data files and `1` means manifest tracking delete files

In [None]:
# Query the `manifests` metadata table; its `content` column distinguishes manifest kinds
spark.sql(
    f"SELECT * FROM lakekeeper.{MY_NAMESPACE}.users.manifests"
).toPandas()