In [None]:
from pyspark.sql.session import SparkSession

# Build (or reuse) the notebook-wide Spark session with Iceberg support.
# To run against the standalone cluster instead of local mode, swap the master
# URL below for "spark://spark-master:7077".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkByExamples.com")
    # Register Iceberg's SQL extensions with the session.
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    # Iceberg runtime artifact resolved through spark.jars.packages.
    .config(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.0",
    )
    # Use Iceberg's SparkSessionCatalog (type "hive") as spark_catalog.
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.iceberg.spark.SparkSessionCatalog",
    )
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    # Thrift endpoint of the Hive metastore.
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083")
    # DataNucleus options passed through to the Hadoop/Hive configuration.
    .config("spark.hadoop.datanucleus.autoCreateSchema", "true")
    .config("spark.hadoop.datanucleus.fixedDatastore", "false")
    # A single shuffle partition is used for this notebook.
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)


In [None]:
# Create the demo Iceberg table in the session catalog (spark_catalog) that the
# session was configured with; no separate path-based catalog is involved.
# IF NOT EXISTS keeps this cell re-runnable: the session points at a Hive
# metastore, so the table presumably outlives the kernel and a plain
# CREATE TABLE would fail on the next "Restart & Run All".
# Note: a table left over from an earlier run keeps its existing rows.
spark.sql("CREATE TABLE IF NOT EXISTS icetable (id bigint, data string) USING iceberg")

In [None]:
# Seed icetable with three literal (id, data) rows.
insert_rows_sql = "INSERT INTO icetable VALUES (1, 'a'), (2, 'b'), (3, 'c');"
spark.sql(insert_rows_sql)

In [None]:
# Read the whole table back and print it ordered by id.
df = spark.sql("SELECT * FROM icetable")
df.orderBy("id").show()

In [None]:
# MERGE the table against a subset of itself: the source is every row with
# id=1, tagged op='delete'; target rows matching on id are deleted.
merge_delete_sql = """
    MERGE INTO icetable t                                                -- a target table
    USING (SELECT *, 'delete' as op FROM icetable WHERE id=1) s          -- the source updates
    ON t.id = s.id                                                       -- condition to find updates for target rows
    WHEN MATCHED AND s.op = 'delete' THEN DELETE                         -- updates
"""
spark.sql(merge_delete_sql)

# Re-read the table and print it ordered by id to show the effect of the merge.
df = spark.sql("SELECT * FROM icetable")
df.orderBy("id").show()

In [None]:
# MERGE the table against a subset of itself: the source is every row with
# data='c', tagged op='increment'; target rows matching on both id and data
# get their id set to id + 1.
merge_increment_sql = """
    MERGE INTO icetable t                                                -- a target table
    USING (SELECT *, 'increment' as op FROM icetable WHERE data='c') s   -- the source updates
    ON t.id = s.id and t.data = s.data                                   -- condition to find updates for target rows
    WHEN MATCHED AND s.op = 'increment' THEN UPDATE SET t.id = t.id + 1  -- updates
"""
spark.sql(merge_increment_sql)

# Re-read the table and print it ordered by id to show the effect of the merge.
df = spark.sql("SELECT * FROM icetable")
df.orderBy("id").show()