<a href="https://colab.research.google.com/github/ramayer/google-colab-examples/blob/main/Spark_Delta_Tables_more_efficient_MERGE_INTO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Spark Delta Table more efficient MERGE INTO


### Based on

* https://docs.databricks.com/_static/notebooks/merge-in-streaming.html
* https://github.com/delta-io/delta/issues/490
* https://kb.databricks.com/delta/delta-merge-into.html
* https://docs.microsoft.com/en-us/azure/databricks/kb/delta/delta-merge-into



#### Install Java and Spark

In [1]:
!apt-get -qq install -y openjdk-8-jdk-headless > /tmp/apt-get.out
!(wget -q --show-progress -nc https://mirrors.ocf.berkeley.edu/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz)
!tar xf spark-3.1.2-bin-hadoop3.2.tgz



## Install PySpark and related Python libraries



In [2]:
try:
  import pyspark, findspark, delta
except:
  %pip install -q --upgrade pyspark findspark delta


[K     |████████████████████████████████| 281.3 MB 41 kB/s 
[K     |████████████████████████████████| 198 kB 32.7 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Building wheel for delta (setup.py) ... [?25l[?25hdone


# Start a Spark Session


In [3]:
import findspark
import pyspark
import os

# Point the tooling at the JDK and at the Spark distribution unpacked above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

# Memory budget that is reasonable for the tiny single-node Spark "cluster"
# available in a Google Colab notebook.
MAX_MEMORY="8g"

findspark.init()
from pyspark.sql import SparkSession

# Session settings, applied in this order: the Delta Lake package plus its
# SQL extension and catalog, then executor/driver memory.
session_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.executor.memory": MAX_MEMORY,
    "spark.driver.memory": MAX_MEMORY,
}

session_builder = SparkSession.builder.appName("MyApp")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.enableHiveSupport().getOrCreate()
spark

In [4]:
# DDL for a minimal Delta table that is used as a streaming source below.
test_stream_ddl = """
   create table if not exists test_stream (id bigint, data  string) USING DELTA;
"""
spark.sql(test_stream_ddl)


DataFrame[]

In [5]:
# Configure a streaming reader for Delta, then bind it to the `test_stream` table.
test_stream_reader = (
    spark.readStream
    .format("delta")
    .option("maxFilesPerTrigger", 10)
    .option("maxBytesPerTrigger", 1_000_000_000)
    .option("ignoreChanges", "True")
)
input_stream = test_stream_reader.table("test_stream")

In [10]:
# Show the Delta table metadata for the bronze table.
# NOTE(review): `spark_streaming_bronze` is only created in a later cell of this
# notebook, so on a fresh kernel this cell presumably has to run after it - confirm.
bronze_detail = spark.sql("DESCRIBE detail spark_streaming_bronze")
bronze_detail.show()


+------+--------------------+--------------------+-----------+--------------------+--------------------+-------------------+----------------+--------+-----------+----------+----------------+----------------+
|format|                  id|                name|description|            location|           createdAt|       lastModified|partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|
+------+--------------------+--------------------+-----------+--------------------+--------------------+-------------------+----------------+--------+-----------+----------+----------------+----------------+
| delta|e2909139-09e7-47a...|default.spark_str...|       null|file:/content/spa...|2021-11-26 03:07:...|2021-11-26 03:07:45|              []|       0|          0|        {}|               1|               2|
+------+--------------------+--------------------+-----------+--------------------+--------------------+-------------------+----------------+--------+-----------+------

## Create a Spark streaming pipeline that streams data from Bronze -> Silver -> Gold

In [4]:
# Schema of the bronze (raw) Delta table that feeds the streaming pipeline.
bronze_ddl = """
   create table if not exists spark_streaming_bronze (
     id            bigint,
     version_id    int,
     partition_id  int, -- generated always as (id %10) 
     ts            timestamp,
     data          string
   ) USING DELTA;
"""

# Start from a clean slate: drop any previous bronze table, then recreate it.
spark.sql("DROP TABLE IF EXISTS spark_streaming_bronze")
spark.sql(bronze_ddl)


DataFrame[]

In [15]:
# Configure a streaming reader for Delta, then bind it to the bronze table.
bronze_reader = (
    spark.readStream
    .format("delta")
    .option("maxFilesPerTrigger", 10)
    .option("maxBytesPerTrigger", 1_000_000_000)
    .option("ignoreChanges", "True")
)
bronze_input_stream = bronze_reader.table("spark_streaming_bronze")

# Display whether the resulting DataFrame is a streaming DataFrame.
bronze_input_stream.isStreaming

AnalysisException: ignored

In [5]:
# Create (or overwrite) three empty silver tables that share the bronze schema:
# `limit(0)` keeps the columns but writes no rows.
df = spark.read.table("spark_streaming_bronze")
for silver_table in (
    "spark_streaming_silver_naive",
    "spark_streaming_silver_theoretically_better",
    "spark_streaming_silver_actually_better",
):
  df.limit(0).write.format("delta").mode("overwrite").saveAsTable(silver_table)

## Set up the streaming operations from Bronze to Silver

In [None]:
def upsertToDelta(microBatchOutputDF, batchId): 
  '''
    Upsert one streaming micro-batch into `spark_streaming_silver_naive`.

    Naive approach from https://docs.databricks.com/_static/notebooks/merge-in-streaming.html
    Can result in many unnecessary rows streamed downstream.

    Rows are matched on `id`. The silver table is created from the bronze
    schema (id, version_id, partition_id, ts, data), which has no `key`
    column, so the `s.key = t.key` condition of the reference notebook
    cannot be resolved against these tables.

    Parameters:
      microBatchOutputDF: the micro-batch DataFrame handed over by foreachBatch.
      batchId: the micro-batch id; it is not used here.

    Returns:
      Always 1.
  '''
  # Expose the micro-batch to SQL under the name used by the MERGE statement.
  microBatchOutputDF.createOrReplaceTempView("updates")
  # Run the MERGE through the session attached to the micro-batch DataFrame
  # (same pattern as the referenced notebook), where the `updates` view was
  # just registered.
  # NOTE(review): if one micro-batch can carry several rows with the same `id`
  # (e.g. several version_id values), the source may need de-duplicating
  # first - confirm against the data that is written to bronze.
  microBatchOutputDF._jdf.sparkSession().sql("""
    MERGE INTO spark_streaming_silver_naive t
    USING updates s
    ON s.id = t.id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
  """)
  return 1

# Describe the streaming write first: a 5-second processing-time trigger, a
# checkpoint location, and `upsertToDelta` as the per-micro-batch handler.
naive_writer = (
    bronze_input_stream.writeStream
    .format("delta")
    .trigger(processingTime='5 seconds')
    .option("checkpointLocation", "/tmp/naive_checkpoint_1")
    .foreachBatch(upsertToDelta)
    .outputMode("update")
)

# Starting the writer launches the streaming query.
naive_output_stream = naive_writer.start()


In [None]:
# Take the first active streaming query and display its recent progress reports.
active_queries = spark.streams.active
s = active_queries[0]
s.recentProgress

# Show table histories



In [92]:
import IPython.display
import html
import datetime
import json

def html_escape(c):
  """Return a string rendering of one cell value for use inside an HTML table.

  * dict values are pretty-printed as JSON, HTML-escaped and wrapped in <pre>;
  * datetime values are returned in ISO-8601 form (not escaped);
  * anything else is converted with str() and HTML-escaped.
  """
  if isinstance(c, dict):
    pretty_json = json.dumps(c, indent=1)
    return "<pre>" + html.escape(pretty_json) + "</pre>"
  if isinstance(c, datetime.datetime):
    return c.isoformat()
  as_text = str(c)
  return html.escape(as_text)

def df_to_html(df,rows=10,title = None):
  """Render the first `rows` rows of a Spark DataFrame as an HTML fragment.

  Parameters:
    df: the DataFrame to render; it must provide `take(n)`, and the returned
        rows must provide `asDict(recursive)`.
    rows: maximum number of rows to fetch and render.
    title: optional heading text; it is inserted as-is (not escaped).

  Returns:
    A string made of an <h2> heading, a <style> rule and a <table>. The
    header row is built from the keys of the first row, so a DataFrame
    without rows yields an empty table.
  """
  html_rows = []
  # Iterate over the DataFrame that was passed in; the previous code read an
  # undefined global (`hist_df`) instead of the `df` parameter.
  for idx,row in enumerate(df.take(rows)):
    # recursive=True also converts nested Row values into plain dicts.
    data = row.asDict(True)
    if idx == 0:
      cells = [html.escape(v) for v in data.keys()]
      html_rows.append("<tr><th>"+"</th><th>".join(cells)+"</th></tr>")
    cells = [html_escape(v) for v in data.values()]
    html_rows.append("<tr><td>"+"</td><td>".join(cells)+"</td></tr>")
  h = "<table>" + ("\n".join(html_rows)) + "</table>"
  style = """<style>td {border: 1px solid black}</style>"""
  return "<h2>"+(title or '')+"</h2>"+style + h


def table_history(tbl):
  """Run `describe history` on table `tbl` and keep four columns of the result.

  The selected expressions are timestamp, operation, operationmetrics and
  operationParameters.
  """
  wanted_columns = "timestamp, operation, operationmetrics,operationParameters".split(',')
  full_history = spark.sql(f"""describe history {tbl}""")
  return full_history.selectExpr(wanted_columns)

# (table name, heading) pairs, rendered top to bottom as one HTML document.
history_sections = [
    ('spark_streaming_bronze',       'source'),
    ('spark_streaming_silver_naive', 'silver_1'),
    # NOTE(review): the bronze table is listed a second time under the same
    # heading - possibly a leftover placeholder for another table; confirm.
    ('spark_streaming_bronze',       'source'),
]

IPython.display.HTML(
    "".join(
        df_to_html(table_history(table_name), title=heading)
        for table_name, heading in history_sections
    )
)


AnalysisException: ignored

## Reddit questions

In [None]:
# https://old.reddit.com/r/apachespark/comments/qrh5qn/join_3k_rows_with_4m_rows_to_create_3bn_row_delta/
import time

# Sizes of the two tables and the number of distinct values in the join column.
small_tbl_size = 3000
large_tbl_size = 4_000_000
join_col_cardinality = 4

# Each row is (id, "row <id>", id modulo the join-column cardinality).
small_data = [
    (row_id, f"row {row_id}", row_id % join_col_cardinality)
    for row_id in range(small_tbl_size)
]
large_data = [
    (row_id, f"row {row_id}", row_id % join_col_cardinality)
    for row_id in range(large_tbl_size)
]

table_schema = 'id int, txt string, join_col int'
small_df = spark.createDataFrame(small_data, table_schema)
large_df = spark.createDataFrame(large_data, table_schema)

small_df.createOrReplaceTempView('small_tbl')
large_df.createOrReplaceTempView('large_tbl')

join_query = """
    SELECT * 
      FROM small_tbl AS s
      JOIN large_tbl AS l ON (s.join_col = l.join_col)
  """

# Time how long it takes to display the first rows of the join result.
t0 = time.time()
spark.sql(join_query).show()
elapsed = time.time() - t0
print(f"{elapsed} seconds")

+---+-----+--------+---+------+--------+
| id|  txt|join_col| id|   txt|join_col|
+---+-----+--------+---+------+--------+
|  1|row 1|       1|  1| row 1|       1|
|  1|row 1|       1|  5| row 5|       1|
|  1|row 1|       1|  9| row 9|       1|
|  1|row 1|       1| 13|row 13|       1|
|  1|row 1|       1| 17|row 17|       1|
|  1|row 1|       1| 21|row 21|       1|
|  1|row 1|       1| 25|row 25|       1|
|  1|row 1|       1| 29|row 29|       1|
|  1|row 1|       1| 33|row 33|       1|
|  1|row 1|       1| 37|row 37|       1|
|  1|row 1|       1| 41|row 41|       1|
|  1|row 1|       1| 45|row 45|       1|
|  1|row 1|       1| 49|row 49|       1|
|  1|row 1|       1| 53|row 53|       1|
|  1|row 1|       1| 57|row 57|       1|
|  1|row 1|       1| 61|row 61|       1|
|  1|row 1|       1| 65|row 65|       1|
|  1|row 1|       1| 69|row 69|       1|
|  1|row 1|       1| 73|row 73|       1|
|  1|row 1|       1| 77|row 77|       1|
+---+-----+--------+---+------+--------+
only showing top

In [None]:
# https://www.reddit.com/r/apachespark/comments/r0fwrx/merge_two_rdds/
d1 = [3,5,8]
d2 = [1,2,3,4]

# createOrReplaceTempView() returns None, so build the DataFrames first and
# register the views in a separate step; df1/df2 then hold real DataFrames.
df1 = spark.createDataFrame(d1,'int')
df2 = spark.createDataFrame(d2,'int')
df1.createOrReplaceTempView('v1')
df2.createOrReplaceTempView('v2')

# Prefix each value of v2 to the full list of v1 values. The subquery yields a
# single row and there is no join condition, so the join is written as an
# explicit CROSS JOIN.
spark.sql("""
   select flatten(array(array(v2.value),v1s.values))
     from v2 
     cross join (select collect_list(value) as values from v1) as v1s
""").show()

+------------------------------------+
|flatten(array(array(value), values))|
+------------------------------------+
|                        [1, 3, 5, 8]|
|                        [2, 3, 5, 8]|
|                        [3, 3, 5, 8]|
|                        [4, 3, 5, 8]|
+------------------------------------+

