## 6-2. Change Data Feed


In [1]:
import findspark
findspark.init()

import os
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])

/usr/lib/jvm/java-11-openjdk-amd64
/usr/local/spark


In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Common data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# IPython shell capture: `!pwd` yields the command output as a list of lines,
# so the first element is taken as the notebook's working directory.
work_dir=!pwd
work_dir = work_dir[0]
# Directory passed to Spark as `spark.sql.warehouse.dir` in the builder below.
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "true")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [3]:
# The session must be created through `configure_spark_with_delta_pip` so that the Delta Lake dependencies are loaded correctly
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Settings so that DataFrames are rendered as tables in the notebook
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Optimizations for the local environment
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark

In [5]:
def show(queries, num_rows = 20):
    """Run one or more ``;``-separated SQL statements and print each result.

    Args:
        queries: SQL text; several statements may be separated by ``;``.
        num_rows: Maximum number of rows printed per statement.

    Blank segments (for example the one produced by a trailing ``;``) are
    skipped instead of being sent to Spark as an empty statement.

    Note: the text is split on every ``;``, so a statement containing a
    semicolon inside a string literal is not supported.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing between two separators (or after the last one).
            continue
        spark.sql(statement).show(num_rows, truncate=False)

def sql(query):
    """Execute `query` on the notebook's Spark session and return what `spark.sql` returns."""
    result = spark.sql(query)
    return result

def history(dbName, tableName):
    """Run ``describe history`` for ``dbName.tableName`` and return the result of `spark.sql`."""
    statement = f"describe history {dbName}.{tableName}"
    return spark.sql(statement)

def table(dbName, tableName):
    """Load ``dbName.tableName`` through the ``delta`` reader format and return the result."""
    qualified_name = f"{dbName}.{tableName}"
    delta_reader = spark.read.format("delta")
    return delta_reader.table(qualified_name)

def describe(dbName, tableName, extended = True, num_rows = 20):
    """Print the ``describe`` output of ``dbName.tableName`` via `show`.

    Args:
        dbName: Database name.
        tableName: Table name.
        extended: When truthy, ``describe extended`` is used instead of ``describe``.
        num_rows: Row limit forwarded to `show`.
    """
    command = "describe extended" if extended else "describe"
    show(f"{command} {dbName}.{tableName}", num_rows)

def ls(target):
    """Run ``ls -al`` on `target` through the IPython shell escape; the listing appears in the cell output."""
    !ls -al {target}

def ls_and_head(target, lineno):
    !ls -al {target} | grep -v 'crc' | head -{lineno}

def cat(filename):
    """Print the contents of `filename` with the shell ``cat`` command (IPython shell escape)."""
    !cat {filename}

def grep(keyword, filename):
    """Print the lines of `filename` matching `keyword`, case-insensitively (``grep -i``)."""
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    """Pipe the lines of `filename` matching `keyword` (case-sensitive ``grep``) into ``python -m json.tool``."""
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    """Take the `lineno`-th line of `filename` that matches `keyword` and pipe it into ``python -m json.tool``.

    ``sed -n {lineno}p`` selects a single line out of the ``grep`` output.
    """
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool


In [6]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

def dropAndRemoveTable(dbName, tableName, warehouseDir = "/home/jovyan/work/spark-warehouse"):
    """Remove a table's directory from the warehouse and drop the table from the catalog.

    Args:
        dbName: Database that owns the table.
        tableName: Table to remove.
        warehouseDir: Root of the Spark warehouse. Defaults to the location
            this function previously hard-coded.

    The directory is resolved per database: tables of ``default`` sit
    directly under the warehouse root, while tables of any other database
    sit under ``<dbName>.db/`` (the layout this notebook lists for
    ``taxidb``). Previously the database was ignored, so the files of a
    non-default table were never removed.

    NOTE(review): the directory name uses `tableName` as given; confirm the
    on-disk casing on case-sensitive file systems.
    """
    import os
    import shutil

    if dbName.lower() == "default":
        location = os.path.join(warehouseDir, tableName)
    else:
        location = os.path.join(warehouseDir, "{}.db".format(dbName), tableName)
    # Same best-effort semantics as `rm -rf`: a missing directory is not an error.
    shutil.rmtree(location, ignore_errors=True)
    sql("DROP TABLE IF EXISTS {}.{}".format(dbName, tableName))

In [7]:
sql("show tables")

namespace,tableName,isTemporary
default,delta_v1,False
default,delta_v2,False
default,family,False
default,pusan_popular_trip,False
default,pusan_popular_zorder,False
default,users,False


In [24]:
dbName = "taxidb"
tableName = "tripAggregates"

In [25]:
dropAndRemoveTable(dbName, tableName)

In [26]:
# The table lives in a non-default database that this notebook does not create
# anywhere else; make sure it exists so the cell also runs on a fresh warehouse.
sql("CREATE DATABASE IF NOT EXISTS {}".format(dbName))
sql("""
CREATE TABLE IF NOT EXISTS {}.{} 
(VendorId INT, PassengerCount INT, FareAmount INT)
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
""".format(dbName, tableName))

In [27]:
sql(f"use {dbName}")
sql("show tables")
sql(f"select * from {dbName}.{tableName}")

VendorId,PassengerCount,FareAmount


In [28]:
sql(f"ALTER TABLE {dbName}.{tableName} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

In [29]:
ls(f"spark-warehouse/{dbName}.db/{tableName}")

total 0
drwxrwxrwx 1 jovyan 1000 512 Nov  5 08:00 .
drwxrwxrwx 1 jovyan 1000 512 Nov  5 08:00 ..
drwxrwxrwx 1 jovyan 1000 512 Nov  5  2024 _delta_log


In [38]:
sql(f"INSERT INTO {dbName}.{tableName} VALUES (1, 500, 1000)")
sql(f"UPDATE {dbName}.{tableName} SET FareAmount = 2500 WHERE VendorId = 1")
sql(f"INSERT INTO {dbName}.{tableName} VALUES (4, 500, 1000)")
sql(f"DELETE FROM {dbName}.{tableName} WHERE VendorId = 4")
ls_and_head(f"spark-warehouse/{dbName}.db/{tableName}", 5)

total 20
drwxrwxrwx 1 jovyan 1000  512 Nov  5  2024 .
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:00 ..
drwxrwxrwx 1 jovyan 1000  512 Nov  5  2024 _change_data
drwxrwxrwx 1 jovyan 1000  512 Nov  5  2024 _delta_log


In [40]:
# `ls_and_head` requires the number of lines to keep; pass it explicitly.
ls_and_head(f"spark-warehouse/{dbName}.db/{tableName}/_change_data", 5)

total 16
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 .
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 ..
-rwxrwxrwx 1 jovyan 1000 1354 Nov  5 08:05 cdc-00000-26b8c438-25d0-4d2b-9d3c-8e04a73c7b1f.c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000 1239 Nov  5 08:02 cdc-00000-b4814949-a19c-4a4c-b12f-3ad1a35421c3.c000.snappy.parquet


In [41]:
# `ls_and_head` requires the number of lines to keep; pass it explicitly.
ls_and_head(f"spark-warehouse/{dbName}.db/{tableName}/_delta_log", 5)

total 36
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 .
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 ..
-rwxrwxrwx 1 jovyan 1000  943 Nov  5 08:00 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000  850 Nov  5 08:00 00000000000000000001.json


In [45]:
# NOTE: in the recorded run (engineInfo in the table history reports Delta-Lake/2.0.0)
# this query fails with an AnalysisException because `table_changes` cannot be
# resolved as a table-valued function. Presumably the SQL function only ships
# with later Delta Lake releases — TODO confirm. The following cells read the
# same change feed through the DataFrame reader (`readChangeFeed`) instead.
sql("""
SELECT *
FROM table_changes('{}.{}', 1, 4)
ORDER BY _commit_timestamp
""".format(dbName, tableName))

AnalysisException: could not resolve `table_changes` to a table-valued function; line 3 pos 5

In [46]:
# Read the change feed of commits 1 through 4 with the DataFrame reader.
changeFeed = (
    spark.read
    .format("delta")
    .option("readChangeFeed", True)
    .option("startingVersion", 1)
    .option("endingVersion", 4)
    .table(f"{dbName}.{tableName}")
)

In [47]:
changeFeed.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- FareAmount: integer (nullable = true)
 |-- _change_type: string (nullable = true)
 |-- _commit_version: long (nullable = true)
 |-- _commit_timestamp: timestamp (nullable = true)



In [48]:
changeFeed.show(10, truncate=False)

+--------+--------------+----------+------------+---------------+-----------------------+
|VendorId|PassengerCount|FareAmount|_change_type|_commit_version|_commit_timestamp      |
+--------+--------------+----------+------------+---------------+-----------------------+
|4       |500           |1000      |delete      |4              |2024-11-05 08:02:37.781|
|4       |500           |1000      |delete      |4              |2024-11-05 08:02:37.781|
|4       |500           |1000      |insert      |2              |2024-11-05 08:00:37.562|
|4       |500           |1000      |insert      |3              |2024-11-05 08:02:35.48 |
+--------+--------------+----------+------------+---------------+-----------------------+



In [49]:
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
8,2024-11-05 17:05:46.717,,,DELETE,"{predicate -> [""(spark_catalog.taxidb.tripAggregates.VendorId = 4)""]}",,,,7.0,Serializable,False,"{numRemovedFiles -> 1, numCopiedRows -> 0, numAddedChangeFiles -> 1, executionTimeMs -> 536, numD...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
7,2024-11-05 17:05:44.988,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,6.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 968}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
6,2024-11-05 17:05:43.713,,,UPDATE,{predicate -> (VendorId#3889 = 1)},,,,5.0,Serializable,False,"{numRemovedFiles -> 1, numCopiedRows -> 0, numAddedChangeFiles -> 1, executionTimeMs -> 699, scan...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
5,2024-11-05 17:05:41.841,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,4.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 968}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
4,2024-11-05 17:02:37.781,,,DELETE,"{predicate -> [""(spark_catalog.taxidb.tripAggregates.VendorId = 4)""]}",,,,3.0,Serializable,False,"{numRemovedFiles -> 2, numCopiedRows -> 0, numAddedChangeFiles -> 2, executionTimeMs -> 1169, num...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,2024-11-05 17:02:35.48,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 968}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2024-11-05 17:00:37.562,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 968}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2024-11-05 17:00:32.661,,,SET TBLPROPERTIES,"{properties -> {""delta.enableChangeDataFeed"":""true""}}",,,,0.0,Serializable,True,{},,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-11-05 17:00:25.519,,,CREATE TABLE,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {""delta.enableChangeDat...",,,,,Serializable,True,{},,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [55]:
# Read the whole change feed starting at version 0. `endingVersion` is left out
# on purpose: without it the reader goes up to the latest commit, so no
# arbitrary upper bound (previously 100) has to be guessed.
changes = (
    spark.read
    .format("delta")
    .option("readChangeFeed", True)
    .option("startingVersion", 0)
    .table(f"{dbName}.{tableName}")
)
changes.orderBy(asc("_commit_timestamp")).show(100, truncate=False)

+--------+--------------+----------+----------------+---------------+-----------------------+
|VendorId|PassengerCount|FareAmount|_change_type    |_commit_version|_commit_timestamp      |
+--------+--------------+----------+----------------+---------------+-----------------------+
|4       |500           |1000      |insert          |2              |2024-11-05 08:00:37.562|
|4       |500           |1000      |insert          |3              |2024-11-05 08:02:35.48 |
|4       |500           |1000      |delete          |4              |2024-11-05 08:02:37.781|
|4       |500           |1000      |delete          |4              |2024-11-05 08:02:37.781|
|1       |500           |1000      |insert          |5              |2024-11-05 08:05:41.841|
|1       |500           |1000      |update_preimage |6              |2024-11-05 08:05:43.713|
|1       |500           |2500      |update_postimage|6              |2024-11-05 08:05:43.713|
|4       |500           |1000      |insert          |7      

In [56]:
source=f"spark-warehouse/{dbName}.db/{tableName}"
ls_and_head(source, 10)

total 20
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 .
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:00 ..
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 _change_data
drwxrwxrwx 1 jovyan 1000  512 Nov  5 08:05 _delta_log
-rwxrwxrwx 1 jovyan 1000 1185 Nov  5 08:05 part-00000-0b48e892-6353-4d8d-b156-d4aa897125e4.c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  968 Nov  5 08:05 part-00000-17099711-3cc6-4848-a0dc-00b47f5c4d8d-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  968 Nov  5 08:05 part-00000-465ec638-994d-42a1-831e-3cd9b2d37642-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  968 Nov  5 08:02 part-00000-6330f8a7-eed5-4186-903b-639d06fe7cec-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  968 Nov  5 08:00 part-00000-7cd2c96a-b44a-431b-8691-11d38d341a59-c000.snappy.parquet


In [10]:
dbName = "taxidb"
tableName = "tripAggregates"

show(f"select * from {dbName}.{tableName}")

+--------+--------------+----------+
|VendorId|PassengerCount|FareAmount|
+--------+--------------+----------+
|1       |500           |2500      |
+--------+--------------+----------+



In [11]:
sql(f"INSERT INTO {dbName}.{tableName} VALUES (2, 1000, 2000)")

In [12]:
sql(f"INSERT INTO {dbName}.{tableName} VALUES (3, 3000, 3000)")

In [13]:
dbName = "taxidb"
targetTable = "streamTarget"
target=f"spark-warehouse/{dbName}.db/{targetTable}"
# Build the absolute Delta path from the notebook's working directory
# (`work_dir`, captured in the setup cell) instead of a hard-coded home directory.
spark.sql(f"select * from delta.`{work_dir}/{target}`")

VendorId,PassengerCount,FareAmount,RecordStreamTime
3,3000,3000,2024-11-05 18:16:13.737
2,1000,2000,2024-11-05 18:15:54.323
1,500,2500,2024-11-05 17:34:00.304
