# デルタレイク クイックスタート Delta Lake Quickstart
https://docs.delta.io/latest/quick-start.html#python

# 本サンプルの目的 Objective
jsonデータをデルタテーブルに格納する

In [1]:
import pyspark
from delta import *

# Session builder with the Delta Lake SQL extension and catalog registered.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# The builder is passed through the delta helper before the session is
# created (or an existing one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

# Input JSON file and Delta table location, both under the workspace root.
workspace = "/workspace"
file_name = "customers_10000"
file_ext = ".json"
data_path = f"{workspace}/json/{file_name}{file_ext}"
delta_table_name = "customers_noflatten"
delta_table_path = f"{workspace}/tables/{delta_table_name}"

:: loading settings :: url = jar:file:/usr/local/lib/python3.11/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vscode/.ivy2/cache
The jars for the packages stored in: /home/vscode/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e7ba7f8c-9cd5-4bba-b9da-2799b102bb50;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 118ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0

# 処理開始
jsonデータを取得する

In [2]:
# Read the source file with the "multiline" JSON option enabled, then print
# the inferred schema and display up to 100 rows.
json_reader = spark.read.option("multiline", "true")
jsonDf = json_reader.json(data_path)
jsonDf.printSchema()
jsonDf.show(100)

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip_code: string (nullable = true)
 |-- biography: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- job: struct (nullable = true)
 |    |-- area: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- vehicles: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- model: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- vehicle: string (nullable = true)
 |-- zodiacSign: string (nullable = tr

# デルタテーブルにデータを書き込む
ローカル環境上で`saveAsTable`を使おうとすると、エラーが発生する。<br>
原因は、今の所、調査中。<br>
なので、`save`を使って、データを書き込んでいる。

In [3]:
# Write the DataFrame in Delta format to delta_table_path, overwriting any
# existing data and partitioning by the zodiacSign column.
delta_writer = jsonDf.write.format("delta").mode("overwrite")
delta_writer.partitionBy("zodiacSign").save(delta_table_path)

                                                                                

# デルタテーブルを読み込む

In [4]:
# Load the Delta table from disk and register it as a temp view so it can be
# queried with SQL.
df = spark.read.format("delta").load(delta_table_path)
df.createOrReplaceTempView(delta_table_name)

# Store the view name in the session conf; the query below references it
# through the ${dq.val.delta_table_name} placeholder.
spark.conf.set("dq.val.delta_table_name", delta_table_name)

# Select every column, ordered by ascending id.
customers_by_id_sql = """
    SELECT
      id,
      name,
      email,
      phone_number,
      sex,
      gender,
      biography,
      address,
      job,
      vehicles,
      zodiacSign
    FROM ${dq.val.delta_table_name}
    ORDER BY CAST(id AS BIGINT) ASC
  """
spark.sql(customers_by_id_sql).show(100)

+---+-------------+--------------------+-------------+------+--------------------+--------------------+-------------------------+--------------------+--------------------+-----------+
| id|         name|               email| phone_number|   sex|              gender|           biography|                  address|                 job|            vehicles| zodiacSign|
+---+-------------+--------------------+-------------+------+--------------------+--------------------+-------------------------+--------------------+--------------------+-----------+
|  1|      林 キヨ|or4j6j_kdkkep@yah...|080-6370-1059|female|   Transexual person|entrepreneur, pho...|  {愛菜区, 35.9233, -3...|{Functionality, C...|[{Model T, Extend...|     Cancer|
|  2|    鈴木 一男|  hlzgp861@gmail.com|090-7088-5747|female|          Transexual|catalysis support...|  {結愛村, 20.0774, -6...|{Directives, Inte...|[{CTS, Crew Cab P...|      Aries|
|  3|    山口 優奈|u5rhm2.i9dn5c@gma...|050-7740-5197|  male|   Two-spirit person|model, streamer