# クイックスタート
https://docs.delta.io/latest/quick-start.html#python

In [1]:
import pyspark
from delta import *
import packages.modules as modules
from pyspark.sql.functions import explode, col, flatten, arrays_zip
from pyspark.sql.types import StringType, StructField, StructType

# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip comes from the `delta` package (star import above).
spark = configure_spark_with_delta_pip(builder).getOrCreate()
# Raise the field limit used when Spark renders wide schemas/plans as strings.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

# Input / output locations, all rooted at the workspace directory.
workspace = "/workspace"
file_name = "customers_10000"
file_ext = ".json"
data_path = f"{workspace}/json/{file_name}{file_ext}"
delta_table_name = "customers_flatten"
delta_table_path = f"{workspace}/tables/{delta_table_name}"

:: loading settings :: url = jar:file:/usr/local/lib/python3.11/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vscode/.ivy2/cache
The jars for the packages stored in: /home/vscode/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-341b07fe-57fc-4962-a473-45dc1c2bc589;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/3.0.0/delta-spark_2.12-3.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-spark_2.12;3.0.0!delta-spark_2.12.jar (1326ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/3.0.0/delta-storage-3.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;3.0.0!delta-storage.jar (250ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.9.3/antlr4-runtime-4.9.3.jar ...
	[SUCCESSFUL ] org.antlr#antlr4-runtime;4.9.3!antlr4-runtime.jar (273ms)
:: resolution report :: resolve 3768ms :

# 処理開始
JSONファイルを読み込み、スキーマと先頭100行を表示する。

In [2]:
# Load the customer JSON file with the "multiline" reader option enabled.
jsonDf = (
    spark.read
    .option("multiline", "true")
    .json(data_path)
)

# Inspect the inferred schema, then print up to 100 rows.
jsonDf.printSchema()
jsonDf.show(n=100)

                                                                                

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip_code: string (nullable = true)
 |-- biography: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- job: struct (nullable = true)
 |    |-- area: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- vehicles: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- model: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- vehicle: string (nullable = true)
 |-- zodiacSign: string (nullable = tr

# flattenを実行

In [3]:
# Flatten the raw frame with the project helper.
# NOTE(review): presumably struct fields become `<parent>_<field>` columns while
# the `vehicles` array is left as-is -- confirm against modules.flatten_df.
df_flat = modules.flatten_df(jsonDf)

# Show a single (truncated) row and the resulting schema.
df_flat.show(n=1, truncate=True)
df_flat.printSchema()

+--------------------+--------------------+-----------------+---+-------+-------------+------+--------------------+----------+-------------+---------------+--------------------+--------+------------+----------------+-----------------+-------------+----------------+
|           biography|               email|           gender| id|   name| phone_number|   sex|            vehicles|zodiacSign|     job_area|job_description|           job_title|job_type|address_city|address_latitude|address_longitude|address_state|address_zip_code|
+--------------------+--------------------+-----------------+---+-------+-------------+------+--------------------+----------+-------------+---------------+--------------------+--------+------------+----------------+-----------------+-------------+----------------+
|entrepreneur, pho...|or4j6j_kdkkep@yah...|Transexual person|  1|林 キヨ|080-6370-1059|female|[{Model T, Extend...|    Cancer|Functionality|        Central|International Fun...| Planner|      愛菜区|         

# 展開を実行

In [4]:
# Scalar columns carried through unchanged. Declared once so the two
# projections below cannot drift apart.
customer_columns = [
    "id",
    "name",
    "email",
    "phone_number",
    "sex",
    "gender",
    "biography",
    "address_zip_code",
    "address_state",
    "address_city",
    "address_longitude",
    "address_latitude",
    "job_title",
    "job_description",
    "job_type",
    "job_area",
]

# Step 1: explode the `vehicles` array into one row per element, naming the
#         resulting struct explicitly instead of relying on the default `col`.
# Step 2: expand that struct into its own top-level columns.
# `zodiacSign` stays last so the output column order is unchanged.
# NOTE(review): explode() drops rows whose array is null or empty; switch to
# explode_outer() if customers without vehicles must be kept.
df_flat_explode = (
    df_flat
    .select(*customer_columns, explode("vehicles").alias("vehicle_item"), "zodiacSign")
    .select(*customer_columns, "vehicle_item.*", "zodiacSign")
)
df_flat_explode.show(10, truncate=False)
df_flat_explode.printSchema()

+---+---------+------------------------------+-------------+------+--------------------------------+--------------------------------------------+----------------+-------------+------------+-----------------+----------------+----------------------------------+---------------+----------+-------------+--------------+-------------------+-----------------------+-----------+
|id |name     |email                         |phone_number |sex   |gender                          |biography                                   |address_zip_code|address_state|address_city|address_longitude|address_latitude|job_title                         |job_description|job_type  |job_area     |model         |type               |vehicle                |zodiacSign |
+---+---------+------------------------------+-------------+------+--------------------------------+--------------------------------------------+----------------+-------------+------------+-----------------+----------------+--------------------------------

# デルタテーブルにデータを書き込む
ローカル環境で`saveAsTable`を実行するとエラーが発生する。<br>
原因は現在調査中。<br>
そのため、代わりに`save`でパスを指定してデータを書き込んでいる。

In [5]:
# Write the exploded frame to the Delta table path, replacing any existing
# data and partitioning the output by `address_state`.
(
    df_flat_explode.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("address_state")
    .save(delta_table_path)
)

                                                                                

# デルタテーブルを読み込む

In [6]:
# Publish the table name as a Spark conf value so the SQL text below can
# reference it through ${...} variable substitution.
spark.conf.set('dq.val.delta_table_name', delta_table_name)

# Load the Delta table from its path and expose it to SQL as a temp view.
df = spark.read.format("delta").load(delta_table_path)
df.createOrReplaceTempView(delta_table_name)

# Select every column explicitly, ordered by numeric customer id.
ordered_rows_sql = """
    SELECT
      id,
      name,
      email,
      phone_number,
      sex,
      gender,
      biography,
      address_zip_code,
      address_state,
      address_city,
      address_longitude,
      address_latitude,
      job_title,
      job_description,
      job_type,
      job_area,
      model,
      type,
      vehicle,
      zodiacSign
    FROM ${dq.val.delta_table_name}
    ORDER BY CAST(id AS BIGINT) ASC
  """

# Print up to 100 rows of the query result.
spark.sql(ordered_rows_sql).show(100)

+---+-----------+--------------------+-------------+------+--------------------+--------------------+----------------+-------------+------------+-----------------+----------------+--------------------+---------------+--------------+--------------+--------------+-------------------+--------------------+-----------+
| id|       name|               email| phone_number|   sex|              gender|           biography|address_zip_code|address_state|address_city|address_longitude|address_latitude|           job_title|job_description|      job_type|      job_area|         model|               type|             vehicle| zodiacSign|
+---+-----------+--------------------+-------------+------+--------------------+--------------------+----------------+-------------+------------+-----------------+----------------+--------------------+---------------+--------------+--------------+--------------+-------------------+--------------------+-----------+
|  1|    林 キヨ|or4j6j_kdkkep@yah...|080-6370-1059|fem