## Delta with PySpark

In [2]:
# Build (or reuse) a local SparkSession configured for Delta Lake:
#   - spark.jars.packages pulls in the delta-core package
#   - the Delta SQL extension and Delta catalog are registered
#   - managed tables live under the relative "spark-warehouse" directory,
#     with Hive support enabled

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Delta with PySpark")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)

# Show the session summary as the cell output
spark

In [3]:
# Prerequisite (install once, outside this notebook): pip install sparksql-magic
# Load the extension so the %%sparksql cell magic used by the SQL cells is available.
%load_ext sparksql_magic

In [47]:
%%sparksql
-- List the tables registered in the default database
SHOW TABLES IN default;

0,1,2
namespace,tableName,isTemporary
default,sales_managed,False


In [48]:
# Let's read the Sales dataset from its Parquet files and inspect it

df_sales = spark.read.parquet("dataset/sales.parquet/*parquet")

# Schema first, then the first 10 rows without truncating long values
df_sales.printSchema()
df_sales.show(n=10, truncate=False)

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- city_id: string (nullable = true)

+------------------------+----------+-----------+---------------------------------------------+-----------------------+----------+
|transacted_at           |trx_id    |retailer_id|description                                  |amount                 |city_id   |
+------------------------+----------+-----------+---------------------------------------------+-----------------------+----------+
|2017-11-24T19:00:00.000Z|1995601912|2077350195 |Walgreen       11-25                         |197.230000000000000000 |216510442 |
|2017-11-24T19:00:00.000Z|1734117021|644879053  |unkn    ppd id: 768641     11-26             |8.580000000000000000   |930259917 |
|2017-11-24T19:00:00.000Z|1734117022|847200066  |Wal-Mart  ppd id: 555914     Algiers    11-2

In [49]:
# Let's create the managed Delta table `sales_delta_managed` from the sales data
from pyspark.sql.functions import to_timestamp, expr

# Every column is read as a string, so give the timestamp and amount
# columns proper types before persisting.
df_formatted = (
    df_sales
    .withColumn("transacted_at", to_timestamp("transacted_at"))
    .withColumn("amount", expr("CAST(amount as decimal(14,2))"))
)

# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises an error when the table already exists (e.g. on
# Restart & Run All after a previous run).
(
    df_formatted.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sales_delta_managed")
)

In [50]:
%%sparksql
-- Inspect the columns and table details of the Delta table
DESCRIBE EXTENDED default.sales_delta_managed;

0,1,2
col_name,data_type,comment
transacted_at,timestamp,
trx_id,string,
retailer_id,string,
description,string,
amount,"decimal(14,2)",
city_id,string,
,,
# Partitioning,,
Not partitioned,,


In [51]:
%%sparksql
-- Preview up to 10 rows of the Delta table
SELECT *
FROM default.sales_delta_managed
LIMIT 10;

0,1,2,3,4,5
transacted_at,trx_id,retailer_id,description,amount,city_id
2017-11-24 19:00:00,1995601912,2077350195,Walgreen 11-25,197.23,216510442
2017-11-24 19:00:00,1734117021,644879053,unkn ppd id: 768641 11-26,8.58,930259917
2017-11-24 19:00:00,1734117022,847200066,Wal-Mart ppd id: 555914 Algiers 11-26,1737.26,1646415505
2017-11-24 19:00:00,1734117030,1953761884,Home Depot ppd id: 265293 11-25,384.50,287177635
2017-11-24 19:00:00,1734117089,1898522855,Target 11-25,66.33,1855530529
2017-11-24 19:00:00,1734117117,997626433,Sears ppd id: 856095 Ashgabat,298.87,957346984
2017-11-24 19:00:00,1734117123,1953761884,unkn ppd id: 153174 Little Rock 11-25,19.55,45522086
2017-11-24 19:00:00,1734117152,1429095612,Ikea arc id: 527956 Saint John's 11-26,9.39,1268541279
2017-11-24 19:00:00,1734117153,847200066,unkn Kingstown,2907.57,1483931123


In [52]:
# Let's look at the version history of the table

from delta import DeltaTable

# `dt` is a handle to the table, looked up by name
dt = DeltaTable.forName(spark, "sales_delta_managed")

(
    dt.history()
    .select("version", "timestamp")
    .show(truncate=False)
)

+-------+-----------------------+
|version|timestamp              |
+-------+-----------------------+
|0      |2022-11-14 10:32:59.751|
+-------+-----------------------+



In [53]:
%%sparksql
-- Change the amount of a single transaction, matched by its trx_id
UPDATE default.sales_delta_managed
SET amount = 450.56
WHERE trx_id = '1995601912';

0
num_affected_rows
1


In [54]:
# Let's check the table history again, now that the UPDATE has run

(
    dt.history()
    .select("version", "timestamp")
    .show(truncate=False)
)

+-------+-----------------------+
|version|timestamp              |
+-------+-----------------------+
|1      |2022-11-14 10:33:34.513|
|0      |2022-11-14 10:32:59.751|
+-------+-----------------------+



In [55]:
%%sparksql
-- Preview up to 10 rows again to see the updated amount
SELECT *
FROM default.sales_delta_managed
LIMIT 10;

0,1,2,3,4,5
transacted_at,trx_id,retailer_id,description,amount,city_id
2017-11-24 19:00:00,1995601912,2077350195,Walgreen 11-25,450.56,216510442
2017-11-24 19:00:00,1734117021,644879053,unkn ppd id: 768641 11-26,8.58,930259917
2017-11-24 19:00:00,1734117022,847200066,Wal-Mart ppd id: 555914 Algiers 11-26,1737.26,1646415505
2017-11-24 19:00:00,1734117030,1953761884,Home Depot ppd id: 265293 11-25,384.50,287177635
2017-11-24 19:00:00,1734117089,1898522855,Target 11-25,66.33,1855530529
2017-11-24 19:00:00,1734117117,997626433,Sears ppd id: 856095 Ashgabat,298.87,957346984
2017-11-24 19:00:00,1734117123,1953761884,unkn ppd id: 153174 Little Rock 11-25,19.55,45522086
2017-11-24 19:00:00,1734117152,1429095612,Ikea arc id: 527956 Saint John's 11-26,9.39,1268541279
2017-11-24 19:00:00,1734117153,847200066,unkn Kingstown,2907.57,1483931123


In [56]:
# Check, for each table location, whether it is a Delta table

for table_path in (
    "spark-warehouse/sales_managed/",
    "spark-warehouse/sales_delta_managed/",
):
    print(DeltaTable.isDeltaTable(spark, table_path))

False
True


In [58]:
# Shortcut: convert an existing Parquet location into a Delta table.
# Here we convert the files behind the sales_managed table.

sales_managed_identifier = "parquet.`spark-warehouse/sales_managed`"
DeltaTable.convertToDelta(spark, sales_managed_identifier)

<delta.tables.DeltaTable at 0x7fb3dc549810>

In [59]:
# Re-check both table locations after the conversion

for table_path in (
    "spark-warehouse/sales_managed/",
    "spark-warehouse/sales_delta_managed/",
):
    print(DeltaTable.isDeltaTable(spark, table_path))

True
True


In [63]:
%%sparksql
-- Inspect the catalog entry for sales_managed
DESCRIBE EXTENDED default.sales_managed;

only showing top 20 row(s)


0,1,2
col_name,data_type,comment
transacted_at,string,
trx_id,string,
retailer_id,string,
description,string,
amount,string,
city_id,string,
,,
# Detailed Table Information,,
Database,default,


In [66]:
%%sparksql

-- Run the Delta conversion in SQL, addressing the table by its catalog name
CONVERT TO DELTA default.sales_managed;

In [67]:
%%sparksql
-- Describe sales_managed again after CONVERT TO DELTA
DESCRIBE EXTENDED default.sales_managed;

0,1,2
col_name,data_type,comment
transacted_at,string,
trx_id,string,
retailer_id,string,
description,string,
amount,string,
city_id,string,
,,
# Partitioning,,
Not partitioned,,
