## Delta table read from Hive

In [1]:
# Create Spark Session with Delta JARS and conf

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Delta table with Column Mapping") \
    .config('spark.jars.packages', 'io.delta:delta-core_2.12:2.1.1') \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    ) \
    .config("spark.sql.warehouse.dir", "spark-warehouse") \
    .master("local[*]") \
    .enableHiveSupport() \
    .getOrCreate()

spark

In [2]:
# pip install sparksql-magic
# Run below command to enable sparksql
%load_ext sparksql_magic

In [17]:
from delta import DeltaTable

dt = DeltaTable.forName(spark, "sales_delta_managed")
dt.history().select("version", "timestamp", "operation", "operationMetrics").show()

+-------+--------------------+--------------------+--------------------+
|version|           timestamp|           operation|    operationMetrics|
+-------+--------------------+--------------------+--------------------+
|      1|2022-11-14 10:33:...|              UPDATE|{numRemovedFiles ...|
|      0|2022-11-14 10:32:...|CREATE OR REPLACE...|{numFiles -> 2, n...|
+-------+--------------------+--------------------+--------------------+



In [29]:
%%sparksql

describe extended default.sales_delta_managed;

0,1,2
col_name,data_type,comment
transacted_at,timestamp,
trx_id,string,
retailer_id,string,
description,string,
amount,"decimal(14,2)",
city_id,string,
,,
# Partitioning,,
Not partitioned,,


In [24]:
%%sparksql

CREATE TABLE default.sales_hive_table
(
    transacted_at timestamp,
    trx_id string,
    retailer_id string,
    description string,
    amount decimal(14,2),
    city string   
)
STORED AS PARQUET
LOCATION "sales_delta_managed/"
;

In [28]:
%%sparksql

describe extended default.sales_hive_table;

only showing top 20 row(s)


0,1,2
col_name,data_type,comment
transacted_at,timestamp,
trx_id,string,
retailer_id,string,
description,string,
amount,"decimal(14,2)",
city,string,
,,
# Detailed Table Information,,
Database,default,


In [26]:
%%sparksql

select count(1) from sales_hive_table;

0
count(1)
8264112


In [27]:
%%sparksql

select count(1) from sales_delta_managed;

0
count(1)
4132056


In [31]:
# Generate the symlink manifest for the delta table
dt.generate("symlink_format_manifest")

In [42]:
# Vaccum the delta table to read from hive
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled","false")
dt.vacuum(0)

DataFrame[]

In [45]:
%%sparksql

REFRESH TABLE sales_hive_table;

In [46]:
%%sparksql


select count(1) from sales_hive_table;

0
count(1)
4132056
