In [7]:
# One-time setup, only needed if delta-spark is not installed in this kernel's environment:
# %pip install delta-spark==1.1.0

In [8]:
# Notebook display configuration - optional, purely cosmetic.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Widen the notebook's `.container` element to 100% of the page.
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd

# Let pandas show up to 100 characters per column before truncating.
pd.set_option('display.max_colwidth', 100)

In [5]:
import pyspark
# Import the names this notebook uses explicitly rather than `from delta import *`,
# so it is obvious where `DeltaTable` and `configure_spark_with_delta_pip` come from.
from delta import DeltaTable, configure_spark_with_delta_pip


# Session builder with Delta Lake enabled: registers the Delta SQL extension and
# points the `spark_catalog` catalog at the Delta catalog implementation.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wires the pip-installed Delta package into the
# builder before the session is created (or an existing one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()



:: loading settings :: url = jar:file:/usr/local/spark-3.2.0-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d61c2f75-e9e8-4b86-8302-854c157f10db;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 234ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-core_2.12;1.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      d

In [6]:
# Load the training CSV into a Spark DataFrame; the file's first line
# supplies the column names.
df = spark.read.csv('../data/train.csv', header=True)

# Total number of rows
df.count()

# Preview the first five rows
df.show(5)

                                                                                

+----------+-----+----+-----+
|      date|store|item|sales|
+----------+-----+----+-----+
|2013-01-01|    1|   1|   13|
|2013-01-02|    1|   1|   11|
|2013-01-03|    1|   1|   14|
|2013-01-04|    1|   1|   13|
|2013-01-05|    1|   1|   10|
+----------+-----+----+-----+
only showing top 5 rows



In [9]:
# Restart the kernel, then read the data back in from the Delta lake
delta_df = spark.read.load("../lake", format="delta")

type(delta_df)

delta_df.count()

                                                                                

pyspark.sql.dataframe.DataFrame

                                                                                

958000

In [10]:
# Open the Delta table stored at ../lake so that its metadata can be inspected
deltaTable = DeltaTable.forPath(sparkSession=spark, path="../lake")

In [11]:
# Show the table's commit history (version, timestamp, operation, ...) as a pandas DataFrame
deltaTable.history().toPandas()

21/12/24 12:05:14 WARN DeltaHistoryManager: Found Delta commit 2 with a timestamp 1640106233377 which is greater than the next commit timestamp 1640106233377.


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,3,2021-12-21 17:03:53.378,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,2.0,,True,"{'numOutputRows': '45000', 'numOutputBytes': '198755', 'numFiles': '1'}",,
1,2,2021-12-21 17:03:53.377,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,,False,"{'numOutputRows': '913000', 'numOutputBytes': '1773875', 'numFiles': '4'}",,
2,1,2021-12-21 17:03:53.376,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,,False,"{'numOutputRows': '913000', 'numOutputBytes': '1776421', 'numFiles': '4'}",,
3,0,2021-12-21 17:03:53.375,,,WRITE,"{'mode': 'ErrorIfExists', 'partitionBy': '[]'}",,,,,,True,"{'numOutputRows': '913000', 'numOutputBytes': '1773875', 'numFiles': '4'}",,


In [12]:
# List the lake directory: the _delta_log folder plus all the parquet data
# files related to the writes shown in the table history
!ls ../lake

_delta_log
part-00000-457fb9cf-fd52-423d-9af9-c2922d0c85fe-c000.snappy.parquet
part-00000-59d2f1cc-14fb-43a9-a949-8c595261cc50-c000.snappy.parquet
part-00000-bb692ea7-87a1-4af0-b7f3-95189f10bdcf-c000.snappy.parquet
part-00000-f8a68c06-8d60-40a7-9b02-7286f33a2c03-c000.snappy.parquet
part-00001-225c2da3-6d10-4561-a7eb-66d51c4e0720-c000.snappy.parquet
part-00001-a6373976-cac8-45d4-ab3d-120c40bd7222-c000.snappy.parquet
part-00001-e56cef05-cf8a-4784-b764-260aa709f1e0-c000.snappy.parquet
part-00002-2236d358-9a8e-4d96-bb24-38caecae18d6-c000.snappy.parquet
part-00002-712810b1-3c68-4e17-9536-246c20c10d4b-c000.snappy.parquet
part-00002-f1bf05c6-8570-4fda-84f7-e33d320e910e-c000.snappy.parquet
part-00003-376097a8-e45b-4d0e-a3fd-7940325b6c65-c000.snappy.parquet
part-00003-7557dfdc-4c47-42d3-9801-4c80cafa6c21-c000.snappy.parquet
part-00003-e92961a8-b7a4-4463-ae9a-6f9e146cbc80-c000.snappy.parquet


In [13]:
# Restart the kernel; we can now read the table as of whichever version we are interested in.
old_table = spark.read.format("delta").options(versionAsOf=0).load("../lake")
# Report (row count, column count) for this version
print(f"({old_table.count()}, {len(old_table.columns)})")
old_table.limit(5).toPandas()

                                                                                

(913000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [14]:
# Read the Delta table as of version 1 and report (row count, column count)
new_table = spark.read.format("delta").options(versionAsOf=1).load("../lake")
print(f"({new_table.count()}, {len(new_table.columns)})")
new_table.limit(5).toPandas()

                                                                                

(913000, 5)


Unnamed: 0,date,store,item,sales,col_n
0,2013-01-01,1,1,13,2.0
1,2013-01-02,1,1,11,2.0
2,2013-01-03,1,1,14,2.0
3,2013-01-04,1,1,13,2.0
4,2013-01-05,1,1,10,2.0
