In [0]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import from_unixtime

events = spark.read \
  .option("inferSchema", "true") \
  .json("/databricks-datasets/structured-streaming/events/") \
  .withColumn("date", expr("time")) \
  .drop("time") \
  .withColumn("date", from_unixtime("date", 'yyyy-MM-dd'))
  
display(events)

action,date
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28
Close,2016-07-28
Open,2016-07-28
Open,2016-07-28
Close,2016-07-28
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28


In [0]:
events.write.format("delta").mode("overwrite").partitionBy("date").save("/delta/events/")

In [0]:
%fs
ls /delta/events/

path,name,size
dbfs:/delta/events/_delta_log/,_delta_log/,0
dbfs:/delta/events/date=2016-07-24/,date=2016-07-24/,0
dbfs:/delta/events/date=2016-07-25/,date=2016-07-25/,0
dbfs:/delta/events/date=2016-07-26/,date=2016-07-26/,0
dbfs:/delta/events/date=2016-07-27/,date=2016-07-27/,0
dbfs:/delta/events/date=2016-07-28/,date=2016-07-28/,0


In [0]:
events_delta = spark.read.format("delta").load("/delta/events/")

display(events_delta)

action,time,date
Open,,2016-07-26
Open,,2016-07-26
Open,,2016-07-26
Open,,2016-07-26
Open,,2016-07-26
Open,,2016-07-26
Close,,2016-07-26
Open,,2016-07-26
Close,,2016-07-26
Close,,2016-07-26


In [0]:
display(spark.sql("DROP TABLE IF EXISTS events"))

display(spark.sql("CREATE TABLE events USING DELTA LOCATION '/delta/events/'"))

In [0]:
events_delta.count()

In [0]:
from pyspark.sql.functions import count
display(events_delta.groupBy("action","date").agg(count("action").alias("action_count")).orderBy("date", "action"))

action,date,action_count
Close,2016-07-26,20165
Open,2016-07-26,21176
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [0]:
historical_events = spark.read \
  .option("inferSchema", "true") \
  .json("/databricks-datasets/structured-streaming/events/") \
  .withColumn("date", expr("time-172800")) \
  .drop("time") \
  .withColumn("date", from_unixtime("date", 'yyyy-MM-dd'))

In [0]:
historical_events.write.format("delta").mode("append").partitionBy("date").save("/delta/events/")

In [0]:
%fs
ls /delta/events/

path,name,size
dbfs:/delta/events/_delta_log/,_delta_log/,0
dbfs:/delta/events/date=2016-07-24/,date=2016-07-24/,0
dbfs:/delta/events/date=2016-07-25/,date=2016-07-25/,0
dbfs:/delta/events/date=2016-07-26/,date=2016-07-26/,0
dbfs:/delta/events/date=2016-07-27/,date=2016-07-27/,0
dbfs:/delta/events/date=2016-07-28/,date=2016-07-28/,0


In [0]:
display(events_delta.groupBy("action","date").agg(count("action").alias("action_count")).orderBy("date", "action"))

action,date,action_count
Close,2016-07-24,20165
Open,2016-07-24,21176
Close,2016-07-25,24015
Open,2016-07-25,24002
Close,2016-07-26,25985
Open,2016-07-26,25998
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [0]:
events_delta.count()

In [0]:
dbutils.fs.ls("dbfs:/delta/events/date=2016-07-25/")

In [0]:
# Optimizes the layout of Delta Lake data
display(spark.sql("OPTIMIZE events"))

path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 5, null, 0)"


In [0]:
display(spark.sql("DESCRIBE HISTORY events"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
6,2021-07-02T00:19:14.000+0000,4475346586554357,atin.intellipaat@gmail.com,OPTIMIZE,"Map(predicate -> [], zOrderBy -> [], batchId -> 0, auto -> false)",,List(1686769686243328),0628-054729-nylon639,5.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 22, numRemovedBytes -> 37842, p25FileSize -> 6049, minFileSize -> 1977, numAddedFiles -> 5, maxFileSize -> 7300, p75FileSize -> 6883, p50FileSize -> 6883, numAddedBytes -> 29092)",
5,2021-07-02T00:18:48.000+0000,4475346586554357,atin.intellipaat@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [""date""])",,List(1686769686243328),0628-054729-nylon639,4.0,WriteSerializable,True,"Map(numFiles -> 11, numOutputBytes -> 18921, numOutputRows -> 100000)",
4,2021-07-02T00:17:45.000+0000,4475346586554357,atin.intellipaat@gmail.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""date""])",,List(1686769686243328),0628-054729-nylon639,3.0,WriteSerializable,False,"Map(numFiles -> 11, numOutputBytes -> 18921, numOutputRows -> 100000)",
3,2021-07-01T23:50:15.000+0000,4475346586554357,atin.intellipaat@gmail.com,OPTIMIZE,"Map(predicate -> [], zOrderBy -> [], batchId -> 0, auto -> false)",,List(1686769686243328),0628-054729-nylon639,2.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 33, numRemovedBytes -> 533735, p25FileSize -> 11372, minFileSize -> 10307, numAddedFiles -> 5, maxFileSize -> 166638, p75FileSize -> 146403, p50FileSize -> 53959, numAddedBytes -> 388679)",
2,2021-07-01T23:46:37.000+0000,4475346586554357,atin.intellipaat@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [""date""])",,List(1686769686243328),0628-054729-nylon639,1.0,WriteSerializable,True,"Map(numFiles -> 11, numOutputBytes -> 18921, numOutputRows -> 100000)",
1,2021-07-01T23:44:41.000+0000,4475346586554357,atin.intellipaat@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [""date""])",,List(1686769686243328),0628-054729-nylon639,0.0,WriteSerializable,True,"Map(numFiles -> 11, numOutputBytes -> 18921, numOutputRows -> 100000)",
0,2021-07-01T23:42:12.000+0000,4475346586554357,atin.intellipaat@gmail.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""date""])",,List(1686769686243328),0628-054729-nylon639,,WriteSerializable,False,"Map(numFiles -> 11, numOutputBytes -> 495893, numOutputRows -> 100000)",


In [0]:
display(spark.sql("DESCRIBE DETAIL events"))

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,3dcd8240-e48d-4d98-a2b5-36e2973b8279,default.events,,dbfs:/delta/events,2021-07-01T23:42:02.884+0000,2021-07-02T00:19:14.000+0000,List(date),5,29092,Map(),1,2


In [0]:
display(spark.sql("DESCRIBE FORMATTED events"))

col_name,data_type,comment
action,string,
time,bigint,
date,string,
,,
# Partitioning,,
Part 0,date,
,,
# Detailed Table Information,,
Name,default.events,
Location,dbfs:/delta/events,
