**SETUP**

In [0]:
%sql
-- Create the `qa` catalog if it is missing, with its managed storage root.
CREATE CATALOG IF NOT EXISTS qa MANAGED LOCATION
  'abfss://unity-catalog-storage@dbstoragedlwvnba4d5k72.dfs.core.windows.net/2141168493853526';

In [0]:
%sql
-- Create the `dev` catalog if it is missing, with its managed storage root.
CREATE CATALOG IF NOT EXISTS dev MANAGED LOCATION
  'abfss://unity-catalog-storage@dbstoragedlwvnba4d5k72.dfs.core.windows.net/2141168493853526';

In [0]:
%sql
-- One demo schema per catalog (SCHEMA is a synonym for DATABASE in Spark SQL).
CREATE SCHEMA IF NOT EXISTS dev.demo_db;
CREATE SCHEMA IF NOT EXISTS qa.demo_db;

In [0]:
%sql
-- Remove the `default` schema from the dev catalog, including anything inside it.
DROP SCHEMA IF EXISTS dev.default CASCADE;

In [0]:
# Explicit DDL schema for the flight-time JSON files (no schema inference).
flight_schema_ddl = """FL_DATE DATE, OP_CARRIER STRING, OP_CARRIER_FL_NUM INT, ORIGIN STRING, 
          ORIGIN_CITY_NAME STRING, DEST STRING, DEST_CITY_NAME STRING, CRS_DEP_TIME INT, DEP_TIME INT, 
          WHEELS_ON INT, TAXI_IN INT, CRS_ARR_TIME INT, ARR_TIME INT, CANCELLED STRING, DISTANCE INT"""

# Read the JSON files with the schema above; FL_DATE is parsed using the
# "M/d/y" date pattern.
flight_time_df = spark.read.load(
    "abfss://dbfs-container@storage.dfs.core.windows.net/external/flight-time",
    format="json",
    schema=flight_schema_ddl,
    dateFormat="M/d/y",
)

In [0]:
%sql
-- Delta table whose columns mirror the DDL schema used to read the flight-time JSON.
CREATE TABLE IF NOT EXISTS dev.demo_db.flight_time_tbl (
    FL_DATE            DATE,
    OP_CARRIER         STRING,
    OP_CARRIER_FL_NUM  INT,
    ORIGIN             STRING,
    ORIGIN_CITY_NAME   STRING,
    DEST               STRING,
    DEST_CITY_NAME     STRING,
    CRS_DEP_TIME       INT,
    DEP_TIME           INT,
    WHEELS_ON          INT,
    TAXI_IN            INT,
    CRS_ARR_TIME       INT,
    ARR_TIME           INT,
    CANCELLED          STRING,
    DISTANCE           INT
)
USING DELTA

**LOAD DATA INTO DELTA TABLE**

In [0]:
# Append the flight-time rows to the Delta table.
# NOTE(review): mode "append" adds the rows again each time this cell is re-run.
flight_time_df.write.saveAsTable(
    "dev.demo_db.flight_time_tbl",
    format="delta",
    mode="append",
)

In [0]:
# Load the table as a DataFrame and render it in the notebook.
flight_time_tbl = spark.table("dev.demo_db.flight_time_tbl")
flight_time_tbl.display()

**CREATE A DELTA TABLE USING DELTATABLE API**

In [0]:
from delta import DeltaTable

# This cell previously started by reading
# abfss://dbfs-container@storage.dfs.core.windows.net/external/flight-time as Delta.
# That location is only written in Delta format later, in the
# "SAVE DELTA DATA IN EXTERNAL LOCATION" section, so on a fresh run the read
# failed and the table definition below never executed. The same read is still
# performed in the "READ EXTERNAL DATA" section, after the Delta files exist.

# Define (or replace) the table through the DeltaTable builder API.
# NOTE(review): createOrReplace replaces an existing table definition, and the
# definition here adds an `id` column that the earlier CREATE TABLE does not
# have. Rows appended earlier are presumably not visible afterwards; confirm
# that this is the intended demonstration.
(DeltaTable.createOrReplace(spark)
    .tableName("dev.demo_db.flight_time_tbl")
    .addColumn("id", "INT")
    .addColumn("FL_DATE", "DATE")
    .addColumn("OP_CARRIER", "STRING")
    .addColumn("OP_CARRIER_FL_NUM", "INT")
    .addColumn("ORIGIN", "STRING")
    .addColumn("ORIGIN_CITY_NAME", "STRING")
    .addColumn("DEST", "STRING")
    .addColumn("DEST_CITY_NAME", "STRING")
    .addColumn("CRS_DEP_TIME", "INT")
    .addColumn("DEP_TIME", "INT")
    .addColumn("WHEELS_ON", "INT")
    .addColumn("TAXI_IN", "INT")
    .addColumn("CRS_ARR_TIME", "INT")
    .addColumn("ARR_TIME", "INT")
    .addColumn("CANCELLED", "STRING")
    .addColumn("DISTANCE", "INT")
    .execute()
)

In [0]:
%sql
-- Inspect the table produced by the DeltaTable builder.
SELECT *
FROM dev.demo_db.flight_time_tbl

**SAVE DELTA DATA IN EXTERNAL LOCATION**

In [0]:
# Write the flight-time data as a single-partition Delta dataset at the
# external location, replacing any previous Delta contents.
# NOTE(review): this target path is the same path flight_time_df reads its
# JSON input from - confirm that writing over the source location is intended.
flight_time_df.coalesce(1).write.save(
    "abfss://dbfs-container@storage.dfs.core.windows.net/external/flight-time",
    format="delta",
    mode="overwrite",
)

In [0]:
%fs ls abfss://dbfs-container@prashantsa.dfs.core.windows.net/external/flight-time

**READ DELTA TABLE USING SQL AND DATAFRAME API**

In [0]:
%sql
-- Read the managed Delta table with plain SQL.
SELECT *
FROM dev.demo_db.flight_time_tbl

In [0]:
# Same read as the SQL cell, through the DataFrame reader.
display(spark.read.format("delta").table("dev.demo_db.flight_time_tbl"))

**READ EXTERNAL DATA USING DATAFRAME API**

In [0]:
# Load the Delta dataset directly from its external storage path and render it.
display(
    spark.read.load(
        "abfss://dbfs-container@storage.dfs.core.windows.net/external/flight-time",
        format="delta",
    )
)

**DELETE ONE RECORD FROM TABLE USING DELTATABLE API**

In [0]:
from delta import DeltaTable

# Handle to the existing `people` table; reused by the cells that follow.
people_dt = DeltaTable.forName(spark, "dev.demo_db.people")

# Delete every row whose firstName is exactly 'abdul'.
people_dt.delete(condition="firstName = 'abdul'")

**UPDATE ONE RECORD IN TABLE USING DELTATABLE API**

In [0]:
import pyspark.sql.functions as f

# For rows with the given birth date, rewrite both name columns with initcap().
people_dt.update(
    "birthDate = '1975-05-25'",
    {
        "firstName": f.initcap("firstName"),
        "lastName": f.initcap("lastName"),
    },
)

In [0]:
# Show the current contents of the people table.
display(people_dt.toDF())

**WE WILL LOOK INTO MERGE OPERATION LATER**

In [0]:
%sql
-- Upsert people.json into the people table, keyed on id:
--   * matched rows whose target firstName is 'Kailash' are deleted,
--   * all other matched rows get their birthDate refreshed from the source,
--   * unmatched source rows are inserted.
MERGE INTO dev.demo_db.people tgt
USING (
  SELECT id,
         fname AS firstName,
         lname AS lastName,
         dob   AS birthDate
  FROM json.`/mnt/files/dataset_ch7/people.json`
) src
ON tgt.id = src.id
WHEN MATCHED AND tgt.firstName = 'Kailash' THEN
  DELETE
WHEN MATCHED THEN
  UPDATE SET tgt.birthDate = src.birthDate
WHEN NOT MATCHED THEN
  INSERT *

**TIME TRAVEL**

In [0]:
%sql
-- List the table's commit history (versions and timestamps usable for time travel).
DESCRIBE HISTORY dev.demo_db.people

In [0]:
%sql
-- Time travel by version number.
SELECT *
FROM dev.demo_db.people VERSION AS OF 1

In [0]:
%sql
-- Time travel by timestamp.
-- NOTE(review): the literal is hard-coded; presumably it must be replaced with a
-- timestamp taken from this table's own DESCRIBE HISTORY output - confirm.
SELECT *
FROM dev.demo_db.people TIMESTAMP AS OF '2023-12-16T05:12:50Z'

**DELETE THE DELTA TABLE DATA BY MISTAKE**

In [0]:
%sql
-- No WHERE clause: removes every row from the table (the "mistake" being simulated).
DELETE FROM dev.demo_db.people

**ROLL BACK TO RESTORE**

In [0]:
%sql
-- Look up the version/timestamp to roll back to.
DESCRIBE HISTORY dev.demo_db.people

In [0]:
%sql
-- Roll the table back to its state as of the given timestamp.
-- NOTE(review): hard-coded timestamp; presumably it should come from the
-- DESCRIBE HISTORY output of the current run - confirm.
RESTORE TABLE dev.demo_db.people
TO TIMESTAMP AS OF '2023-12-16T05:14:39Z'

**USING DATAFRAME API**

In [0]:
%python
spark.read.option("versionAsOf", "1").table("dev.demo_db.people").display()

In [0]:
%python
spark.read.option("timestampAsOf", "2023-12-16T05:14:40Z").table("dev.demo_db.people").display()

In [0]:
%python
from delta import DeltaTable
people_dt = DeltaTable.forName(spark, "dev.demo_db.people")
people_dt.restoreToVersion(1)

**CONVERT PARQUET TO DELTA**

In [0]:
# NOTE: this import binds `round` to the Spark column function, shadowing
# Python's built-in round() from here on in the notebook.
from pyspark.sql.functions import to_date, to_timestamp, round, year

# Raw fire-calls CSV: header row supplies column names, types are inferred.
raw_df = spark.read.load(
    "/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv",
    format="csv",
    header="true",
    inferSchema="true",
)

# Source column names contain spaces; map each one to a space-free name.
column_renames = {
    "Call Number": "CallNumber",
    "Unit ID": "UnitID",
    "Incident Number": "IncidentNumber",
    "Call Date": "CallDate",
    "Watch Date": "WatchDate",
    "Call Final Disposition": "CallFinalDisposition",
    "Available DtTm": "AvailableDtTm",
    "Zipcode of Incident": "Zipcode",
    "Station Area": "StationArea",
    "Final Priority": "FinalPriority",
    "ALS Unit": "ALSUnit",
    "Call Type Group": "CallTypeGroup",
    "Unit sequence in call dispatch": "UnitSequenceInCallDispatch",
    "Fire Prevention District": "FirePreventionDistrict",
    "Supervisor District": "SupervisorDistrict",
}

renamed_df = raw_df
for spaced_name, compact_name in column_renames.items():
    renamed_df = renamed_df.withColumnRenamed(spaced_name, compact_name)

# Parse the date/timestamp strings, round Delay to 2 decimals, and derive the
# Year column that the parquet output is partitioned by.
staging_df = (
    renamed_df
    .withColumn("CallDate", to_date("CallDate", "MM/dd/yyyy"))
    .withColumn("WatchDate", to_date("WatchDate", "MM/dd/yyyy"))
    .withColumn("AvailableDtTm", to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
    .withColumn("Delay", round("Delay", 2))
    .withColumn("Year", year("CallDate"))
)

# Write plain parquet (not Delta), partitioned by Year, replacing prior output.
fire_calls_writer = (
    staging_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("Year")
)
fire_calls_writer.save("/Volumes/dev/demo_db/files/fire_calls_tbl")

In [0]:
# List the parquet output directory (same listing as the %fs ls shorthand).
display(dbutils.fs.ls("/Volumes/dev/demo_db/files/fire_calls_tbl"))

In [0]:
%sql
-- Convert the partitioned parquet directory to Delta in place; the partition
-- column and its type must be declared explicitly.
CONVERT TO DELTA parquet.`/Volumes/dev/demo_db/files/fire_calls_tbl`
PARTITIONED BY (Year INT)

In [0]:
%sql
-- Show the commit history of the path-based Delta table.
DESCRIBE HISTORY delta.`/Volumes/dev/demo_db/files/fire_calls_tbl`

In [0]:
%sql
-- Show detailed metadata for the table.
-- NOTE(review): catalog `dev3` / schema `demodb` are not used anywhere else in
-- this notebook - confirm this cell belongs here.
DESCRIBE TABLE EXTENDED dev3.demodb.olist_geolocation_dataset

%md
**SCHEMA VALIDATION WHEN INSERTING DATA INTO AN EXISTING TABLE: INSERT, OVERWRITE, MERGE**

Schema Validations
Statements
  1. INSERT
  2. OVERWRITE
  3. MERGE
  4. DataFrame Append

Validation Scenarios

  1. Column matching approach
  2. New Columns
  3. Data Type Mismatch (Not allowed in any case)

Schema Validations Summary

  1. INSERT         - Column matching by position, New columns not allowed
  2. OVERWRITE      - Column matching by position, New columns not allowed
  3. MERGE INSERT     - Column matching by name, New columns ignored
  4. DataFrame Append   - Column matching by name, New columns not allowed
  5. Data Type Mismatch  - Not allowed in any case

#####1. INSERT - Column matching by position (matching names not mandatory)
This has the potential to corrupt your data.

In [0]:
%sql
-- INSERT lines the selected columns up with the target columns by position.
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname,
  lname
FROM json.`/mnt/files/dataset_ch7/people.json`

#####2. INSERT - New columns not allowed

In [0]:
%sql
-- Selects an extra column (dob); per the section heading this INSERT is
-- expected to be rejected.
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname,
  lname,
  dob
FROM json.`/mnt/files/dataset_ch7/people.json`

#####3. OVERWRITE - New columns not allowed

In [0]:
%sql
-- Same extra column (dob) with INSERT OVERWRITE; per the section heading this
-- is expected to be rejected as well.
INSERT OVERWRITE dev.demo_db.people_tbl
SELECT
  id,
  fname,
  lname,
  dob
FROM json.`/mnt/files/dataset_ch7/people.json`

#####4. MERGE - Column matching by name (matching by position not allowed)

In [0]:
%sql
-- Preview the merge source with its original column names.
SELECT
  id,
  fname,
  lname
FROM json.`/mnt/files/dataset_ch7/people_2.json`

In [0]:
%sql
-- INSERT * resolves columns by name; the source keeps fname/lname here, so per
-- the section heading this MERGE is expected to be rejected.
MERGE INTO dev.demo_db.people_tbl tgt
USING (
  SELECT id, fname, lname
  FROM json.`/mnt/files/dataset_ch7/people_2.json`
) src
ON tgt.id = src.id
WHEN NOT MATCHED THEN
  INSERT *

#####5. MERGE - New columns silently ignored

In [0]:
%sql
-- Preview the merge source with name columns aliased, plus the extra dob column.
SELECT
  id,
  fname firstName,
  lname lastName,
  dob
FROM json.`/mnt/files/dataset_ch7/people_2.json`

In [0]:
%sql
-- Source columns are aliased to firstName/lastName and carry an extra dob
-- column; per the section heading the extra column is ignored by INSERT *.
MERGE INTO dev.demo_db.people_tbl tgt
USING (
  SELECT id, fname firstName, lname lastName, dob
  FROM json.`/mnt/files/dataset_ch7/people_2.json`
) src
ON tgt.id = src.id
WHEN NOT MATCHED THEN
  INSERT *

In [0]:
%sql
-- Check the table contents after the merge.
SELECT *
FROM dev.demo_db.people_tbl

#####6. Dataframe append - Column matching by name (matching by position not allowed)

In [0]:
%python
people_schema = "id INT, fname STRING, lname STRING"
people_df =  spark.read.format("json").schema(people_schema).load("/mnt/files/dataset_ch7/people_2.json")
people_df.write.format("delta").mode("append").saveAsTable("dev.demo_db.people_tbl")

#####7. Dataframe append - New columns not allowed

In [0]:
%python
people_schema = "id INT, firstName STRING, lastName STRING, dob STRING"
people_df =  spark.read.format("json").schema(people_schema).load("/mnt/files/dataset_ch7/people_2.json")
people_df.write.format("delta").mode("append").saveAsTable("dev.demo_db.people_tbl")