## Lab1__03-Use delta tables in Apache Spark

In [None]:
#########################################
# Spark session configuration (%%configure magic command)
# Docs: https://learn.microsoft.com/en-us/fabric/data-engineering/author-execute-notebook#spark-session-configuration-magic-command
#
# Selects the default lakehouse (name, id, workspaceId) for this session and
# requests the starter pool. Each lakehouse field is given as a
# parameterName/defaultValue pair; presumably the caller (e.g. a pipeline)
# can override it by parameter name - TODO confirm.
# NOTE(review): cell magics normally have to be the first line of a cell -
# confirm the notebook runtime tolerates this leading comment banner.
#########################################

%%configure
{
    "defaultLakehouse": {
        "name": {
            "parameterName": "defaultLakehouseName",
            "defaultValue": "lakehouse"
        },
        "id": {
            "parameterName": "defaultLakehouseId",
            "defaultValue": "84dffe93-ada9-402a-a1c0-b841b4294651"
        },
        "workspaceId": {
            "parameterName": "defaultWorkspaceId",
            "defaultValue": "bb3f0b26-c54b-4553-880c-fdd60e3815ec"
        }
    },
    "useStarterPool": true
}


In [None]:
#########################################
# Spark Best Practice
#########################################

# Write Parquet files with V-Order enabled.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")

# Delta Optimize Write. The setting is spelled "optimizeWrite"; the earlier
# "optimizationWrite" spelling is not a recognised key, and because Spark
# accepts unknown session keys without complaint it had no effect.
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")
# 1073741824 == 1024**3 (1 GiB when interpreted as bytes).
spark.conf.set("spark.microsoft.delta.optimizeWrite.binSize", "1073741824")

# Autotune query tuning.
spark.conf.set("spark.ms.autotune.queryTuning.enabled", "true")

# Upper bound on the bytes packed into one partition when reading files (1 GiB).
spark.conf.set("spark.sql.files.maxPartitionBytes", "1073741824")

In [None]:
#########################################
# Download Sample Files
#########################################
# Fetches products.csv from the MicrosoftLearning/dp-data repository and
# stores it under Files/products below DATA_ROOT.

import os
import requests

DATA_ROOT = "/lakehouse/default"
DATA_FOLDER = "Files/products"  # folder with data files
DATA_FILE = "products.csv"  # data file name

remote_url = "https://github.com/MicrosoftLearning/dp-data/raw/main/products.csv"

# Build the target directory once and make sure it exists.
target_dir = os.path.join(DATA_ROOT, DATA_FOLDER)
os.makedirs(target_dir, exist_ok=True)

r = requests.get(remote_url, timeout=30)
# Stop here on an HTTP error (4xx/5xx); otherwise the error page body would be
# saved as products.csv and only surface later as a confusing read failure.
r.raise_for_status()

with open(os.path.join(target_dir, DATA_FILE), 'wb') as f:
    f.write(r.content)

In [None]:
#########################################
# Read csv File
#########################################
# Load the products CSV, using its first row as the column names. No schema or
# inferSchema option is set, so the CSV reader's default column typing applies.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/products/products.csv")
)
display(df)

In [None]:
#########################################
#1. Managed vs External
#########################################
#Create Managed Table
# No path is given, so the table is registered as a managed table.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# an error when managed_products already exists (e.g. the cell is run twice
# before the DROP TABLE cell).
df.write.format("delta").mode("overwrite").saveAsTable("managed_products")

In [None]:
#Create External Table
# An explicit path makes this an external table whose files live under
# Files/external_products in the lakehouse.
# mode("overwrite") keeps the cell re-runnable: dropping an external table
# leaves its files behind, and the default save mode refuses to create a table
# over a location that already holds data.
df.write.format("delta").mode("overwrite").saveAsTable(
    "external_products",
    path="abfss://bb3f0b26-c54b-4553-880c-fdd60e3815ec@onelake.dfs.fabric.microsoft.com/84dffe93-ada9-402a-a1c0-b841b4294651/Files/external_products",
)

In [None]:
%%sql
-- Show the columns and table-level metadata of the managed table.
DESCRIBE TABLE FORMATTED managed_products;

In [None]:
%%sql
-- Show the columns and table-level metadata of the external table.
DESCRIBE TABLE FORMATTED external_products;

In [None]:
%%sql
-- IF EXISTS lets this cell be re-run without failing when a table is already gone.
DROP TABLE IF EXISTS managed_products;
DROP TABLE IF EXISTS external_products; -- only the table metadata is deleted; the data files are not deleted

In [None]:
%%sql
-- #########################################
-- 2. Version
-- #########################################
-- The cell magic is kept on the first line of the cell and the banner is
-- written as SQL comments, so the whole cell body is valid SQL.
-- Register a table named products over the Delta files at Files/external_products.
-- IF NOT EXISTS keeps the cell re-runnable: products is never dropped in this
-- notebook, so a plain CREATE TABLE fails on the second run.
CREATE TABLE IF NOT EXISTS products
USING DELTA
LOCATION 'Files/external_products';

In [None]:
%%sql
-- Return every row and column of the products table.
SELECT *
FROM products;

In [None]:
%%sql
-- Multiply ListPrice by 0.9 for every row in the 'Mountain Bikes' category.
-- NOTE(review): not idempotent - each run of this cell applies the factor again.
UPDATE products
   SET ListPrice = ListPrice * 0.9
 WHERE Category = 'Mountain Bikes';

In [None]:
%%sql
-- List the change history recorded for the products table.
DESCRIBE HISTORY products;

In [None]:
# Location of the Delta table files that the reads below load from.
delta_table_path = 'Files/external_products'

# Read the table as it is now (no version option).
current_data = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
display(current_data)

# Read the same path pinned to version 0 via the versionAsOf option.
original_data = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(delta_table_path)
)
display(original_data)