In [0]:
import pandas as pd

# Sample data
data = {
    "OrderID": [1, 2, 3, 4],
    "OrderDate": ["2024-01-01 10:00:00", "2024-01-02 11:00:00", "2024-01-03 12:00:00", "2024-01-04 13:00:00"],
    "CustomerID": ["C001", "C002", "C003", "C004"],
    "Product": ["ProductA", "ProductB", "ProductC", "ProductD"],
    "Quantity": [10, 20, 15, 5],
    "Price": [100.0, 200.0, 150.0, 50.0]
}

df = pd.DataFrame(data)

# Save to a local path first
local_csv_path = "/tmp/sales2_data.csv"
local_parquet_path = "/tmp/sales_data.parquet"

# Save to CSV and Parquet locally
df.to_csv(local_csv_path, index=False)
df.to_parquet(local_parquet_path, index=False)

# Now, move the files to DBFS
dbutils.fs.mv(f"file:{local_csv_path}", "dbfs:/tmp/sales2_data.csv")
dbutils.fs.mv(f"file:{local_parquet_path}", "dbfs:/tmp/sales_data.parquet")

print("Files successfully moved to DBFS.")


In [0]:
# Initialize SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

spark = SparkSession.builder.appName("DelatExample").getOrCreate()

# Load data from CSV
df_sales = spark.read.format("csv").option("header","true").load("dbfs:/tmp/sales2_data.csv")

# Transform the data
df_transformed = df_sales.withColumn("TotalAmount", col("Quantity").cast("int") * col("Price").cast("double"))

# Write transformed data to delta table
delta_table_path = '/delta/sales_data'
df_transformed.write.format("delta").mode("overwrite").save(delta_table_path)

print("Delta table created")


Delta table created


In [0]:
import dlt

@dlt.table
def sales_data():
    df = spark.read.format("delta").load(delta_table_path)
    return df.select(
        col("OrderID"),
        col("OrderDate"),
        col("CustomerID"),
        col("Product"),
        col("Quantity"),
        col("Price"),
        (col("Quantity").cast("int") * col("Price").cast("double")).alias("TotalAmount")
    )

print("Delta live table created")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1461438442376509>, line 3[0m
[1;32m      1[0m [38;5;28;01mimport[39;00m [38;5;21;01mdlt[39;00m
[0;32m----> 3[0m [38;5;129m@dlt[39m[38;5;241m.[39mtable
[1;32m      4[0m [38;5;28;01mdef[39;00m [38;5;21msales_data[39m():
[1;32m      5[0m     df [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39mload(delta_table_path)
[1;32m      6[0m     [38;5;28;01mreturn[39;00m df[38;5;241m.[39mselect(
[1;32m      7[0m         col([38;5;124m"[39m[38;5;124mOrderID[39m[38;5;124m"[39m),
[1;32m      8[0m         col([38;5;124m"[39m[38;5;124mOrderDate[39m[38;5;124m"[39m),
[0;32m   (...)[0m
[1;32m     13[0m         (col([38;5;124m"[39m[38;5;124mQuantity[39m[38;5;124m"[39m)[38;

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("DeltaOperationsSimpleExample") \
    .getOrCreate()

# Define Delta table path
delta_table_path = "/delta/simple_data"

# Define initial sample data
initial_data = [
    (1, 100),
    (2, 200),
    (3, 300)
]

# Define schema
schema = ["ID", "Value"]

# Create DataFrame for initial data
df_initial = spark.createDataFrame(initial_data, schema=schema)

# Write DataFrame to Delta table
df_initial.write.format("delta").mode("overwrite").save(delta_table_path)

print("Initial Delta table created and data written successfully.")

# Define new sample data
new_sample_data = [
    (2, 250),  # Existing ID with updated Value
    (4, 400)   # New ID
]

# Create DataFrame for new data
df_new = spark.createDataFrame(new_sample_data, schema=schema)

# Write the new data to Delta table in append mode
df_new.write.format("delta").mode("append").save(delta_table_path)

print("New data appended to Delta table successfully.")

# Create a temporary view for SQL operations
df_new.createOrReplaceTempView("new_data")

# Perform the merge operation
print("Merging new data into Delta table...")

spark.sql(f"""
MERGE INTO delta.`{delta_table_path}` AS target
USING new_data AS source
ON target.ID = source.ID
WHEN MATCHED THEN UPDATE SET
    target.Value = source.Value
WHEN NOT MATCHED THEN INSERT (
    ID,
    Value
) VALUES (
    source.ID,
    source.Value
)
""")

print("Data merged successfully.")

# Delta operations - History, Time Travel, and Vacuum
print("Viewing Delta table history...")
history_df = spark.sql(f"DESCRIBE HISTORY delta.`{delta_table_path}`")
history_df.show(truncate=False)

print("Querying Delta table as of version 0...")
df_time_travel = spark.read.format("delta").option("versionAsOf", 0).load(delta_table_path)
df_time_travel.show(truncate=False)

print("Vacuuming old files...")
spark.sql(f"VACUUM delta.`{delta_table_path}` RETAIN 168 HOURS")

print("Delta operations completed.")


Initial Delta table created and data written successfully.
New data appended to Delta table successfully.
Merging new data into Delta table...
Data merged successfully.
Viewing Delta table history...
+-------+-------------------+----------------+---------------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------