In [0]:
# Load data: read the sample CSV, taking column names from the header row and
# letting Spark infer the column types.
events = spark.read.csv("/path/to/sample.csv", header=True, inferSchema=True)

# Basic operations
# First 10 rows of a few columns.
events.select("event_type", "product_name", "price").show(10)

# A bare count() in the middle of a cell is computed and then thrown away,
# so capture the value and print it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# Five most frequent brands. The DataFrame is lazy until an action runs,
# so show() is what actually produces the result.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

In [0]:
# Read the October 2019 events CSV from the ecommerce volume; column names
# come from the header row and column types are inferred.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)

# Basic operations
# First 10 rows of the event/category/price columns.
events.select("event_type", "category_id", "category_code", "price").show(10)

# A bare count() in the middle of a cell is computed and then thrown away,
# so capture the value and print it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# The 10 most expensive events.
events.orderBy("price", ascending=False).show(10)

# Five most frequent brands. The DataFrame is lazy until an action runs,
# so show() is what actually produces the result.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

In [0]:
%python

# Export the current events DataFrame as CSV with a header row, replacing any
# previous export at the same location.
# NOTE(review): Spark's CSV writer produces a directory of part files, so the
# ".csv" suffix here names a folder rather than a single file.
(
    events.write
    .mode("overwrite")
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct-export.csv")
)

In [0]:
%python
# Collapse the DataFrame to a single partition before writing it as CSV with
# a header row, overwriting any earlier export in that directory.
(
    events.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct-export")
)

In [0]:
# Read the November 2019 events CSV from the ecommerce volume; column names
# come from the header row and column types are inferred.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)

# Basic operations
# First 10 rows of the event/category/price columns.
events.select("event_type", "category_id", "category_code", "price").show(10)

# A bare count() in the middle of a cell is computed and then thrown away,
# so capture the value and print it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# The 10 most expensive events.
events.orderBy("price", ascending=False).show(10)

# Five most frequent brands. The DataFrame is lazy until an action runs,
# so show() is what actually produces the result.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

# Export as CSV with a header row from a single partition, replacing any
# previous export in the target directory.
events.coalesce(1).write.mode("overwrite").option("header", "true").csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov-export")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read the November 2019 events CSV from the ecommerce volume; column names
# come from the header row and column types are inferred.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)

# Basic operations
events.select("event_type", "category_id", "category_code", "price").show(10)

# A bare count() in the middle of a cell is computed and then thrown away,
# so capture the value and print it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

events.groupBy("event_type").count().show()
events.orderBy("price", ascending=False).show(10)

# Five most frequent brands; show() materializes the lazy DataFrame.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

# Top 5 products by revenue
# Group by product_name only when the loaded file actually has that column:
# the other cells that read this dataset never select it, and grouping by a
# missing column would abort the whole cell.
product_keys = [c for c in ("product_id", "product_name") if c in events.columns]
revenue = (
    events.filter(F.col("event_type") == "purchase")
    .groupBy(*product_keys)
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.desc("revenue"))
    .limit(5)
)
display(revenue)

# Running total per user
# withColumn returns a new DataFrame, so the result has to be kept; calling
# it as a bare statement leaves `events` unchanged and computes nothing.
window = Window.partitionBy("user_id").orderBy("event_time")
events_running = events.withColumn("cumulative_events", F.count("*").over(window))
events_running.select("user_id", "event_time", "event_type", "cumulative_events").show(10)

# Conversion rate by category: purchases as a percentage of views.
# The rate is left NULL for categories with no views, so the division is never
# evaluated with a zero denominator (which raises when ANSI mode is enabled).
conversion = events.groupBy("category_code").agg(
    F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchase"),
    F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("view")
).withColumn(
    "conversion_rate",
    F.when(F.col("view") > 0, (F.col("purchase") / F.col("view")) * 100)
)
display(conversion)

In [0]:
from pyspark.sql import functions as F

# Add two columns derived from `price` in a single chained expression:
#   discounted_price - the price with 10% taken off (price * 0.9)
#   is_high_price    - boolean flag, True when price is greater than 100
events = (
    events
    .withColumn("discounted_price", F.col("price") * 0.9)
    .withColumn("is_high_price", F.col("price") > 100)
)

# Render the enriched DataFrame in the notebook.
display(events)

In [0]:
# Persist the events DataFrame in Delta format at a fixed path, replacing
# whatever an earlier run left there.
events.write.format("delta").mode("overwrite").save("/delta/events")

# Create managed table
# With no explicit mode the writer errors as soon as the table exists, so the
# cell could only ever succeed once; overwrite keeps it re-runnable.
events.write.format("delta").mode("overwrite").saveAsTable("events_table")

# SQL approach
# CREATE OR REPLACE for the same reason: a plain CREATE TABLE fails on re-run.
spark.sql("""
    CREATE OR REPLACE TABLE events_delta
    USING DELTA
    AS SELECT * FROM events_table
""")

# Test schema enforcement
# Appending a DataFrame whose columns (x, y, z) do not match the table is
# expected to be rejected; the error is caught and printed as the demo output.
try:
    wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
    wrong_schema.write.format("delta").mode("append").save("/delta/events")
except Exception as e:
    print(f"Schema enforcement: {e}")


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read the November 2019 events CSV from the ecommerce volume; column names
# come from the header row and column types are inferred.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)

# Persist the events in Delta format under the `delta` volume, replacing any
# previous contents at that path.
events.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/delta/events")

# Create managed table
# With no explicit mode the writer errors as soon as the table exists, so the
# cell could only ever succeed once; overwrite keeps it re-runnable.
events.write.format("delta").mode("overwrite").saveAsTable("workspace.ecommerce.events_table")

# SQL approach: build events_delta from workspace.ecommerce.events_table.
# CREATE OR REPLACE keeps the statement re-runnable.
# NOTE(review): events_delta is unqualified, so it is created in the session's
# current catalog/schema - confirm that is the intended location.
spark.sql("""
    CREATE OR REPLACE TABLE events_delta
    USING DELTA
    AS SELECT * FROM workspace.ecommerce.events_table
""")

# Test schema enforcement
# Appending a DataFrame whose columns (x, y, z) do not match the table is
# expected to be rejected; the error is caught and printed as the demo output.
try:
    wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
    wrong_schema.write.format("delta").mode("append").save("/Volumes/workspace/ecommerce/delta/events")
except Exception as e:
    print(f"Schema enforcement: {e}")


In [0]:
%sql
-- Create a managed volume in your schema.
-- CREATE VOLUME has no IN CATALOG / IN SCHEMA clauses; the target catalog and
-- schema are given by the three-level volume name instead.
-- IF NOT EXISTS keeps the cell re-runnable once the volume is there.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta
COMMENT 'Delta volume for ecommerce events';

In [0]:
%sql
-- Create the `delta` managed volume in workspace.ecommerce.
-- CREATE VOLUME does not accept an IN <schema> clause; the schema is selected
-- by qualifying the volume name. IF NOT EXISTS keeps the cell re-runnable.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta
COMMENT 'Delta volume for ecommerce events';

In [0]:
%sql
-- Create the `delta` managed volume in workspace.ecommerce.
-- IF NOT EXISTS makes the statement a no-op when the volume already exists,
-- so re-running the notebook does not stop here with an "already exists" error.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta
COMMENT 'Delta volume for ecommerce events';

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read the November 2019 events CSV from the ecommerce volume; column names
# come from the header row and column types are inferred.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)

# Persist the events in Delta format under the `delta` volume, replacing any
# previous contents at that path.
events.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/delta/events")

# Create managed table
# With no explicit mode the writer errors as soon as the table exists, so the
# cell could only ever succeed once; overwrite keeps it re-runnable.
events.write.format("delta").mode("overwrite").saveAsTable("workspace.ecommerce.events_table")

# SQL approach: build events_delta from workspace.ecommerce.events_table.
# CREATE OR REPLACE keeps the statement re-runnable.
# NOTE(review): events_delta is unqualified, so it is created in the session's
# current catalog/schema - confirm that is the intended location.
spark.sql("""
    CREATE OR REPLACE TABLE events_delta
    USING DELTA
    AS SELECT * FROM workspace.ecommerce.events_table
""")

# Test schema enforcement
# Appending a DataFrame whose columns (x, y, z) do not match the table is
# expected to be rejected; the error is caught and printed as the demo output.
try:
    wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
    wrong_schema.write.format("delta").mode("append").save("/Volumes/workspace/ecommerce/delta/events")
except Exception as e:
    print(f"Schema enforcement: {e}")

In [0]:
from delta.tables import DeltaTable

# Location of the Delta table and of the CSV holding the incremental rows.
delta_path = "/Volumes/workspace/ecommerce/delta/events"
csv_path = "/Volumes/workspace/ecommerce/delta/new_data.csv"

# MERGE for incremental updates
deltaTable = DeltaTable.forPath(spark, delta_path)
updates = spark.read.csv(csv_path, header=True, inferSchema=True)

# Upsert: rows matching on (user_session, event_time) are updated in full,
# all other incoming rows are inserted.
deltaTable.alias("t").merge(
    updates.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll() \
 .whenNotMatchedInsertAll() \
 .execute()

# Time travel
# Read the table as of its first version, and as of a fixed timestamp.
v0 = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
# NOTE(review): the timestamp is hard-coded; it presumably has to fall within
# the table's commit history for this read to succeed - confirm or adjust.
yesterday = spark.read.format("delta") \
    .option("timestampAsOf", "2024-01-01").load(delta_path)

# Optimize
# Run the maintenance on the table that was just merged, through the same
# path-based handle. No cell registers a table named workspace.ecommerce.events,
# so addressing it by that name does not reach the data at delta_path.
deltaTable.optimize().executeZOrderBy("event_type", "user_id")
# Retention is given in hours (168 hours = 7 days).
deltaTable.vacuum(168)

In [0]:
from delta.tables import DeltaTable

# Location of the Delta table and of the CSV holding the incremental rows.
delta_path = "/Volumes/workspace/ecommerce/delta/events"
csv_path = "/Volumes/workspace/ecommerce/delta/new_data.csv"  # Upload your CSV here

# Handle on the existing Delta table plus the incoming rows to merge into it.
deltaTable = DeltaTable.forPath(spark, delta_path)
updates = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True
)

# Upsert: rows matching on (user_session, event_time) are updated in full,
# all other incoming rows are inserted.
deltaTable.alias("t").merge(
    updates.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll()\
.whenNotMatchedInsertAll()\
.execute()

# Time travel: the table as of its first version, and as of a fixed timestamp.
v0 = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
# NOTE(review): the timestamp is hard-coded; it presumably has to fall within
# the table's commit history for this read to succeed - confirm or adjust.
yesterday = spark.read.format("delta")\
    .option("timestampAsOf", "2024-01-01").load(delta_path)

# Run the maintenance on the table that was just merged, through the same
# path-based handle. No cell registers a table named workspace.ecommerce.events,
# so addressing it by that name does not reach the data at delta_path.
deltaTable.optimize().executeZOrderBy("event_type", "user_id")
# Retention is given in hours (168 hours = 7 days).
deltaTable.vacuum(168)

In [0]:
# Persist the events in Delta format under the `delta` volume, replacing any
# previous contents at that path.
events.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/delta/events")

# Create managed table
# With no explicit mode the writer errors as soon as the table exists, so the
# cell could only ever succeed once; overwrite keeps it re-runnable.
events.write.format("delta").mode("overwrite").saveAsTable("workspace.ecommerce.csv_path")

# SQL approach: build events_csv from workspace.ecommerce.csv_path.
# CREATE OR REPLACE keeps the statement re-runnable.
# NOTE(review): events_csv is unqualified, so it is created in the session's
# current catalog/schema - confirm that is the intended location.
spark.sql("""
    CREATE OR REPLACE TABLE events_csv
    USING DELTA
    AS SELECT * FROM workspace.ecommerce.csv_path
""")

In [0]:
# Write the events DataFrame in Delta format to the Unity Catalog volume path,
# replacing any previous contents.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/delta/events")
)

# Managed table: overwritten when it already exists.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.csv_path")
)

# SQL route: a CREATE OR REPLACE ... AS SELECT that copies the managed table
# into workspace.ecommerce.events_csv.
spark.sql("""
    CREATE OR REPLACE TABLE workspace.ecommerce.events_csv
    USING DELTA
    AS SELECT * FROM workspace.ecommerce.csv_path
""")

In [0]:
from delta.tables import DeltaTable

import os
import shutil

# Location of the Delta table and of the CSV holding the incremental rows.
delta_path = "/Volumes/workspace/ecommerce/delta/events"
csv_path = "/Volumes/workspace/ecommerce/delta/new_data.csv"  # Upload your CSV here

# Stage the CSV into the volume. PUT ... INTO is a client-side statement for
# SQL connectors/drivers and cannot be issued through spark.sql() in a
# notebook, so copy the file instead. The copy is skipped when the local
# staging file is absent, which leaves a manually uploaded CSV untouched.
local_csv_path = "/tmp/new_data.csv"
if os.path.exists(local_csv_path):
    shutil.copyfile(local_csv_path, csv_path)

# Handle on the existing Delta table plus the incoming rows to merge into it.
deltaTable = DeltaTable.forPath(spark, delta_path)
updates = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True
)

# Upsert: rows matching on (user_session, event_time) are updated in full,
# all other incoming rows are inserted.
deltaTable.alias("t").merge(
    updates.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll()\
.whenNotMatchedInsertAll()\
.execute()

# Time travel: the table as of its first version, and as of a fixed timestamp.
v0 = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
# NOTE(review): the timestamp is hard-coded; it presumably has to fall within
# the table's commit history for this read to succeed - confirm or adjust.
yesterday = spark.read.format("delta")\
    .option("timestampAsOf", "2024-01-01").load(delta_path)

# Run the maintenance on the table that was just merged, through the same
# path-based handle. No cell registers a table named workspace.ecommerce.events,
# so addressing it by that name does not reach the data at delta_path.
deltaTable.optimize().executeZOrderBy("event_type", "user_id")
# Retention is given in hours (168 hours = 7 days).
deltaTable.vacuum(168)