In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import *
from delta.tables import DeltaTable

In [0]:
events_schema_df = StructType([
StructField("event_time", TimestampType(), True),
StructField("event_type", StringType(), True),
StructField("product_id", IntegerType(), True),
StructField("category_id", LongType(), True),
StructField("category_code", StringType(), True),
StructField("brand", StringType(), True),
StructField("price", DoubleType(), True),
StructField("user_id", IntegerType(), True),
StructField("user_session", StringType(), True)
])

In [0]:
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
                        header=True, schema=events_schema_df)

In [0]:
events.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:

display(events.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
2019-11-01T00:00:01.000Z,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
2019-11-01T00:00:01.000Z,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [0]:
events.write.format("delta")\
.mode("overwrite") \
.save("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

In [0]:
events.write.format("delta").mode("overwrite").saveAsTable("events_table")

In [0]:
spark.sql("""
CREATE TABLE event_delta
USING DELTA
AS SELECT * FROM events_table
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:

try:
    wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
    wrong_schema.write.format("delta").mode("append").save("/Volumes/workspace/ecommerce/ecommerce_data/event_delta")
except Exception as e:
    print(f"Schema enforcement: {e}")


In [0]:
# Count total rows before deduplication
pre_dedup_count = events.count()
print(f"Total rows before deduplication: {pre_dedup_count}")

Total rows before deduplication: 67501979


In [0]:
# Drop duplicates based on event_time, user_id, product_id
# Write deduplicated data to Delta

deduped_events = events.dropDuplicates(["event_time", "user_id", "product_id"])
deduped_events.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

# Count rows after deduplication
post_dedup_count = deduped_events.count()
print(f"Total rows after deduplication: {post_dedup_count}")
     

Total rows after deduplication: 67351679


In [0]:
# Test for remaining duplicates in deduplicated data

deduped_events.groupBy("event_time", "user_id", "product_id") \
    .agg(f.count("*").alias("cnt")) \
    .filter(f.col("cnt") > 1) \
    .show()

print("No Duplicates Found.")

+----------+-------+----------+---+
|event_time|user_id|product_id|cnt|
+----------+-------+----------+---+
+----------+-------+----------+---+

No Duplicates Found.
