### Reading Ecommerce Dataset

In [0]:
# Load the e-commerce CSV data from the /Volumes path. The header row supplies
# the column names and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/ecommerce/ecommerce_data")
)

In [0]:
# Preview the first rows of the loaded DataFrame as a plain-text table.
df.show()

In [0]:
# Total number of rows loaded from the CSV source (before any de-duplication).
df.count()

### Converting to Delta Format

In [0]:
# Save the loaded DataFrame as a Delta table, replacing whatever the table
# currently holds.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.ecommerce_data")
)


In [0]:
# Render the CSV-loaded DataFrame (not the Delta table) with the notebook's
# rich table viewer. NOTE(review): display() is presumably the Databricks
# runtime helper, not a stock PySpark method; confirm the target environment.
df.display()

In [0]:
# Schema-enforcement demo: try to append a DataFrame whose columns
# (col1, col2, col3) are not the columns of the e-commerce table and confirm
# that the write is rejected.
#
# Only AnalysisException is treated as "enforcement working". Unrelated
# failures (permissions, connectivity, a typo in the table name, ...) now
# propagate instead of being reported as a successful demonstration.
from pyspark.sql.utils import AnalysisException

# Built outside the try block so that a failure while constructing the test
# DataFrame is not mistaken for a rejected write.
wrong_schema_df = spark.createDataFrame(
    [("x", "y", "z")],
    ["col1", "col2", "col3"]
)

try:
    wrong_schema_df.write \
        .format("delta") \
        .mode("append") \
        .saveAsTable("workspace.ecommerce.ecommerce_data")
except AnalysisException as e:
    print("Schema enforcement working:", e)
else:
    # The append went through, so the mismatching row is now in the table.
    # Fail loudly rather than let the demo pass silently.
    raise RuntimeError(
        "Schema enforcement did NOT reject the mismatching append to "
        "workspace.ecommerce.ecommerce_data"
    )


In [0]:
# Read the Delta table back by name and display its current contents, e.g. to
# confirm that the wrong-schema append attempted earlier left it unchanged.
spark.table("workspace.ecommerce.ecommerce_data").display()

In [0]:
# Session-level probe: count the rows per user_session and display the
# sessions that appear more than once.
#
# The functions module is imported under the alias F instead of with
# `import *`, which would shadow Python builtins such as sum, min, max and
# round in the notebook namespace.
from pyspark.sql import functions as F

(
    df.groupBy("user_session")
    .agg(F.count("*").alias("cnt"))
    .filter("cnt > 1")
    .display()
)

In [0]:
# List the column names of the loaded DataFrame (useful for picking the
# columns that make up a composite key).
df.columns

### Composite Key to check for duplicates

In [0]:
# Group on the composite key (session, timestamp, event type, product) and
# display every key combination that occurs more than once in the raw data.
(
    df.groupBy("user_session", "event_time", "event_type", "product_id")
    .count()
    .filter("count > 1")
    .display()
)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number


In [0]:
# Window over the composite key used for de-duplication. An ordering is given
# so that row_number() can be applied; because event_time is also one of the
# partition columns, every row in a partition shares the same value and the
# ordering does not pick a particular row among the duplicates.
window_spec = (
    Window
    .partitionBy("user_session", "event_time", "event_type", "product_id")
    .orderBy("event_time")
)


### Assign row numbers


In [0]:
# Number the rows inside each composite-key partition; "rn" is 1 for the first
# row of a group and greater than 1 for its duplicates.
df_with_rn = df.withColumn("rn", row_number().over(window_spec))


In [0]:
# Keep one row per composite key (the row numbered 1), then drop the helper
# column so the schema matches the input again.
dedup_df = (
    df_with_rn
    .where("rn = 1")
    .drop("rn")
)


### Verify Duplicates are gone

In [0]:
# Re-run the composite-key duplicate check on the de-duplicated data; an empty
# result means no key combination occurs more than once.
(
    dedup_df.groupBy("user_session", "event_time", "event_type", "product_id")
    .count()
    .filter("count > 1")
    .display()
)


### Store cleaned data back to Delta table

In [0]:
# Replace the Delta table's contents with the de-duplicated data. The
# overwriteSchema option is set so the table schema is replaced along with
# the data.
(
    dedup_df.write
    .format("delta")
    .mode("overwrite")
    .options(overwriteSchema="true")
    .saveAsTable("workspace.ecommerce.ecommerce_data")
)
