### Convert CSV to Delta format

In [0]:
# Load the October 2019 CSV export from the Unity Catalog volume.
# The first row is treated as the header and column types are inferred
# by Spark rather than declared explicitly.
df1 = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

In [0]:
# Load the November 2019 CSV export from the Unity Catalog volume.
# The first row is treated as the header and column types are inferred
# by Spark rather than declared explicitly.
df2 = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# Make sure the volume that receives the Delta output exists.
# IF NOT EXISTS keeps this cell safe to re-run.
create_volume_ddl = """
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta
"""
spark.sql(create_volume_ddl)

In [0]:
# Persist df1 as a Delta table at the events1 path, replacing any
# data already stored there.
df1.write.save(
    '/Volumes/workspace/ecommerce/delta/events1',
    format='delta',
    mode='overwrite',
)

In [0]:
# Persist df2 as a Delta table at the events2 path, replacing any
# data already stored there.
df2.write.save(
    '/Volumes/workspace/ecommerce/delta/events2',
    format='delta',
    mode='overwrite',
)

### Create Delta tables (SQL and PySpark)

In [0]:
# Expose df1 to SQL under the session-scoped view name "df1".
df1.createOrReplaceTempView(name="df1")

In [0]:
%sql
-- Materialise the df1 temp view as a managed Delta table.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails
-- with "table already exists" the second time the notebook is executed.
CREATE OR REPLACE TABLE sales_delta_
USING DELTA
AS
SELECT *
FROM df1;


In [0]:
# Register df1 as the Delta table sales_delta_oct, replacing any
# existing contents of that table.
df1.write.saveAsTable(
    "sales_delta_oct",
    format="delta",
    mode="overwrite",
)


In [0]:
# Register the November data as the Delta table sales_delta_nov, replacing
# any existing contents. The source must be df2 (read from 2019-Nov.csv);
# writing df1 here would store a second copy of the October data under the
# November table name.
df2.write \
  .format("delta") \
  .mode("overwrite") \
  .saveAsTable("sales_delta_nov")


In [0]:
%sql
-- Preview a few rows from a table this notebook actually creates
-- (sales_delta_oct); the notebook never creates a table named ecommerce_delta.
select * from sales_delta_oct limit 5

### Test schema enforcement

In [0]:
# Test schema enforcement
# Append a DataFrame whose columns (x, y, z) do not match an existing Delta
# table and report the resulting error. The target has to be a path that
# already holds a Delta table (events1 is written earlier in this notebook):
# appending to a path with no table simply creates a new one, so no schema
# check would be exercised.
wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
try:
    wrong_schema.write.format("delta").mode("append").save('/Volumes/workspace/ecommerce/delta/events1')
except Exception as e:
    # Broad catch is deliberate: this is a demo cell that only prints the error.
    print(f"Schema enforcement: {e}")
else:
    # Make a silent success visible instead of printing nothing.
    print("Schema enforcement: write unexpectedly succeeded")


### Handle duplicate inserts

In [0]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Keep one row per user_id: the row with the greatest event_time.
# NOTE: rows that tie on event_time for the same user are ranked arbitrarily
# by row_number(), so which of them is kept is not deterministic.
window = Window.partitionBy('user_id').orderBy(F.desc('event_time'))
deduped_df = df1.withColumn('rn', F.row_number().over(window )).where(F.col('rn') == 1).drop('rn')

# Insert only rows that are not already in the table. A plain append would
# re-insert rows that sales_delta_oct already holds (it was populated from
# df1), creating duplicates instead of preventing them. <=> is the null-safe
# equality operator, so NULL keys still match each other.
# NOTE(review): assumes (user_id, event_time) identifies an event - confirm.
deduped_df.createOrReplaceTempView('sales_delta_oct_updates')
spark.sql("""
MERGE INTO sales_delta_oct AS t
USING sales_delta_oct_updates AS s
ON t.user_id <=> s.user_id AND t.event_time <=> s.event_time
WHEN NOT MATCHED THEN INSERT *
""")

In [0]:
# Print the first five rows of the deduplicated DataFrame.
deduped_df.show(n=5)

In [0]:
# Show the five rows of sales_delta_oct with the greatest event_time,
# printing column values in full rather than truncated.
latest_events_query = """
SELECT *
FROM sales_delta_oct
ORDER BY event_time DESC
LIMIT 5
"""
latest_events = spark.sql(latest_events_query)
latest_events.show(truncate=False)