In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, sum, window, desc

# Build the notebook's SparkSession, or reuse the active one if it already exists
spark = (
    SparkSession.builder
    .appName("Online Retail Cleaning")
    .getOrCreate()
)


In [0]:
# 1. Ingestion: read the two Online Retail Delta tables into DataFrames
#    (path names suggest the 2009-10 and 2010-11 periods)
file_path1 = "dbfs:/user/hive/warehouse/online_retail_data_2009_10"
file_path2 = "dbfs:/user/hive/warehouse/online_retail_data_2010_11"

# Load each Delta location with the same reader settings; df1 <- file_path1, df2 <- file_path2
df1, df2 = (
    spark.read.format("delta").load(table_path)
    for table_path in (file_path1, file_path2)
)


In [0]:
# Preview up to 10 rows of the first table (df1)
df1.show(n=10)

+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|     InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|01-12-2009 07:45| 6.95|      13085|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|01-12-2009 07:45| 6.75|      13085|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|01-12-2009 07:45| 6.75|      13085|United Kingdom|
| 489434|    22041|RECORD FRAME 7" S...|      48|01-12-2009 07:45|  2.1|      13085|United Kingdom|
| 489434|    21232|STRAWBERRY CERAMI...|      24|01-12-2009 07:45| 1.25|      13085|United Kingdom|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|01-12-2009 07:45| 1.65|      13085|United Kingdom|
| 489434|    21871| SAVE THE PLANET MUG|      24|01-12-2009 07:45| 1.25|      13085|United Kingdom|


In [0]:
# Preview up to 10 rows of the second table (df2)
df2.show(n=10)

+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|     InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
| 536365|   85123A|WHITE HANGING HEA...|       6|01-12-2010 08:26| 2.55|      17850|United Kingdom|
| 536365|    71053| WHITE METAL LANTERN|       6|01-12-2010 08:26| 3.39|      17850|United Kingdom|
| 536365|   84406B|CREAM CUPID HEART...|       8|01-12-2010 08:26| 2.75|      17850|United Kingdom|
| 536365|   84029G|KNITTED UNION FLA...|       6|01-12-2010 08:26| 3.39|      17850|United Kingdom|
| 536365|   84029E|RED WOOLLY HOTTIE...|       6|01-12-2010 08:26| 3.39|      17850|United Kingdom|
| 536365|    22752|SET 7 BABUSHKA NE...|       2|01-12-2010 08:26| 7.65|      17850|United Kingdom|
| 536365|    21730|GLASS STAR FROSTE...|       6|01-12-2010 08:26| 4.25|      17850|United Kingdom|


In [0]:
# CLEANING
# Stack the two tables, matching columns by name rather than by position
combined_df = df1.unionByName(df2)

# Keep only rows where every key column is populated, then collapse
# rows that are identical across all columns into a single row
cleaned_df = (
    combined_df
    .na.drop(subset=["Invoice", "StockCode", "InvoiceDate", "Customer ID", "Quantity"])
    .distinct()
)

In [0]:
# Preview up to 15 rows of the cleaned data (no ordering is applied here)
cleaned_df.show(n=15)

+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|     InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+----------------+-----+-----------+--------------+
| 503063|    47566|       PARTY BUNTING|       5|29-03-2010 17:17| 4.65|      17664|United Kingdom|
| 503063|    21495|         SKULLS WRAP|      25|29-03-2010 17:17| 0.42|      17664|United Kingdom|
| 503063|    22084|PAPER CHAIN KIT E...|       6|29-03-2010 17:17| 2.95|      17664|United Kingdom|
| 503063|    21380|WOODEN HAPPY BIRT...|       6|29-03-2010 17:17| 2.95|      17664|United Kingdom|
| 503063|    20974|12 PENCILS SMALL ...|      24|29-03-2010 17:17| 0.65|      17664|United Kingdom|
| 503063|    20829|GLITTER HANGING B...|       8|29-03-2010 17:17|  2.1|      17664|United Kingdom|
| 503063|    20973|12 PENCIL SMALL T...|      24|29-03-2010 17:17| 0.65|      17664|United Kingdom|
