In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Copy the product JSON from the workspace filesystem into DBFS.
dbutils.fs.cp(
    "file:/Workspace/Shared/product_data.json",
    "dbfs:/FileStore/product_data.json",
)

# Load the copied file with the "multiline" JSON reader option enabled.
json_reader = spark.read.option("multiline", "true")
product_df = json_reader.json("/FileStore/product_data.json")

# Preview up to the first 10 rows.
product_df.show(10)

# --- Data cleaning -------------------------------------------------------
# Keep only rows whose Stock is at least 30 (drops rows with Stock < 30).
has_enough_stock = col("Stock") >= 30
product_df_cleaned = product_df.filter(has_enough_stock)

# From the cleaned rows, keep the Electronics category only.
is_electronics = col("Category") == "Electronics"
df_electronics = product_df_cleaned.filter(is_electronics)
df_electronics.show()

# --- Data aggregation ----------------------------------------------------
# NOTE: both aggregates below are computed on the unfiltered `product_df`,
# not on `product_df_cleaned`, so low-stock rows are included.

# Total stock across all "Furniture" products.
is_furniture = col("Category") == "Furniture"
total_furniture_stock = F.sum("Stock").alias("TotalFurnitureStock")
df_furniture = product_df.filter(is_furniture).agg(total_furniture_stock)
df_furniture.show()

# Average price over every product in the dataset.
average_price = F.avg("Price").alias("AveragePrice")
df_avg = product_df.agg(average_price)
df_avg.show()

# --- Output --------------------------------------------------------------
# Save the cleaned Electronics rows in JSON format, replacing any existing
# output at the target path.
electronics_writer = df_electronics.write.format("json").mode("overwrite")
electronics_writer.save("file:/Workspace/Shared/cleaned_electronics.json")




+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+

+-------------------+
|TotalFurnitureStock|
+-------------------+
|                 85|
+-------------------+

+------------+
|AveragePrice|
+------------+
|       560.0|
+------------+

