In [0]:
# Evaluate the `spark` object as the cell's last expression so the notebook displays it.
# NOTE(review): `spark` is not created in any cell shown here — presumably the
# notebook runtime pre-defines the session; confirm before running elsewhere.
spark

In [0]:
# Load the customers and orders CSVs. The header row supplies the column
# names and Spark infers each column's type from the data.
df_customers = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(r"file:/Workspace/Shared/customers.csv")
)
df_orders = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(r"file:/Workspace/Shared/orders.csv")
)

# Preview both frames: customers first, then orders.
df_customers.show()
df_orders.show()

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       104|  Mouse|       3| 1000|2024-04-01|
|   1005|       105|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+



In [0]:
#Spark Tasks
from pyspark.sql.functions import col, avg
# 1. Ingest the CSV files into two DataFrames.
#    Both reads treat the first row as the header and infer column types.
df_customers = spark.read.csv(
    r"file:/Workspace/Shared/customers.csv",
    header=True,
    inferSchema=True,
)
df_orders = spark.read.csv(
    r"file:/Workspace/Shared/orders.csv",
    header=True,
    inferSchema=True,
)

# Show the loaded rows: customers first, then orders.
df_customers.show()
df_orders.show()

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       104|  Mouse|       3| 1000|2024-04-01|
|   1005|       105|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+



In [0]:
# 2. Print the schema (column names, types, nullability) of each DataFrame,
#    customers first and then orders.
for frame in (df_customers, df_orders):
    frame.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [0]:
# 3. Add a TotalAmount column: each order's Quantity multiplied by its Price.
#    The result is stored back in df_orders, which later cells read.
df_orders = df_orders.withColumn(
    "TotalAmount",
    col("Quantity") * col("Price"),
)
df_orders.show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       104|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       105|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 4. Inner-join orders with customers on their shared CustomerID column,
#    so each order row also carries the customer's Name, City and Age.
df_joined = df_orders.join(
    df_customers,
    on="CustomerID",
    how="inner",
)
df_joined.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       104|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Kabir|Hyderabad| 30|
|       105|   1005|Monitor|       1|12000|2024-04-25|      12000| Zoya|  Chennai| 25|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# 5. Keep only the joined orders whose TotalAmount is strictly above 20000.
#    `where` is the DataFrame alias of `filter`.
df_filtered = df_joined.where(col("TotalAmount") > 20000)
df_filtered.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



In [0]:
# 6. Show customers who placed more than 1 order
from pyspark.sql.functions import countDistinct, count

# Count the OrderID values recorded for each CustomerID.
df_order_counts = (
    df_orders
    .groupBy("CustomerID")
    .agg(count("OrderID").alias("OrderCount"))
)
df_order_counts.show()

# Keep customers with an OrderCount above 1, then attach their names for display.
# An empty second table simply means no customer has more than one order.
df_multiple_orders = df_order_counts.filter(col("OrderCount") > 1)
(
    df_multiple_orders
    .join(df_customers, "CustomerID")
    .select("CustomerID", "Name", "OrderCount")
    .show()
)

+----------+----------+
|CustomerID|OrderCount|
+----------+----------+
|       101|         1|
|       103|         1|
|       102|         1|
|       105|         1|
|       104|         1|
+----------+----------+

+----------+----+----------+
|CustomerID|Name|OrderCount|
+----------+----+----------+
+----------+----+----------+



In [0]:
# 7. Average order value per City: group the joined rows by City and take
#    the mean of TotalAmount, exposed as AvgOrderValue.
df_avg_city = (
    df_joined
    .groupBy("City")
    .agg(avg("TotalAmount").alias("AvgOrderValue"))
)
df_avg_city.show()

+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|  Chennai|      12000.0|
|   Mumbai|      70000.0|
|    Delhi|      50000.0|
|Hyderabad|       3000.0|
+---------+-------------+



In [0]:
# 8. Sort the joined orders by OrderDate, most recent order first.
df_sorted = df_joined.orderBy(col("OrderDate"), ascending=False)
df_sorted.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       105|   1005|Monitor|       1|12000|2024-04-25|      12000| Zoya|  Chennai| 25|
|       104|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Kabir|Hyderabad| 30|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# 9. Persist the joined result as Parquet, one partition directory per City.
#    "overwrite" mode replaces whatever already exists at the target path.
(
    df_joined.write
    .partitionBy("City")
    .mode("overwrite")
    .parquet("dbfs:/FileStore/output/orders_parquet")
)

In [0]:
# 10. Create a temporary view
# Registers df_joined under the name "orders_view", replacing any existing
# temporary view of that name, so the joined data can be referenced by name
# in SQL (e.g. via spark.sql).
df_joined.createOrReplaceTempView("orders_view")