In [0]:
# Echo the SparkSession object so the cell output confirms a session is available.
# NOTE(review): `spark` is not created in any visible cell — presumably it is
# injected by the notebook runtime; confirm before running this elsewhere.
spark

<pyspark.sql.connect.session.SparkSession at 0xfff99c38e6d0>

In [0]:
from pyspark.sql.functions import col, sum as _sum

# Load the cleaned orders extract: the first CSV row supplies the column names
# and column types are inferred from the data.
orders_df = spark.read.csv(
    "/Volumes/workspace/csv_data/customersordersinsights/cleaned_orders.csv",
    header=True,
    inferSchema=True,
)

# Preview the loaded rows as a quick sanity check.
orders_df.show()

+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+
|order_id|customer_id|customer_name|order_date|delivery_date|   status|            issue|delay_days|delayed|
+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+
|       1|        101|  Amit Sharma|05-01-2024|   05-05-2024|Delivered|         No issue|       414|      1|
|       2|        102|  Priya Singh|05-03-2024|         NULL|  Delayed|    Late delivery|         0|      0|
|       3|        101|  Amit Sharma|05-10-2024|   05-09-2024|Delivered|         No issue|       291|      1|
|       4|        103|   Ravi Kumar|05-12-2024|         NULL|  Pending|Address not found|         0|      0|
|       5|        104|  Anita Desai|      NULL|         NULL|Delivered|         No issue|         0|      0|
|       6|        102|  Priya Singh|      NULL|         NULL|  Delayed|    Late delivery|         0|      0|
|       7|        1

In [0]:
# Load the customer master data (contact details and Region per customer_id):
# the first CSV row supplies the column names and types are inferred.
customers_df = spark.read.csv(
    "/Volumes/workspace/csv_data/customersordersinsights/customers.csv",
    header=True,
    inferSchema=True,
)

# Preview the loaded rows as a quick sanity check.
customers_df.show()

+-----------+-------------+--------------------+----------+------+
|customer_id|customer_name|               email|     Phone|Region|
+-----------+-------------+--------------------+----------+------+
|        101|  Amit Sharma|amit.sharma@examp...|9876543210| North|
|        102|  Priya Singh|priya.singh@examp...|9123456780| South|
|        103|    Raj Verma|raj.verma@example...|9988776655|  West|
|        104|  Sneha Reddy|sneha.reddy@examp...|9871234567| South|
|        105|    Manoj Das|manoj.das@example...|9123987654|  East|
|        106|Neha Kulkarni|neha.kulkarni@exa...|7894561230|  West|
|        107|  Karan Mehta|karan.mehta@examp...|7001239876| North|
|        108|   Divya Iyer|divya.iyer@exampl...|8080808080| South|
|        109|    Arjun Sen|arjun.sen@example...|9900990099|  East|
|        110|  Meera Joshi|meera.joshi@examp...|8500450060|  West|
+-----------+-------------+--------------------+----------+------+



In [0]:
# Attach the customer attributes (email, Phone, Region) to every order by
# matching on customer_id. This is an inner join, so orders whose customer_id
# has no match in customers_df are dropped.
# NOTE(review): both inputs carry a customer_name column, so the result holds
# two columns with that name; referencing customer_name on joined_df would be
# ambiguous. Decide which source is authoritative before using it downstream.
joined_df = orders_df.join(other=customers_df, on=["customer_id"], how="inner")

joined_df.show()

+-----------+--------+-------------+----------+-------------+---------+-----------------+----------+-------+-------------+--------------------+----------+------+
|customer_id|order_id|customer_name|order_date|delivery_date|   status|            issue|delay_days|delayed|customer_name|               email|     Phone|Region|
+-----------+--------+-------------+----------+-------------+---------+-----------------+----------+-------+-------------+--------------------+----------+------+
|        101|       1|  Amit Sharma|05-01-2024|   05-05-2024|Delivered|         No issue|       414|      1|  Amit Sharma|amit.sharma@examp...|9876543210| North|
|        102|       2|  Priya Singh|05-03-2024|         NULL|  Delayed|    Late delivery|         0|      0|  Priya Singh|priya.singh@examp...|9123456780| South|
|        101|       3|  Amit Sharma|05-10-2024|   05-09-2024|Delivered|         No issue|       291|      1|  Amit Sharma|amit.sharma@examp...|9876543210| North|
|        103|       4|   Rav

In [0]:
# Total the `delayed` flag per customer region and rank the regions from the
# most to the fewest delayed orders.
# NOTE(review): in the displayed orders sample, rows whose status is "Delayed"
# carry delayed = 0, so this total may undercount — confirm how the flag is
# derived upstream.
delay_summary = joined_df.groupBy("region").agg(
    _sum(col("delayed")).alias("total_delayed_orders")
).sort(col("total_delayed_orders").desc())

delay_summary.show()

+------+--------------------+
|region|total_delayed_orders|
+------+--------------------+
| North|                   2|
|  East|                   0|
|  West|                   0|
| South|                   0|
+------+--------------------+



In [0]:
# Persist the per-region delay totals as CSV with a header row.
# coalesce(1) collapses the result to a single partition so the export
# directory contains one part file.
# mode("overwrite") replaces any previous export: the default save mode raises
# an error when the target directory already exists, which made this cell fail
# on every run after the first (breaking Restart & Run All).
(
    delay_summary.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/csv_data/customersordersinsights/output/delays_by_region")
)