In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg, desc, when

# Start a Spark session for this notebook, or attach to one that already exists.
spark = (
    SparkSession.builder
    .appName("ProductSalesAnalysis")
    .getOrCreate()
)


In [3]:

# Small in-memory sample of order lines, written to disk so Spark can load it as a CSV file.
csv_data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North
"""

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, which need not be the UTF-8 that Spark's CSV reader assumes by default.
with open("/content/sales.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

print("sales.csv file created in /content/")



sales.csv file created in /content/


In [4]:
# Declare the column types up front instead of using inferSchema=True: inference
# makes Spark scan the file an extra time and lets the types drift with the data.
# The types below are the ones inference produced for this file.
# NOTE(review): with a fixed schema, a value that does not parse as its declared
# type is read as null under the reader's default (permissive) mode rather than
# widening the column to string - confirm that is acceptable for real inputs.
sales_schema = (
    "OrderID INT, Product STRING, Category STRING, "
    "Quantity INT, UnitPrice INT, Region STRING"
)
df = spark.read.csv("/content/sales.csv", header=True, schema=sales_schema)

# Preview the first rows and confirm the column types.
df.show(5)
df.printSchema()


+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows

root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)



In [5]:
# Value of each order line: units sold times the price per unit.
line_total = col("Quantity") * col("UnitPrice")
df = df.withColumn("TotalPrice", line_total)
df.show()


+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



In [6]:
# Grand total of TotalPrice over every order, shown as a one-row table.
total_revenue = _sum(col("TotalPrice")).alias("TotalRevenue")
df.agg(total_revenue).show()


+------------+
|TotalRevenue|
+------------+
|      150400|
+------------+



In [7]:
# Revenue per product category, highest-earning category first.
revenue_by_category = df.groupBy("Category").agg(
    _sum("TotalPrice").alias("CategoryRevenue")
)
revenue_by_category.orderBy(col("CategoryRevenue").desc()).show()


+-----------+---------------+
|   Category|CategoryRevenue|
+-----------+---------------+
|Electronics|         132500|
|   Footwear|           8000|
|Accessories|           6000|
|    Apparel|           3900|
+-----------+---------------+



In [8]:
# Region with the most orders. This ranks by number of order rows, not by revenue.
orders_per_region = df.groupBy("Region").count()
orders_per_region.orderBy(col("count").desc()).show(1)


+------+-----+
|Region|count|
+------+-----+
| North|    3|
+------+-----+
only showing top 1 row



In [9]:
# Mean UnitPrice per category, averaged over order rows (not weighted by Quantity).
avg_unit_price = avg(col("UnitPrice")).alias("AvgUnitPrice")
df.groupBy("Category").agg(avg_unit_price).show()


+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|    Apparel|       850.0|
|Electronics|     28125.0|
|   Footwear|      2000.0|
|Accessories|      3000.0|
+-----------+------------+



In [10]:
# Orders worth strictly more than 30000; an order of exactly 30000 is excluded.
exceeds_30000 = col("TotalPrice") > 30000
df.where(exceeds_30000).show()


+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



In [11]:
# Flag each order "Yes" when its TotalPrice is above 20000, otherwise "No".
is_high_value = col("TotalPrice") > 20000
high_value_flag = when(is_high_value, "Yes").otherwise("No")
df = df.withColumn("HighValueOrder", high_value_flag)
df.show()


+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+----------+--------------+

In [12]:
# High-value orders placed in the North region (both conditions must hold).
(
    df.where(col("HighValueOrder") == "Yes")
    .where(col("Region") == "North")
    .show()
)


+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+



In [13]:
# Number of high-value orders in each region; regions with none do not appear.
high_value_orders = df.filter(col("HighValueOrder") == "Yes")
high_value_orders.groupBy("Region").count().show()


+------+-----+
|Region|count|
+------+-----+
| South|    1|
|  West|    1|
| North|    1|
+------+-----+



In [15]:

# Export the high-value orders as CSV with a header row, replacing any earlier export.
# Spark creates a directory at this path holding part files, not one single CSV file.
output_path = "/content/high_value_orders.csv"
high_value_rows = df.filter(col("HighValueOrder") == "Yes")
high_value_rows.write.mode("overwrite").option("header", True).csv(output_path)

print("High value orders saved at:", output_path)


High value orders saved at: /content/high_value_orders.csv
