In [0]:
#Case Study: Sales Data Analysis
#Problem Statement
#You are given a dataset of sales transactions. Your task is to:

'''
1. Load the dataset into a PySpark DataFrame.

2. Perform basic transformations:
Filter sales greater than a specified threshold.
Group sales data by region and calculate the total revenue.
Find the top 3 regions with the highest sales revenue.

3.Display the results.'''

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, desc
df = spark.sql("SELECT * FROM sales_data")
df.show()

+--------------+------+----------+--------+------+
|transaction_id|region|   product|quantity| price|
+--------------+------+----------+--------+------+
|          T001| North|    Laptop|       2|1000.0|
|          T002| South|Smartphone|       5| 500.0|
|          T003|  East|    Tablet|       3| 300.0|
|          T004| North|Smartphone|       4| 400.0|
|          T005| South|    Laptop|       1|1200.0|
|          T006|  West|    Tablet|       2| 350.0|
|          T007| North|Smartphone|       6| 400.0|
|          T008| South|    Laptop|       3|1100.0|
|          T009|  East|Smartphone|       4| 450.0|
|          T010|  West|    Laptop|       1|1300.0|
+--------------+------+----------+--------+------+



In [0]:
sales_df = df.withColumn("total_revenue", col("quantity") * col("price"))
#sales_df.show()
# threshold , 1000
rajat = 1000
sales_df2 = sales_df.filter(col("total_revenue") > rajat)
sales_df2.show()


+--------------+------+----------+--------+------+-------------+
|transaction_id|region|   product|quantity| price|total_revenue|
+--------------+------+----------+--------+------+-------------+
|          T001| North|    Laptop|       2|1000.0|       2000.0|
|          T002| South|Smartphone|       5| 500.0|       2500.0|
|          T004| North|Smartphone|       4| 400.0|       1600.0|
|          T005| South|    Laptop|       1|1200.0|       1200.0|
|          T007| North|Smartphone|       6| 400.0|       2400.0|
|          T008| South|    Laptop|       3|1100.0|       3300.0|
|          T009|  East|Smartphone|       4| 450.0|       1800.0|
|          T010|  West|    Laptop|       1|1300.0|       1300.0|
+--------------+------+----------+--------+------+-------------+



In [0]:
sales_df3 = sales_df.groupBy("region").agg(sum("total_revenue").alias("revenue_asper_region"))
sales_df3.show()

+------+--------------------+
|region|revenue_asper_region|
+------+--------------------+
| South|              7000.0|
|  East|              2700.0|
|  West|              2000.0|
| North|              6000.0|
+------+--------------------+



In [0]:
top_3_df = sales_df3.orderBy(desc("revenue_asper_region")).limit(2)
top_3_df.show()

+------+--------------------+
|region|revenue_asper_region|
+------+--------------------+
| South|              7000.0|
| North|              6000.0|
+------+--------------------+

