In [0]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Sales PySpark")
    .getOrCreate()
)

# Bare expression so the notebook renders the session object.
spark

In [0]:
from pyspark.sql import Row

# Sample orders. Every order carries a list of line items; each line item is a
# dict with a "Product" name and a "Qty" count.
def _line_item(product, qty):
    """Return one line item as a dict with ``Product`` and ``Qty`` keys."""
    return {"Product": product, "Qty": qty}


data = [
    Row(
        OrderID=101,
        Customer="Ali",
        Items=[_line_item("Laptop", 1), _line_item("Mouse", 2)],
        Region="Asia",
        Amount=1200.0,
    ),
    Row(
        OrderID=102,
        Customer="Zara",
        Items=[_line_item("Tablet", 1)],
        Region="Europe",
        Amount=650.0,
    ),
    Row(
        OrderID=103,
        Customer="Mohan",
        Items=[_line_item("Phone", 2), _line_item("Charger", 1)],
        Region="Asia",
        Amount=890.0,
    ),
    Row(
        OrderID=104,
        Customer="Sara",
        Items=[_line_item("Desk", 1)],
        Region="US",
        Amount=450.0,
    ),
]

# Build the DataFrame from the Row objects (no explicit schema is passed) and
# print it without truncating the wide Items column.
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



In [0]:
# PySpark Exercises – Set 4 (SQL, JSON, Advanced Functions)
# Working with JSON & Nested Fields

# Imports for this cell, grouped at the top. `col` is imported by name because
# later cells call it directly.
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col
from pyspark.sql.types import IntegerType

# 1. Flatten the Items array using explode() to create one row per product.
#    Quantity is cast to an integer inside the same select, so df_flat is
#    bound once with its final schema and the frame shown here is the frame
#    every later step works with.
df_flat = (
    df_sales
    .withColumn("Item", explode("Items"))
    .select(
        "OrderID",
        "Customer",
        "Region",
        "Amount",
        col("Item.Product").alias("Product"),
        col("Item.Qty").cast(IntegerType()).alias("Quantity"),
    )
)
df_flat.show()

# 2. Count total quantity sold per product.
#    The aggregate is aliased inside agg() rather than renaming the
#    auto-generated "sum(Quantity)" column afterwards: withColumnRenamed does
#    nothing when the old name does not match, which would hide a typo.
df_flat.groupBy("Product").agg(F.sum("Quantity").alias("TotalQty")).show()

# 3. Count number of orders per region (one row of df_sales per order).
df_sales.groupBy("Region").count().show()

+-------+--------+------+------+-------+--------+
|OrderID|Customer|Region|Amount|Product|Quantity|
+-------+--------+------+------+-------+--------+
|    101|     Ali|  Asia|1200.0| Laptop|       1|
|    101|     Ali|  Asia|1200.0|  Mouse|       2|
|    102|    Zara|Europe| 650.0| Tablet|       1|
|    103|   Mohan|  Asia| 890.0|  Phone|       2|
|    103|   Mohan|  Asia| 890.0|Charger|       1|
|    104|    Sara|    US| 450.0|   Desk|       1|
+-------+--------+------+------+-------+--------+

+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|  Phone|       2|
|Charger|       1|
|   Desk|       1|
+-------+--------+

+------+-----+
|Region|count|
+------+-----+
|  Asia|    2|
|Europe|    1|
|    US|    1|
+------+-----+



In [0]:
# Conditional columns with when() / otherwise()
from pyspark.sql.functions import when

# 4. HighValueOrder: "Yes" when Amount is above 1000, "No" in every other case.
amount_over_1000 = col("Amount") > 1000
high_value_flag = when(amount_over_1000, "Yes").otherwise("No")
df_sales = df_sales.withColumn("HighValueOrder", high_value_flag)
df_sales.select("OrderID", "Amount", "HighValueOrder").show()

# 5. ShippingZone: Asia → "Zone A", Europe → "Zone B", US → "Zone C".
#    Any region not listed falls through to "Other".
region_col = col("Region")
shipping_zone = (
    when(region_col == "Asia", "Zone A")
    .when(region_col == "Europe", "Zone B")
    .when(region_col == "US", "Zone C")
    .otherwise("Other")
)
df_sales = df_sales.withColumn("ShippingZone", shipping_zone)
df_sales.select("OrderID", "Region", "ShippingZone").show()

+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+

+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



In [0]:
# Temporary views and persisted tables
# 6. Expose df_sales to Spark SQL under the temporary view name sales_view.
df_sales.createOrReplaceTempView("sales_view")

# 7. SQL over the view: per region, the number of orders and the average
#    amount rounded to two decimals.
region_summary = spark.sql("""
    SELECT Region, COUNT(*) AS OrderCount, ROUND(AVG(Amount), 2) AS AvgAmount
    FROM sales_view
    GROUP BY Region
""")
region_summary.show()

# 8. Persist df_sales with saveAsTable(), overwriting any previous copy.
#    Note: saveAsTable() writes a table rather than a view, even though the
#    chosen name ends in "_view".
sales_writer = df_sales.write.mode("overwrite")
sales_writer.saveAsTable("sales_permanent_view")

+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|  Asia|         2|   1045.0|
|Europe|         1|    650.0|
|    US|         1|    450.0|
+------+----------+---------+



In [0]:
# Querying sales_view through spark.sql
# 9. Orders whose Items array holds more than one entry.
multi_item_orders = spark.sql("""
    SELECT * FROM sales_view WHERE size(Items) > 1
""")
multi_item_orders.show(truncate=False)

# 10. Names of customers whose order Amount is above 800.
customers_over_800 = spark.sql("""
    SELECT Customer FROM sales_view WHERE Amount > 800
""")
customers_over_800.show()

+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+
|OrderID|Customer|Items                                                         |Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|Yes           |Zone A      |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |No            |Zone A      |
+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+

+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



In [0]:
# Saving as Parquet and Reading Again
from pyspark.sql import functions as F

# One constant for the output location, so the write and the read below can
# never point at different directories.
SALES_PARQUET_PATH = "/tmp/sales_parquet"

# 11. Save the exploded product-level DataFrame as Parquet, partitioned by
#     Region. mode("overwrite") replaces whatever an earlier run left behind.
df_flat.write.mode("overwrite").partitionBy("Region").parquet(SALES_PARQUET_PATH)

# 12. Read the Parquet data back and total the quantity per product.
#     The aggregate is aliased inside agg() instead of renaming the
#     auto-generated "sum(Quantity)" column, because withColumnRenamed does
#     nothing when the old name does not match.
df_parquet = spark.read.parquet(SALES_PARQUET_PATH)
df_parquet.groupBy("Product").agg(F.sum("Quantity").alias("TotalQty")).show()

+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|       2|
|Charger|       1|
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|   Desk|       1|
+-------+--------+

