### You have a dataset of sales transactions with columns store_id, product_id, quantity_sold, and price_per_unit. Write a query to calculate the total revenue (quantity sold * price per unit) per store.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum



# Obtain the active SparkSession, or start one if none exists yet.
# Some managed notebook environments inject a ready-made `spark` variable;
# getOrCreate() returns that same session there, and creates a new one on a
# plain fresh kernel so this cell does not fail with a NameError.
spark = SparkSession.builder.getOrCreate()

# Sample data: one tuple per sales transaction, with fields in the same order
# as `columns` below (store_id, product_id, quantity_sold, price_per_unit).
data = [
    (1, 101, 30, 25.00),
    (1, 102, 20, 50.00),
    (2, 101, 40, 25.00),
    (2, 103, 10, 75.00),
    (3, 102, 15, 50.00),
    (3, 104, 50, 60.00),
    (1, 103, 10, 75.00),
    (2, 104, 20, 60.00),
    (3, 101, 25, 25.00),
    (3, 102, 30, 50.00)
]

# Column names only (not a full schema): Spark infers each column's type
# from the Python values in `data`.
columns = ["store_id", "product_id", "quantity_sold", "price_per_unit"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()


+--------+----------+-------------+--------------+
|store_id|product_id|quantity_sold|price_per_unit|
+--------+----------+-------------+--------------+
|       1|       101|           30|          25.0|
|       1|       102|           20|          50.0|
|       2|       101|           40|          25.0|
|       2|       103|           10|          75.0|
|       3|       102|           15|          50.0|
|       3|       104|           50|          60.0|
|       1|       103|           10|          75.0|
|       2|       104|           20|          60.0|
|       3|       101|           25|          25.0|
|       3|       102|           30|          50.0|
+--------+----------+-------------+--------------+



In [0]:
# Calculate total revenue per store.
# Step 1: per-transaction revenue = units sold * unit price.
df_with_revenue = df.withColumn("revenue", col("quantity_sold") * col("price_per_unit"))

# Step 2: sum the per-transaction revenue within each store.
# groupBy/agg gives no guarantee about row order, so sort by store_id to keep
# the displayed result stable from run to run.
total_revenue_per_store = (
    df_with_revenue
    .groupBy("store_id")
    .agg(_sum("revenue").alias("total_revenue"))
    .orderBy("store_id")
)

# Show the result
total_revenue_per_store.show()
