### You have a dataset of customer purchases with columns `customer_id`, `purchase_date`, and `amount_spent`. Write a query to find the total amount spent and the average amount spent per customer.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum, avg

# Sample data: one (customer_id, purchase_date, amount_spent) tuple per purchase.
# purchase_date is given as an ISO-formatted string.
data = [
    (1, "2024-01-05", 100.0),
    (1, "2024-02-10", 150.0),
    (2, "2024-01-15", 200.0),
    (2, "2024-03-12", 100.0),
    (3, "2024-02-20", 300.0),
    (3, "2024-03-25", 250.0),
    (1, "2024-04-01", 200.0),
    (2, "2024-04-15", 150.0)
]

# Column names, applied positionally to the fields of each tuple
columns = ["customer_id", "purchase_date", "amount_spent"]

# Obtain the session explicitly so the notebook also runs on a fresh kernel
# where `spark` is not pre-defined by the environment; getOrCreate() reuses
# the already-active session when one exists.
spark = SparkSession.builder.getOrCreate()

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Show the sample DataFrame
df.show()


+-----------+-------------+------------+
|customer_id|purchase_date|amount_spent|
+-----------+-------------+------------+
|          1|   2024-01-05|       100.0|
|          1|   2024-02-10|       150.0|
|          2|   2024-01-15|       200.0|
|          2|   2024-03-12|       100.0|
|          3|   2024-02-20|       300.0|
|          3|   2024-03-25|       250.0|
|          1|   2024-04-01|       200.0|
|          2|   2024-04-15|       150.0|
+-----------+-------------+------------+



In [0]:
# Aggregate total and average amount spent per customer:
# one output row per distinct customer_id.
agg_df = df.groupBy("customer_id").agg(
    _sum("amount_spent").alias("total_spent"),  # _sum is pyspark's sum, aliased at import
    avg("amount_spent").alias("avg_spent")
)

# groupBy gives no guarantee about row order, so sort by customer_id when
# displaying to keep the printed table stable from run to run.
agg_df.orderBy("customer_id").show()


+-----------+-----------+---------+
|customer_id|total_spent|avg_spent|
+-----------+-----------+---------+
|          1|      450.0|    150.0|
|          2|      450.0|    150.0|
|          3|      550.0|    275.0|
+-----------+-----------+---------+

