### You have a dataset of product reviews with columns product_id, rating, and review_date. Some ratings are null. Write a query to calculate the average rating for each product, ignoring null values.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Obtain the active SparkSession, creating one only if none exists yet, so this
# cell does not rely on a `spark` global pre-injected by the hosting platform.
spark = SparkSession.builder.getOrCreate()

# Sample reviews as (product_id, rating, review_date) tuples.
# None marks a missing rating; product 103 has no non-null rating at all.
data = [
    (101, 4.0, "2024-01-10"),
    (101, None, "2024-01-12"),
    (101, 5.0, "2024-01-15"),
    (102, 3.0, "2024-02-01"),
    (102, 2.0, "2024-02-05"),
    (102, None, "2024-02-07"),
    (103, None, "2024-03-01"),
    (103, None, "2024-03-10"),
    (104, 5.0, "2024-04-01"),
    (104, 4.0, "2024-04-03"),
]

# Column names only -- the column types are inferred by Spark from the data.
columns = ["product_id", "rating", "review_date"]

# Build the reviews DataFrame from the in-memory rows
df = spark.createDataFrame(data, columns)

# Display the input so the null ratings are visible before aggregating
df.show()


+----------+------+-----------+
|product_id|rating|review_date|
+----------+------+-----------+
|       101|   4.0| 2024-01-10|
|       101|  null| 2024-01-12|
|       101|   5.0| 2024-01-15|
|       102|   3.0| 2024-02-01|
|       102|   2.0| 2024-02-05|
|       102|  null| 2024-02-07|
|       103|  null| 2024-03-01|
|       103|  null| 2024-03-10|
|       104|   5.0| 2024-04-01|
|       104|   4.0| 2024-04-03|
+----------+------+-----------+



In [0]:
# Average rating per product. avg() skips null ratings on its own, so a product
# whose ratings are all null ends up with a null average.
avg_rating = (
    df.groupBy("product_id")
    .agg(avg("rating").alias("avg_rating"))
    # Drop only the products that have no usable rating. Restricting the null
    # check to avg_rating avoids also discarding a group whose product_id is null.
    .na.drop(subset=["avg_rating"])
)

# Show the per-product averages
avg_rating.show()


+----------+----------+
|product_id|avg_rating|
+----------+----------+
|       101|       4.5|
|       102|       2.5|
|       104|       4.5|
+----------+----------+

