# Problem Statement
product_id is the primary key of the sales table.
period_start and period_end indicate the start and end dates of the sales period; both dates are inclusive.
The average_daily_sales column holds the average daily sales amount of the item for the period.
Write an SQL query to report the total sales amount of each item for each year, with the corresponding product_id, product_name and report_year. (Only the sales table is created in this notebook, so the solutions below report product_id, report_year and total_amount, without product_name.)

All sales dates fall between 2018 and 2020. Return the result table ordered by product_id and report_year.

The query result format is in the following example:

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, sequence, year, col, expr
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime

# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("CreateSalesTable")
    .getOrCreate()
)

# Sales table layout: one row per product, with an inclusive sales period
# and the average amount sold per day during that period.
# Every column is declared nullable.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("product_id", IntegerType()),
            ("period_start", DateType()),
            ("period_end", DateType()),
            ("average_daily_sales", IntegerType()),
        )
    ]
)

# Sample rows. Dates are written as ISO strings for readability and parsed
# into datetime.date objects, which is what DateType columns expect.
data = [
    (
        product_id,
        datetime.strptime(period_start, "%Y-%m-%d").date(),
        datetime.strptime(period_end, "%Y-%m-%d").date(),
        average_daily_sales,
    )
    for product_id, period_start, period_end, average_daily_sales in (
        (1, "2019-01-25", "2019-02-28", 100),
        (2, "2018-12-01", "2020-01-01", 10),
        (3, "2019-12-01", "2020-01-31", 1),
    )
]

# Build the sales DataFrame from the rows and the explicit schema.
df = spark.createDataFrame(data, schema)

# Print the table so the input is visible in the notebook.
df.show()

+----------+------------+----------+-------------------+
|product_id|period_start|period_end|average_daily_sales|
+----------+------------+----------+-------------------+
|         1|  2019-01-25|2019-02-28|                100|
|         2|  2018-12-01|2020-01-01|                 10|
|         3|  2019-12-01|2020-01-31|                  1|
+----------+------------+----------+-------------------+



# PySpark

In [0]:
# Fan each sales row out into one row per calendar day of its period.
# sequence() over two dates yields every day from start to end, both
# endpoints included, which matches the inclusive period definition.
expanded_df = df.select(
    "*",
    explode(sequence(col("period_start"), col("period_end"))).alias("day"),
)

# Tag every day with its calendar year, then add up the per-day amount
# for each (product, year) pair.
result_df = (
    expanded_df
    .withColumn("report_year", year(col("day")))
    .groupBy("product_id", "report_year")
    .agg(expr("sum(average_daily_sales)").alias("total_amount"))
)

# Print the totals ordered by product and year.
result_df.sort("product_id", "report_year").show()

+----------+-----------+------------+
|product_id|report_year|total_amount|
+----------+-----------+------------+
|         1|       2019|        3500|
|         2|       2018|         310|
|         2|       2019|        3650|
|         2|       2020|          10|
|         3|       2019|          31|
|         3|       2020|          31|
+----------+-----------+------------+



# Spark SQL

In [0]:
# Expose the sales DataFrame to Spark SQL under the table name "sales".
df.createOrReplaceTempView("sales")

In [0]:
# Spark SQL version of the same computation:
#   all_dates   - one row per calendar day of each sales period
#   daily_sales - the year of each day next to its daily amount
#   final query - per-product, per-year sum, ordered for display
yearly_sales_sql = """
    WITH all_dates AS (
        SELECT 
            product_id,
            period_start,
            period_end,
            average_daily_sales,
            explode(sequence(period_start, period_end, interval 1 day)) AS day
        FROM sales
    ),
    daily_sales AS (
        SELECT
            product_id,
            year(day) AS report_year,
            average_daily_sales
        FROM all_dates
    )
    SELECT
        product_id,
        report_year,
        SUM(average_daily_sales) AS total_amount
    FROM daily_sales
    GROUP BY product_id, report_year
    ORDER BY product_id, report_year
"""
result_df = spark.sql(yearly_sales_sql)

# Print the yearly totals.
result_df.show()

+----------+-----------+------------+
|product_id|report_year|total_amount|
+----------+-----------+------------+
|         1|       2019|        3500|
|         2|       2018|         310|
|         2|       2019|        3650|
|         2|       2020|          10|
|         3|       2019|          31|
|         3|       2020|          31|
+----------+-----------+------------+

