# Problem Statement
You are the business owner and would like to obtain a sales report for category items and day of the week.

Write an SQL query to report how many units in each category have been ordered on each day of the week.

Return the result table ordered by category.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, dayofweek, sum as _sum, when, coalesce, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Orders and Items DataFrame")
    .getOrCreate()
)

# Column layout of the Orders table; every column is declared nullable.
orders_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("customer_id", IntegerType(), True)
    # Loaded as text for now; the column is cast to a date after the
    # DataFrame has been created.
    .add("order_date", StringType(), True)
    .add("item_id", IntegerType(), True)
    .add("quantity", IntegerType(), True)
)

# Column layout of the Items table; every column is declared nullable.
items_schema = (
    StructType()
    .add("item_id", IntegerType(), True)
    .add("item_name", StringType(), True)
    .add("item_category", StringType(), True)
)

# Sample data for Orders.
# Each tuple follows orders_schema:
#   (order_id, customer_id, order_date, item_id, quantity)
# order_date is an ISO "YYYY-MM-DD" string at this point; it is cast to a
# date type once the DataFrame has been created.
orders_data = [
    (1, 1, "2020-06-01", 1, 10),  # Monday
    (2, 1, "2020-06-08", 2, 10),  # Monday
    (3, 2, "2020-06-02", 1, 5),   # Tuesday
    (4, 3, "2020-06-03", 3, 5),   # Wednesday
    (5, 4, "2020-06-04", 4, 1),   # Thursday
    (6, 4, "2020-06-05", 5, 5),   # Friday
    (7, 5, "2020-06-05", 1, 10),  # Friday
    (8, 5, "2020-06-14", 4, 5),   # Sunday
    (9, 5, "2020-06-21", 3, 5)    # Sunday
]

# Sample data for Items.
# Each tuple follows items_schema: (item_id, item_name, item_category)
items_data = [
    (1, "LC Alg. Book", "Book"),
    (2, "LC DB. Book", "Book"),
    (3, "LC SmarthPhone", "Phone"),
    (4, "LC Phone 2020", "Phone"),
    (5, "LC SmartGlass", "Glasses"),
    (6, "LC T-Shirt XL", "T-Shirt")  # no row in orders_data references item 6
]

# Build the Items DataFrame directly from the sample rows.
items_df = spark.createDataFrame(items_data, items_schema)

# Build the Orders DataFrame, then replace the text order_date column with a
# proper date column so date functions can be applied to it.
orders_df = spark.createDataFrame(orders_data, orders_schema).withColumn(
    "order_date", col("order_date").cast("date")
)

# Sales report: units ordered per item category for each day of the week,
# one row per category and one column per weekday, sorted by category.

# Spark's dayofweek() numbers the days 1 = Sunday ... 7 = Saturday.
# The list order is the column order of the report (Monday first).
weekday_numbers = [
    ("Monday", 2),
    ("Tuesday", 3),
    ("Wednesday", 4),
    ("Thursday", 5),
    ("Friday", 6),
    ("Saturday", 7),
    ("Sunday", 1),
]

# One conditional sum per weekday. when() without otherwise() yields NULL for
# rows on other days, and a sum over only NULLs is NULL, so coalesce() turns
# "nothing ordered that day" into 0.
weekday_totals = [
    coalesce(
        _sum(when(dayofweek(col("order_date")) == day_number, col("quantity"))),
        lit(0),
    ).alias(day_name)
    for day_name, day_number in weekday_numbers
]

# The left join starts from Items so that categories with no orders at all
# still appear in the report (with 0 in every weekday column).
# The grouping column is exposed as "Category" to match the SQL version of
# this report.
joined_df = (
    items_df.join(orders_df, items_df["item_id"] == orders_df["item_id"], "left")
    .groupBy(col("item_category").alias("Category"))
    .agg(*weekday_totals)
    .orderBy("Category")
)

# Show the result DataFrame
joined_df.display()

In [0]:
# Print the schemas of the salary and employee DataFrames.
# NOTE(review): salary_df and employee_df are not created anywhere in the
# visible part of this notebook — presumably left over from another exercise
# or defined earlier in the session; confirm before re-running this cell.
salary_df.printSchema()
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- pay_date: date (nullable = true)
 |-- pay_month: double (nullable = true)

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)



In [0]:
# Print the column names, types and nullability of both input DataFrames.
for frame in (items_df, orders_df):
    frame.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_category: string (nullable = true)

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [0]:
# Register both DataFrames as temporary views so they can be queried by name
# from Spark SQL.
for view_name, frame in (("Items", items_df), ("Orders", orders_df)):
    frame.createOrReplaceTempView(view_name)


In [0]:
%sql
-- Units ordered per item category, pivoted into one column per weekday.
-- dayofweek() returns 1 for Sunday through 7 for Saturday.
-- The LEFT JOIN starts from Items so that categories without any orders
-- still appear; COALESCE turns their NULL sums into 0.
SELECT
    i.item_category AS Category,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 2 THEN o.quantity END), 0) AS Monday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 3 THEN o.quantity END), 0) AS Tuesday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 4 THEN o.quantity END), 0) AS Wednesday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 5 THEN o.quantity END), 0) AS Thursday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 6 THEN o.quantity END), 0) AS Friday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 7 THEN o.quantity END), 0) AS Saturday,
    COALESCE(SUM(CASE WHEN dayofweek(o.order_date) = 1 THEN o.quantity END), 0) AS Sunday
FROM Items AS i
LEFT JOIN Orders AS o
    ON i.item_id = o.item_id
GROUP BY i.item_category
ORDER BY Category
;

Category,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
Book,20,5,0,0,10,0,0
Glasses,0,0,0,0,5,0,0
Phone,0,0,5,1,0,0,10
T-Shirt,0,0,0,0,0,0,0
