# Problem Statement
-- Question: Write an SQL query to find, for each seller, whether the brand of the second item (by date)
-- they sold is their favorite brand.
-- If a seller sold fewer than two items, report the answer for that seller as NO.

### PySpark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)


In [0]:
from datetime import datetime
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType


def parse_date(date_str: str):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Args:
        date_str: Date text in the ``%Y-%m-%d`` format.

    Returns:
        The calendar date described by ``date_str``.

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    # strptime yields a datetime; only the date part is wanted.
    return parsed.date()

In [0]:
# Users table: every column is declared nullable.
users_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("user_id", IntegerType()),
            ("join_date", DateType()),
            ("favorite_brand", StringType()),
        )
    ]
)

# Join dates are written as text and converted to date objects row by row.
users_data = [
    (uid, parse_date(joined_on), brand)
    for uid, joined_on, brand in (
        (1, "2019-01-01", "Lenovo"),
        (2, "2019-02-09", "Samsung"),
        (3, "2019-01-19", "LG"),
        (4, "2019-05-21", "HP"),
    )
]

users_df = spark.createDataFrame(users_data, schema=users_schema)
users_df.display()

# Orders table: every column is declared nullable.
orders_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("order_id", IntegerType()),
            ("order_date", DateType()),
            ("item_id", IntegerType()),
            ("buyer_id", IntegerType()),
            ("seller_id", IntegerType()),
        )
    ]
)

# Order dates are written as text and converted to date objects row by row.
orders_data = [
    (oid, parse_date(ordered_on), item, buyer, seller)
    for oid, ordered_on, item, buyer, seller in (
        (1, "2019-08-01", 4, 1, 2),
        (2, "2019-08-02", 2, 1, 3),
        (3, "2019-08-03", 3, 2, 3),
        (4, "2019-08-04", 1, 4, 2),
        (5, "2019-08-04", 1, 3, 4),
        (6, "2019-08-05", 2, 2, 4),
    )
]

orders_df = spark.createDataFrame(orders_data, schema=orders_schema)
orders_df.display()

# Items table: maps an item id to its brand; both columns are declared nullable.
items_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("item_id", IntegerType()),
            ("item_brand", StringType()),
        )
    ]
)

items_data = [
    (1, "Samsung"),
    (2, "Lenovo"),
    (3, "LG"),
    (4, "HP"),
]

items_df = spark.createDataFrame(items_data, schema=items_schema)
items_df.display()

user_id,join_date,favorite_brand
1,2019-01-01,Lenovo
2,2019-02-09,Samsung
3,2019-01-19,LG
4,2019-05-21,HP


order_id,order_date,item_id,buyer_id,seller_id
1,2019-08-01,4,1,2
2,2019-08-02,2,1,3
3,2019-08-03,3,2,3
4,2019-08-04,1,4,2
5,2019-08-04,1,3,4
6,2019-08-05,2,2,4


item_id,item_brand
1,Samsung
2,Lenovo
3,LG
4,HP


In [0]:
# Join the DataFrames
from pyspark.sql.functions import col, row_number, count, when
from pyspark.sql.window import Window

# Start from users so that users who never sold anything are kept; the LEFT
# joins give such users a single row whose order and item columns are NULL.
joined_df = (
    users_df.alias("u")
    .join(orders_df.alias("o"), col("o.seller_id") == col("u.user_id"), "left")
    .join(items_df.alias("i"), col("i.item_id") == col("o.item_id"), "left")
)

# Define the window specifications.
# Sales are ranked per user by date; order_id breaks ties between sales made
# on the same date so that the ranking is deterministic across runs.
window_spec_user = Window.partitionBy("u.user_id").orderBy(
    "o.order_date", "o.order_id"
)
# Partition the count by the user as well (not by the nullable o.seller_id),
# so users without sales are not all gathered into one NULL partition.
window_spec_seller = Window.partitionBy("u.user_id")

# Add row number and total items sales columns.
# count() ignores NULLs, so a user with no sales gets total_items_sales == 0.
cte_df = joined_df.select(
    col("u.user_id"),
    col("u.favorite_brand"),
    row_number().over(window_spec_user).alias("rn"),
    col("i.item_brand"),
    count("o.item_id").over(window_spec_seller).alias("total_items_sales"),
)

# Keep one row per user: the second sale when it exists, otherwise the only
# row of a user with fewer than two sales.
second_sale_or_too_few = (col("rn") == 2) | (col("total_items_sales") < 2)

# "Yes" requires that the row really is the second sale. Without the rn check,
# a user whose single sale happens to be of their favorite brand would be
# reported as "Yes", although users with fewer than two sales must get "No".
second_sale_is_favorite = (col("rn") == 2) & (
    col("favorite_brand") == col("item_brand")
)

# Filter and add status column
result_df = cte_df.filter(second_sale_or_too_few).select(
    col("user_id"),
    when(second_sale_is_favorite, "Yes").otherwise("No").alias("status"),
)

result_df.display()

user_id,status
1,No
2,Yes
3,Yes
4,No


### Spark SQL

In [0]:
# Register each DataFrame as a temporary view so it can be queried by name in SQL.
for view_name, frame in (
    ("users", users_df),
    ("orders", orders_df),
    ("items", items_df),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql
-- For every user, report whether the second item they sold (by order date)
-- is of their favorite brand. Users with fewer than two sales get 'No'.
WITH cte AS (
  SELECT
    u.user_id,
    u.favorite_brand,
    -- Rank each user's sales by date; order_id breaks ties between sales on
    -- the same date so that the ranking is deterministic.
    row_number() OVER (
      PARTITION BY u.user_id
      ORDER BY
        o.order_date ASC,
        o.order_id ASC
    ) AS rn,
    i.item_brand,
    -- count() ignores the NULL row that the LEFT JOIN yields for users with
    -- no sales, so those users get 0 here.
    count(o.item_id) OVER (PARTITION BY u.user_id) AS total_items_sales
  FROM
    users u
    LEFT JOIN orders o ON o.seller_id = u.user_id
    LEFT JOIN items i ON i.item_id = o.item_id
)
SELECT
  user_id,
  CASE
    -- Only a genuine second sale may produce 'Yes'. Without the rn check, a
    -- user whose single sale is of their favorite brand would get 'Yes'.
    WHEN rn = 2 AND favorite_brand = item_brand THEN 'Yes'
    ELSE 'No'
  END AS status
FROM
  cte
WHERE
  rn = 2
  OR total_items_sales < 2;

user_id,status
1,No
2,Yes
3,Yes
4,No
