In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType,StringType,DateType
from pyspark.sql.functions import col,quarter
from datetime import datetime

# Build (or reuse) the SparkSession shared by every cell below.
# master("local[2]") runs Spark in-process on this machine with two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Product dimension: one row per product, every column declared non-nullable.
schema = (
    StructType()
    .add("product_id", IntegerType(), False)
    .add("product_name", StringType(), False)
    .add("unit_price", IntegerType(), False)
)

# (product_id, product_name, unit_price)
data = [
    (1, "S8", 1000),
    (2, "G4", 800),
    (3, "iPhone", 1400),
]

product = spark.createDataFrame(data, schema)
product.show()

+----------+------------+----------+
|product_id|product_name|unit_price|
+----------+------------+----------+
|         1|          S8|      1000|
|         2|          G4|       800|
|         3|      iPhone|      1400|
+----------+------------+----------+



In [0]:
# Sales facts: one row per sale, every column declared non-nullable.
schema = (
    StructType()
    .add("seller_id", IntegerType(), False)
    .add("product_id", IntegerType(), False)
    .add("buyer_id", IntegerType(), False)
    .add("sale_date", DateType(), False)
    .add("quantity", IntegerType(), False)
    .add("price", IntegerType(), False)
)

# (seller_id, product_id, buyer_id, sale_date, quantity, price)
data = [
    (1, 1, 1, datetime(2019, 1, 21), 2, 2000),
    (1, 2, 2, datetime(2019, 2, 17), 1, 800),
    (2, 2, 3, datetime(2019, 6, 2), 1, 800),
    (3, 3, 4, datetime(2019, 5, 13), 2, 2800),
]

sales = spark.createDataFrame(data, schema)
sales.show()

+---------+----------+--------+----------+--------+-----+
|seller_id|product_id|buyer_id| sale_date|quantity|price|
+---------+----------+--------+----------+--------+-----+
|        1|         1|       1|2019-01-21|       2| 2000|
|        1|         2|       2|2019-02-17|       1|  800|
|        2|         2|       3|2019-06-02|       1|  800|
|        3|         3|       4|2019-05-13|       2| 2800|
+---------+----------+--------+----------+--------+-----+



In [0]:
# Report the products that were sold ONLY in the first quarter of 2019,
# i.e. every sale of the product falls between 2019-01-01 and 2019-03-31 inclusive.
# The result may be returned in any order.

# Explicit window bounds. Filtering on quarter(sale_date) alone ignores the
# year, so a sale in Q1 of any other year would wrongly count as "in range".
q1_2019_start = datetime(2019, 1, 1).date()
q1_2019_end = datetime(2019, 3, 31).date()

# Products with at least one sale outside the window. Kept as a DataFrame so
# the exclusion is done by Spark instead of collecting ids to the driver.
products_to_skip = (
    sales
    .where(~col("sale_date").between(q1_2019_start, q1_2019_end))
    .select("product_id")
    .distinct()
)

(
    sales
    # left_anti keeps only sales whose product never appears in products_to_skip.
    .join(products_to_skip, on="product_id", how="left_anti")
    # Joining on the column name yields a single, unambiguous product_id column.
    .join(product, on="product_id", how="inner")
    .select("product_id", "product_name")
    .distinct()
    .show()
)

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|          S8|
+----------+------------+



In [0]:
# Same report expressed in Spark SQL: products whose every sale falls inside
# 2019-01-01 .. 2019-03-31 (inclusive).
sales.createOrReplaceTempView("s")
product.createOrReplaceTempView("p")

# The subquery lists products with any sale outside the window. It compares
# full dates rather than quarter(sale_date), which would ignore the year and
# treat a Q1 sale from another year as being in range.
spark.sql("""
    select distinct s.product_id, p.product_name
    from s
    join p using (product_id)
    where product_id not in (
        select product_id
        from s
        where sale_date not between date '2019-01-01' and date '2019-03-31'
    )
""").show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|          S8|
+----------+------------+



In [0]:
# Shut down the SparkSession created in the first cell now that both solutions have run.
spark.stop()