In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DateType
from pyspark.sql.functions import col,date_format,sum
from datetime import datetime

# Local Spark session limited to two threads ("local[2]"); getOrCreate()
# returns the already-active session when one exists.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Products table: every column is declared non-nullable.
schema = StructType(
    [
        StructField("product_id", IntegerType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
    ]
)

# One tuple per product, values listed in schema column order.
data = [
    (1, "Leetcode Solutions", "Book"),
    (2, "Jewels of Stringology", "Book"),
    (3, "HP", "Laptop"),
    (4, "Lenovo", "Laptop"),
    (5, "Leetcode Kit", "T-shirt"),
]

products = spark.createDataFrame(data, schema=schema)
products.show(truncate=False)

+----------+---------------------+----------------+
|product_id|product_name         |product_category|
+----------+---------------------+----------------+
|1         |Leetcode Solutions   |Book            |
|2         |Jewels of Stringology|Book            |
|3         |HP                   |Laptop          |
|4         |Lenovo               |Laptop          |
|5         |Leetcode Kit         |T-shirt         |
+----------+---------------------+----------------+



In [0]:
# Orders table: one row per order line, every column declared non-nullable.
schema = StructType(
    [
        StructField("product_id", IntegerType(), nullable=False),
        StructField("order_date", DateType(), nullable=False),
        StructField("unit", IntegerType(), nullable=False),
    ]
)

# (product_id, order_date, unit) tuples, values listed in schema column order.
data = [
    (1, datetime(2020, 2, 5), 60),
    (1, datetime(2020, 2, 10), 70),
    (2, datetime(2020, 1, 18), 30),
    (2, datetime(2020, 2, 11), 80),
    (3, datetime(2020, 2, 17), 2),
    (3, datetime(2020, 2, 24), 3),
    (4, datetime(2020, 3, 1), 20),
    (4, datetime(2020, 3, 4), 30),
    (4, datetime(2020, 3, 4), 60),
    (5, datetime(2020, 2, 25), 50),
    (5, datetime(2020, 2, 27), 50),
    (5, datetime(2020, 3, 1), 50),
]

orders = spark.createDataFrame(data, schema=schema)
orders.show(truncate=False)


+----------+----------+----+
|product_id|order_date|unit|
+----------+----------+----+
|1         |2020-02-05|60  |
|1         |2020-02-10|70  |
|2         |2020-01-18|30  |
|2         |2020-02-11|80  |
|3         |2020-02-17|2   |
|3         |2020-02-24|3   |
|4         |2020-03-01|20  |
|4         |2020-03-04|30  |
|4         |2020-03-04|60  |
|5         |2020-02-25|50  |
|5         |2020-02-27|50  |
|5         |2020-03-01|50  |
+----------+----------+----+



In [0]:
# Write a solution to get the names of products that have at least 100 units ordered in February 2020 and their amount.
# Return the result table in any order.

# Steps:
#   1. Keep only the orders whose order_date falls in February 2020.
#   2. Total the units per product and keep products with at least 100 units.
#   3. Look up the product name. The join is done on the column *name* so the
#      result carries a single product_id column and the condition does not
#      depend on a DataFrame alias still resolving after the aggregation.
(
    orders
    .filter(date_format("order_date", "yyyy-MM") == "2020-02")
    .groupBy("product_id")
    # `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
    .agg(sum("unit").alias("unit"))
    .filter(col("unit") >= 100)
    .join(products, on="product_id", how="inner")
    .select("product_name", "unit")
    .show()
)

+------------------+----+
|      product_name|unit|
+------------------+----+
|Leetcode Solutions| 130|
|      Leetcode Kit| 100|
+------------------+----+



In [0]:
# Register both DataFrames as temp views under the short names the query uses.
orders.createOrReplaceTempView("o")
products.createOrReplaceTempView("p")

# Joins orders to products on product_id, keeps February 2020 orders, totals
# the units per product and shows the products whose total is at least 100.
spark.sql(
    "select p.product_name,sum(unit) as unit from o join p using(product_id) where date_format(order_date,'yyyy-MM')=='2020-02' group by o.product_id,p.product_name having sum(unit)>=100"
).show()

+------------------+----+
|      product_name|unit|
+------------------+----+
|Leetcode Solutions| 130|
|      Leetcode Kit| 100|
+------------------+----+



In [0]:
# Stop the Spark session created in the first cell; later cells cannot use `spark` after this.
spark.stop()