In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DateType
from pyspark.sql.functions import col,year,count
from datetime import datetime

# Create (or reuse) the notebook's Spark session, running locally with four worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Users table: one row per marketplace user, with sign-up date and preferred brand.
schema = StructType(
    [
        StructField("user_id", IntegerType(), nullable=False),
        StructField("join_date", DateType(), nullable=False),
        StructField("favorite_brand", StringType(), nullable=False),
    ]
)

# Rows are (user_id, join_date, favorite_brand).
# Brand strings carry no surrounding whitespace so that they compare equal to
# the corresponding items.item_brand values (e.g. 'Lenovo').
data = [
    (1, datetime(2018, 1, 1), "Lenovo"),
    (2, datetime(2018, 2, 9), "Samsung"),
    (3, datetime(2018, 1, 19), "LG"),
    (4, datetime(2018, 5, 21), "HP"),
]

users = spark.createDataFrame(data, schema)
users.show()

+-------+----------+--------------+
|user_id| join_date|favorite_brand|
+-------+----------+--------------+
|      1|2018-01-01|       Lenovo |
|      2|2018-02-09|       Samsung|
|      3|2018-01-19|            LG|
|      4|2018-05-21|            HP|
+-------+----------+--------------+



In [0]:
# Orders table: each row is one purchase linking a buyer, a seller and an item.
schema = (
    StructType()
    .add("order_id", IntegerType(), False)
    .add("order_date", DateType(), False)
    .add("item_id", IntegerType(), False)
    .add("buyer_id", IntegerType(), False)
    .add("seller_id", IntegerType(), False)
)

# Rows are (order_id, order_date, item_id, buyer_id, seller_id).
data = [
    (1, datetime(2019, 8, 1), 4, 1, 2),
    (2, datetime(2018, 8, 2), 2, 1, 3),
    (3, datetime(2019, 8, 3), 3, 2, 3),
    (4, datetime(2018, 8, 4), 1, 4, 2),
    (5, datetime(2018, 8, 4), 1, 3, 4),
    (6, datetime(2019, 8, 5), 2, 2, 4),
]

orders = spark.createDataFrame(data, schema)
orders.show()

+--------+----------+-------+--------+---------+
|order_id|order_date|item_id|buyer_id|seller_id|
+--------+----------+-------+--------+---------+
|       1|2019-08-01|      4|       1|        2|
|       2|2018-08-02|      2|       1|        3|
|       3|2019-08-03|      3|       2|        3|
|       4|2018-08-04|      1|       4|        2|
|       5|2018-08-04|      1|       3|        4|
|       6|2019-08-05|      2|       2|        4|
+--------+----------+-------+--------+---------+



In [0]:
# Items table: maps each item id to the brand that makes it.
schema = StructType(
    [
        StructField("item_id", IntegerType(), nullable=False),
        StructField("item_brand", StringType(), nullable=False),
    ]
)

# Rows are (item_id, item_brand).
data = [
    (1, "Samsung"),
    (2, "Lenovo"),
    (3, "LG"),
    (4, "HP"),
]

items = spark.createDataFrame(data, schema)
items.show()

+-------+----------+
|item_id|item_brand|
+-------+----------+
|      1|   Samsung|
|      2|    Lenovo|
|      3|        LG|
|      4|        HP|
+-------+----------+



In [0]:
# Task: for each user, report the join date and the number of orders they made
# as a buyer in 2019. The result may be returned in any order.

# Step 1: count the 2019 orders per buyer. Buyers with no 2019 orders are
# absent from this frame.
buyer_order_counts = (
    orders
    .filter(year(col("order_date")) == 2019)
    .groupBy("buyer_id")
    .agg(count("*").alias("orders_in_2019"))
)

# Step 2: start from `users` so every user is kept, and attach the counts.
# The join condition refers to the aggregated frame's own column rather than
# to a column handle of the pre-aggregation `orders` DataFrame.
(
    users
    .join(buyer_order_counts, users.user_id == buyer_order_counts.buyer_id, "left")
    .select(col("user_id").alias("buyer_id"), "join_date", "orders_in_2019")
    # Users without a match have a null count; replace it with 0 in that
    # column only, leaving every other column untouched.
    .fillna(0, subset=["orders_in_2019"])
    .show()
)

+--------+----------+--------------+
|buyer_id| join_date|orders_in_2019|
+--------+----------+--------------+
|       1|2018-01-01|             1|
|       2|2018-02-09|             2|
|       3|2018-01-19|             0|
|       4|2018-05-21|             0|
+--------+----------+--------------+



In [0]:
# Register both DataFrames as temp views so they can be queried with Spark SQL.
users.createOrReplaceTempView("u")
orders.createOrReplaceTempView("o")

# Variant 1 (with a CTE): count the 2019 orders per buyer first, then
# right-join onto the users view and default missing counts to 0.
cte_query = """with cte as
          (select buyer_id, count(*) as orders_in_2019 from o where year(order_date)=2019 group by 1) 
          select u.user_id buyer_id,u.join_date,ifnull(c.orders_in_2019,0) orders_in_2019 from cte c right join u on u.user_id=c.buyer_id"""
spark.sql(cte_query).show()

# Variant 2 (without a CTE): the year filter lives in the join condition, so
# users with no 2019 orders still produce a row, and count(o.buyer_id) is 0
# for them because it skips nulls.
join_query = """
          select u.user_id buyer_id,min(u.join_date) join_date ,count(o.buyer_id) orders_in_2019 from o right join u on u.user_id=o.buyer_id and year(o.order_date)=2019 group by 1"""
spark.sql(join_query).show()

+--------+----------+--------------+
|buyer_id| join_date|orders_in_2019|
+--------+----------+--------------+
|       1|2018-01-01|             1|
|       2|2018-02-09|             2|
|       3|2018-01-19|             0|
|       4|2018-05-21|             0|
+--------+----------+--------------+

+--------+----------+--------------+
|buyer_id| join_date|orders_in_2019|
+--------+----------+--------------+
|       1|2018-01-01|             1|
|       2|2018-02-09|             2|
|       3|2018-01-19|             0|
|       4|2018-05-21|             0|
+--------+----------+--------------+



In [0]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()