In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col,first
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("app").master("local[4]").getOrCreate()

In [0]:
schema = StructType([
    StructField("sale_id",IntegerType(),False),
    StructField("product_id",IntegerType(),False),
    StructField("year",IntegerType(),False),
    StructField("quantity",IntegerType(),False),
    StructField("price",IntegerType(),False)
])
data = [
( 1       , 100        , 2008 , 10       , 5000)  ,
( 2       , 100        , 2009 , 12       , 5000)  ,
( 7       , 200        , 2011 , 15       , 9000)  
]
sale = spark.createDataFrame(data,schema)
sale.show()

+-------+----------+----+--------+-----+
|sale_id|product_id|year|quantity|price|
+-------+----------+----+--------+-----+
|      1|       100|2008|      10| 5000|
|      2|       100|2009|      12| 5000|
|      7|       200|2011|      15| 9000|
+-------+----------+----+--------+-----+



In [0]:
schema = StructType([
    StructField("product_id",IntegerType(),False),
    StructField("product_name",StringType(),False)
])
data = [
    ( 100        , 'Nokia'       ) ,
    ( 200        , 'Apple'       ) ,
    ( 300        , 'Samsung'     ) 
]
prod = spark.createDataFrame(data,schema)
prod.show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|       100|       Nokia|
|       200|       Apple|
|       300|     Samsung|
+----------+------------+



In [0]:
# Write a solution to select the product name, year, quantity, and price for the first year of every product sold. Return the resulting table in any order.
window_spec= Window.partitionBy("product_id").orderBy("year")
sale\
    .withColumn("quantity",first("quantity").over(window_spec))\
    .withColumn("first_year",first("price").over(window_spec))\
    .withColumn("year",first("year").over(window_spec))\
    .join(prod,prod.product_id==sale.product_id,'inner')\
    .select("product_name","first_year","quantity","price").distinct().show()

+------------+----------+--------+-----+
|product_name|first_year|quantity|price|
+------------+----------+--------+-----+
|       Nokia|      5000|      10| 5000|
|       Apple|      9000|      15| 9000|
+------------+----------+--------+-----+



In [0]:
sale.createOrReplaceTempView("sale")
prod.createOrReplaceTempView("prod")
spark.sql("""select product_name,year first_year,quantity,price from sale s join prod p using (product_id) where (product_id,year) in (select product_id,min(year) from sale group by 1 )
                                    """).show()

+------------+----------+--------+-----+
|product_name|first_year|quantity|price|
+------------+----------+--------+-----+
|       Nokia|      2008|      10| 5000|
|       Apple|      2011|      15| 9000|
+------------+----------+--------+-----+



In [0]:
spark.stop()