In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,DateType
from pyspark.sql.functions import col,round, min,sum,count_distinct,when
from pyspark.sql.window import Window
from datetime import datetime

# Reuse an active SparkSession if one exists, otherwise start a local one
# named "app" that runs with three worker threads.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:
# Every column is declared non-nullable; the two date columns use Spark's DateType.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("delivery_id", IntegerType()),
        ("customer_id", IntegerType()),
        ("order_date", DateType()),
        ("customer_pref_delivery_date", DateType()),
    )
])

# Sample rows, listed as (delivery_id, customer_id, order day, preferred delivery day).
# Every date falls in August 2019, so only the day of the month is spelled out.
data = [
    (delivery_id, customer_id, datetime(2019, 8, order_day), datetime(2019, 8, preferred_day))
    for delivery_id, customer_id, order_day, preferred_day in (
        (1, 1, 1, 2),
        (2, 2, 2, 2),
        (3, 1, 11, 12),
        (4, 3, 24, 24),
        (5, 3, 21, 22),
        (6, 2, 11, 13),
        (7, 4, 9, 9),
    )
]

delivery = spark.createDataFrame(data, schema)
delivery.show()

+-----------+-----------+----------+---------------------------+
|delivery_id|customer_id|order_date|customer_pref_delivery_date|
+-----------+-----------+----------+---------------------------+
|          1|          1|2019-08-01|                 2019-08-02|
|          2|          2|2019-08-02|                 2019-08-02|
|          3|          1|2019-08-11|                 2019-08-12|
|          4|          3|2019-08-24|                 2019-08-24|
|          5|          3|2019-08-21|                 2019-08-22|
|          6|          2|2019-08-11|                 2019-08-13|
|          7|          4|2019-08-09|                 2019-08-09|
+-----------+-----------+----------+---------------------------+



In [0]:
# Problem statement
# -----------------
# * An order is "immediate" when the customer's preferred delivery date equals
#   the order date; otherwise it is "scheduled".
# * A customer's first order is the one with the earliest order date. It is
#   guaranteed that every customer has exactly one first order.
# * Goal: the percentage of immediate orders among the first orders of all
#   customers, rounded to 2 decimal places.
#
# NOTE: `min`, `sum` and `round` below are the pyspark.sql.functions versions
# imported at the top of the notebook, not the Python builtins.

# One window per customer, used to attach that customer's earliest order date to every row.
window_spec = Window.partitionBy("customer_id")

# Keep only each customer's first order. Filtering explicitly (instead of
# comparing the earliest order date against the preferred date on every row)
# means a later order can never be counted, whatever its preferred date is.
first_orders = (
    delivery
    .withColumn("first_order", min("order_date").over(window_spec))
    .filter(col("order_date") == col("first_order"))
)

# 1 for an immediate order, 0 for a scheduled one.
is_immediate = when(col("order_date") == col("customer_pref_delivery_date"), 1).otherwise(0)

# Numerator and denominator are computed in the same aggregation, so no
# separate distinct-count action is needed.
first_orders.agg(
    round(100 * sum(is_immediate) / count_distinct("customer_id"), 2).alias("immediate_percentage")
).show()

+--------------------+
|immediate_percentage|
+--------------------+
|                50.0|
+--------------------+



In [0]:
# Same metric expressed in Spark SQL against a temporary view of `delivery`.
delivery.createOrReplaceTempView("del")

# The CTE attaches each customer's earliest order date to every row; the outer
# query then keeps only the first-order rows (a window result cannot be
# filtered in the same SELECT that computes it) and measures the share of
# them whose preferred delivery date equals the order date.
spark.sql("""
    with first_orders as (
        select customer_id,
               order_date,
               customer_pref_delivery_date,
               min(order_date) over (partition by customer_id) as first_order
        from del
    )
    select round(
               100 * sum(case when order_date = customer_pref_delivery_date then 1 else 0 end)
                   / count(distinct customer_id),
               2
           ) as immediate_percentage
    from first_orders
    where order_date = first_order
""").show()

+--------------------+
|immediate_percentage|
+--------------------+
|                50.0|
+--------------------+



In [0]:
# Stop the Spark session now that every cell above has produced its output.
spark.stop()