In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StructType,StringType,IntegerType,DateType
from pyspark.sql.functions import col,sum,cast,round,min,datediff,lit
from pyspark.sql.window import Window
from datetime import datetime

# Local Spark session for this notebook: master "local[3]" runs Spark in-process
# with three worker threads; getOrCreate() reuses a session if one already exists.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:

# Schema of the restaurant visits table; every column is declared non-nullable.
schema = StructType(
    [
        StructField("customer_id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
        StructField("visited_on", DateType(), nullable=False),
        StructField("amount", IntegerType(), nullable=False),
    ]
)

# One tuple per visit: (customer_id, name, visited_on, amount).
# 2019-01-10 has two visits (customers 1 and 3), so a day can span several rows.
data = [
    (1, "Jhon", datetime(2019, 1, 1), 100),
    (2, "Daniel", datetime(2019, 1, 2), 110),
    (3, "Jade", datetime(2019, 1, 3), 120),
    (4, "Khaled", datetime(2019, 1, 4), 130),
    (5, "Winston", datetime(2019, 1, 5), 110),
    (6, "Elvis", datetime(2019, 1, 6), 140),
    (7, "Anna", datetime(2019, 1, 7), 150),
    (8, "Maria", datetime(2019, 1, 8), 80),
    (9, "Jaze", datetime(2019, 1, 9), 110),
    (1, "Jhon", datetime(2019, 1, 10), 130),
    (3, "Jade", datetime(2019, 1, 10), 150),
]

sales = spark.createDataFrame(data, schema)
sales.show()

+-----------+-------+----------+------+
|customer_id|   name|visited_on|amount|
+-----------+-------+----------+------+
|          1|   Jhon|2019-01-01|   100|
|          2| Daniel|2019-01-02|   110|
|          3|   Jade|2019-01-03|   120|
|          4| Khaled|2019-01-04|   130|
|          5|Winston|2019-01-05|   110|
|          6|  Elvis|2019-01-06|   140|
|          7|   Anna|2019-01-07|   150|
|          8|  Maria|2019-01-08|    80|
|          9|   Jaze|2019-01-09|   110|
|          1|   Jhon|2019-01-10|   130|
|          3|   Jade|2019-01-10|   150|
+-----------+-------+----------+------+



In [0]:
# You are the restaurant owner and you want to analyze a possible expansion (there will be at least one customer every day).
# Compute the moving average of how much customers paid over a seven-day window (i.e., current day + 6 days before).
# average_amount should be rounded to two decimal places.
# Return the result table ordered by visited_on in ascending order.

# Length of the moving window in calendar days (current day + 6 days before).
WINDOW_DAYS = 7

# Earliest visit date in the data; the first complete window ends WINDOW_DAYS - 1 days after it.
# NOTE: `min`, `sum` and `round` below are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins.
min_date = sales.select("visited_on").agg(min("visited_on")).collect()

# Whole calendar days elapsed since the earliest visit. Ordering the window by this
# integer (instead of epoch seconds) keeps the frame exactly WINDOW_DAYS wide regardless
# of the session time zone or daylight-saving changes.
day_offset = datediff(col("visited_on"), lit(min_date[0][0]).cast("date"))

# Range frame: every row whose day offset lies in [current - 6, current]. Rows that share
# a date are peers, so each of them receives the same windowed sum.
window_spec = Window.orderBy(day_offset).rangeBetween(-(WINDOW_DAYS - 1), 0)

(
    sales
    .withColumn("amount", sum("amount").over(window_spec))
    # Same-day rows carry identical sums, so keeping any one of them is safe.
    .dropDuplicates(["visited_on"])
    # Drop the leading days that do not yet have a full window behind them.
    .filter(day_offset >= WINDOW_DAYS - 1)
    .select(
        "visited_on",
        "amount",
        # Divide by the window length: the task guarantees at least one customer every day.
        round(col("amount") / WINDOW_DAYS, 2).alias("average_amount"),
    )
    # The task requires ascending visited_on; dropDuplicates does not guarantee any order.
    .orderBy("visited_on")
    .show()
)

+----------+------+--------------+
|visited_on|amount|average_amount|
+----------+------+--------------+
|2019-01-07|   860|        122.86|
|2019-01-08|   840|         120.0|
|2019-01-09|   840|         120.0|
|2019-01-10|  1000|        142.86|
+----------+------+--------------+



In [0]:
# Same moving-average report expressed in Spark SQL.
# The CTE computes, per visit date, the total amount over the current day and the six
# days before it; `distinct` collapses same-day rows, which all carry the same sum.
# The outer query keeps only dates with a full seven-day window behind them and sorts the
# result by visited_on, as the task requires (without ORDER BY the row order is not guaranteed).
sales.createOrReplaceTempView("sales")
spark.sql("""
    with cte as (
        select distinct
            visited_on,
            sum(amount) over (
                order by cast(visited_on as timestamp)
                range between interval 6 days preceding and current row
            ) as amount
        from sales
    )
    select
        visited_on,
        amount,
        round(amount / 7, 2) as average_amount
    from cte
    where date_diff(
              cast(visited_on as timestamp),
              (select cast(min(visited_on) as timestamp) from sales)
          ) >= 6
    order by visited_on
""").show()

+----------+------+--------------+
|visited_on|amount|average_amount|
+----------+------+--------------+
|2019-01-07|   860|        122.86|
|2019-01-08|   840|         120.0|
|2019-01-09|   840|         120.0|
|2019-01-10|  1000|        142.86|
+----------+------+--------------+



In [0]:
# Stop the SparkSession created at the top of the notebook now that both solutions have been displayed.
spark.stop()