<a href="https://colab.research.google.com/github/sandeepgundeboina/LearningSpark/blob/main/SparkDeltaWindow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install delta-spark==2.0.0
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("SparkDeltaWindow") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

In [None]:
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Load the products CSV with a header row and inferred column types.
# The file spells missing values as the literal text "NULL". Without `nullValue`
# Spark keeps that text as data, which forces MRP to be read as a string column
# and makes lag/lead hand back the text "NULL" instead of a real null.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs — confirm.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', True)
    .option('nullValue', 'NULL')
    .load('/content/drive/MyDrive/Abc/Ola_data/products.csv')
)

In [None]:
# Preview the loaded products using show()'s defaults, spelled out explicitly:
# at most 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+-------------------+---------------+----------+--------+------+----------+-------+
|            product|          store|product_id|store_id|   MRP|     state|country|
+-------------------+---------------+----------+--------+------+----------+-------+
|      Wired Earbuds|Electronics Hub|   PROD101| STORE01| 25.99|California|    USA|
|   Portable Charger|    Mobile Mart|   PROD102| STORE02|  NULL|     Texas|    USA|
|  Bluetooth Speaker|Electronics Hub|   PROD103| STORE01| 79.50|California|    USA|
|   Smart Light Bulb|   Home Gadgets|   PROD104| STORE03| 15.00|   Florida|    USA|
|               NULL|    Mobile Mart|   PROD105| STORE02| 49.99|     Texas|    USA|
|     Wireless Mouse|Electronics Hub|   PROD106| STORE01|  NULL|California|    USA|
|Mechanical Keyboard|      PC Palace|   PROD107| STORE04|120.00|  New York|    USA|
|             Webcam|Electronics Hub|   PROD108| STORE01| 55.00|      NULL|    USA|
|     Gaming Headset|      PC Palace|   PROD109| STORE04| 90.00|  New York| 

In [None]:
from pyspark.sql import functions as F

In [None]:
# Add a column holding the name of the file each row was read from.
df_details = df.withColumn(colName='file_details', col=F.input_file_name())

In [None]:
# Disable truncation: with the default 20-character limit every file_details
# value is cut to "file:///content/d...", which hides the actual file path.
df_details.show(truncate=False)

+-------------------+---------------+----------+--------+------+----------+-------+--------------------+
|            product|          store|product_id|store_id|   MRP|     state|country|        file_details|
+-------------------+---------------+----------+--------+------+----------+-------+--------------------+
|      Wired Earbuds|Electronics Hub|   PROD101| STORE01| 25.99|California|    USA|file:///content/d...|
|   Portable Charger|    Mobile Mart|   PROD102| STORE02|  NULL|     Texas|    USA|file:///content/d...|
|  Bluetooth Speaker|Electronics Hub|   PROD103| STORE01| 79.50|California|    USA|file:///content/d...|
|   Smart Light Bulb|   Home Gadgets|   PROD104| STORE03| 15.00|   Florida|    USA|file:///content/d...|
|               NULL|    Mobile Mart|   PROD105| STORE02| 49.99|     Texas|    USA|file:///content/d...|
|     Wireless Mouse|Electronics Hub|   PROD106| STORE01|  NULL|California|    USA|file:///content/d...|
|Mechanical Keyboard|      PC Palace|   PROD107| STORE0

In [None]:
from pyspark.sql.window import Window,WindowSpec

In [None]:
# Window over rows that share a state, ordered by product_id inside each state.
windowSpec = (
    Window
    .partitionBy('state')
    .orderBy('product_id')
)

In [None]:
# MRP of the previous row in the same state window (null when there is no previous row).
df_new = df.withColumn(
    'MRP_lag',
    F.lag('MRP', offset=1).over(windowSpec),
)

In [None]:
# Display the lag result using show()'s defaults, spelled out explicitly:
# at most 20 rows, long cell values truncated.
df_new.show(n=20, truncate=True)

+--------------------+---------------+----------+--------+------+----------+-------+-------+
|             product|          store|product_id|store_id|   MRP|     state|country|MRP_lag|
+--------------------+---------------+----------+--------+------+----------+-------+-------+
|       Wired Earbuds|Electronics Hub|   PROD101| STORE01| 25.99|California|    USA|   null|
|   Bluetooth Speaker|Electronics Hub|   PROD103| STORE01| 79.50|California|    USA|  25.99|
|      Wireless Mouse|Electronics Hub|   PROD106| STORE01|  NULL|California|    USA|  79.50|
|             USB Hub|Electronics Hub|   PROD112| STORE01|  NULL|California|    USA|   NULL|
|        External SSD|Electronics Hub|   PROD114| STORE01|150.00|California|    USA|   NULL|
|             Scanner|Electronics Hub|   PROD118| STORE01|180.00|California|   NULL| 150.00|
|               Drone|Electronics Hub|   PROD123| STORE01|300.00|California|    USA| 180.00|
|         Memory Card|Electronics Hub|   PROD126| STORE01| 20.00|Calif

In [None]:
# MRP of the next row in the same state window (null when there is no next row).
# The column is named 'MRP_lead' so its casing matches the sibling 'MRP_lag' column.
df_new = df_new.withColumn('MRP_lead', F.lead('MRP', 1).over(windowSpec))
df_new.show()

+--------------------+---------------+----------+--------+------+----------+-------+-------+--------+
|             product|          store|product_id|store_id|   MRP|     state|country|MRP_lag|MRp_lead|
+--------------------+---------------+----------+--------+------+----------+-------+-------+--------+
|       Wired Earbuds|Electronics Hub|   PROD101| STORE01| 25.99|California|    USA|   null|   79.50|
|   Bluetooth Speaker|Electronics Hub|   PROD103| STORE01| 79.50|California|    USA|  25.99|    NULL|
|      Wireless Mouse|Electronics Hub|   PROD106| STORE01|  NULL|California|    USA|  79.50|    NULL|
|             USB Hub|Electronics Hub|   PROD112| STORE01|  NULL|California|    USA|   NULL|  150.00|
|        External SSD|Electronics Hub|   PROD114| STORE01|150.00|California|    USA|   NULL|  180.00|
|             Scanner|Electronics Hub|   PROD118| STORE01|180.00|California|   NULL| 150.00|  300.00|
|               Drone|Electronics Hub|   PROD123| STORE01|300.00|California|    US

In [None]:
# Print the physical plan of the lag/lead window query in 'formatted' mode.
df_new.explain(mode='formatted')

== Physical Plan ==
AdaptiveSparkPlan (5)
+- Window (4)
   +- Sort (3)
      +- Exchange (2)
         +- Scan csv  (1)


(1) Scan csv 
Output [7]: [product#16, store#17, product_id#18, store_id#19, MRP#20, state#21, country#22]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/Abc/Ola_data/products.csv]
ReadSchema: struct<product:string,store:string,product_id:string,store_id:string,MRP:string,state:string,country:string>

(2) Exchange
Input [7]: [product#16, store#17, product_id#18, store_id#19, MRP#20, state#21, country#22]
Arguments: hashpartitioning(state#21, 200), ENSURE_REQUIREMENTS, [plan_id=205]

(3) Sort
Input [7]: [product#16, store#17, product_id#18, store_id#19, MRP#20, state#21, country#22]
Arguments: [state#21 ASC NULLS FIRST, product_id#18 ASC NULLS FIRST], false, 0

(4) Window
Input [7]: [product#16, store#17, product_id#18, store_id#19, MRP#20, state#21, country#22]
Arguments: [lag(MRP#20, -1, null) windowspecdefinition(state#21, product_id#18 ASC

##### **END OF CODE**