In [1]:
# Configure the environment PySpark reads at start-up: where Spark is
# installed, and which Python interpreter it should use.
import os
import sys

# NOTE(review): SPARK_HOME is a machine-specific Windows path -- adjust it to
# your own Spark install. PYSPARK_PYTHON is set to the interpreter that is
# running this notebook.
os.environ.update(
    {
        "SPARK_HOME": r"C:\_dev\spark-3.5.1-hadoop3",
        "PYSPARK_PYTHON": sys.executable,
    }
)

# Driver overrides, intentionally left disabled:
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
# os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'

In [4]:
# importing pyspark
from pyspark.sql.window import Window
import pyspark

# importing sparksession
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session registered under the app name
# "pyspark_window".
spark = (
    SparkSession.builder
    .appName("pyspark_window")
    .getOrCreate()
)

# Employee rows in the order (Employee_Name, Age, Department, Salary).
# The ("Ram", 28, "Sales", 3000) row is present twice, which gives the
# window functions below a pair of tied rows to work with.
sampleData = (
    ("Ram", 28, "Sales", 3000),
    ("Meena", 33, "Sales", 4600),
    ("Robin", 40, "Sales", 4100),
    ("Kunal", 25, "Finance", 3000),
    ("Ram", 28, "Sales", 3000),
    ("Srishti", 46, "Management", 3300),
    ("Jeny", 26, "Finance", 3900),
    ("Hitesh", 30, "Marketing", 3000),
    ("Kailash", 29, "Marketing", 2000),
    ("Sharad", 39, "Sales", 4100),
)

# Column names, matching the position of each field in a row.
columns = [
    "Employee_Name",
    "Age",
    "Department",
    "Salary",
]

# Only names are supplied as the schema, so Spark infers the column types.
df = spark.createDataFrame(sampleData, columns)


In [5]:
# Window specification reused by the window-function cells: rows are grouped
# by Department and ordered by Age inside each group.
windowPartition = (
    Window
    .partitionBy("Department")
    .orderBy("Age")
)

# Inspect the inferred column types first, then the rows themselves.
df.printSchema()
df.show()

root
 |-- Employee_Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



In [6]:
# cume_dist() lives in pyspark.sql.functions
from pyspark.sql.functions import cume_dist

# Append a "cume_dist" column: the cumulative distribution of each row within
# its Department partition, ordered by Age. For a given row this is the
# fraction of rows in the partition whose Age is less than or equal to that
# row's Age, so rows tied on Age share the same value and the last row of
# every partition gets 1.0.
(
    df
    .withColumn("cume_dist", cume_dist().over(windowPartition))
    .show()
)


+-------------+---+----------+------+---------+
|Employee_Name|Age|Department|Salary|cume_dist|
+-------------+---+----------+------+---------+
|        Kunal| 25|   Finance|  3000|      0.5|
|         Jeny| 26|   Finance|  3900|      1.0|
|      Srishti| 46|Management|  3300|      1.0|
|      Kailash| 29| Marketing|  2000|      0.5|
|       Hitesh| 30| Marketing|  3000|      1.0|
|          Ram| 28|     Sales|  3000|      0.4|
|          Ram| 28|     Sales|  3000|      0.4|
|        Meena| 33|     Sales|  4600|      0.6|
|       Sharad| 39|     Sales|  4100|      0.8|
|        Robin| 40|     Sales|  4100|      1.0|
+-------------+---+----------+------+---------+



In [8]:
# lag() lives in pyspark.sql.functions
from pyspark.sql.functions import lag

# Append a "Lag" column holding the Salary of the row one position earlier
# (offset 1) in the same Department partition, ordered by Age. The first row
# of each partition has no predecessor, so its Lag is NULL.
(
    df
    .withColumn("Lag", lag("Salary", 1).over(windowPartition))
    .show()
)


+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary| Lag|
+-------------+---+----------+------+----+
|        Kunal| 25|   Finance|  3000|NULL|
|         Jeny| 26|   Finance|  3900|3000|
|      Srishti| 46|Management|  3300|NULL|
|      Kailash| 29| Marketing|  2000|NULL|
|       Hitesh| 30| Marketing|  3000|2000|
|          Ram| 28|     Sales|  3000|NULL|
|          Ram| 28|     Sales|  3000|3000|
|        Meena| 33|     Sales|  4600|3000|
|       Sharad| 39|     Sales|  4100|4600|
|        Robin| 40|     Sales|  4100|4100|
+-------------+---+----------+------+----+

