In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Column names, listed in the same order as the fields of each tuple in `data`.
cols = ["id", "name", "dept", "salary"]

# Small employee sample. Alice and Bob share the same HR salary, which
# produces a tie whenever rows are ordered by salary within a department.
data = [
    (1, "Alice", "HR", 60000),
    (2, "Bob", "HR", 60000),
    (3, "Charlie", "HR", 50000),
    (4, "David", "IT", 70000),
    (5, "Eve", "IT", 65000),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(data, schema=cols)
df.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 09:55:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/16 09:55:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/16 09:55:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/01/16 09:55:08 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
                                                                                

+---+-------+----+------+
| id|   name|dept|salary|
+---+-------+----+------+
|  1|  Alice|  HR| 60000|
|  2|    Bob|  HR| 60000|
|  3|Charlie|  HR| 50000|
|  4|  David|  IT| 70000|
|  5|    Eve|  IT| 65000|
+---+-------+----+------+



In [4]:
from pyspark.sql.window import Window

In [5]:
# Shared window spec: one partition per department, highest salary first.
# Salary is the only sort key, so equal salaries within a department are ties.
w = (
    Window
    .partitionBy("dept")
    .orderBy(df["salary"].desc())
)

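Window functions are applied with `.over(window_spec)`. Ranking functions (`row_number`, `rank`, `dense_rank`) and offset functions (`lag`, `lead`) require the window to define an ordering; partitioning is optional, and without it all rows are treated as a single partition.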

In [6]:
# Salary alone is not a total order here (Alice and Bob both earn 60000 in HR),
# so row_number() over `w` may number the tied rows in either order from one
# run to the next. Adding the unique `id` as a secondary sort key makes the
# numbering deterministic and reproducible.
w_row_num = Window.partitionBy("dept").orderBy(df["salary"].desc(), df["id"])
df.withColumn("row_num", row_number().over(w_row_num)).show()

+---+-------+----+------+-------+
| id|   name|dept|salary|row_num|
+---+-------+----+------+-------+
|  1|  Alice|  HR| 60000|      1|
|  2|    Bob|  HR| 60000|      2|
|  3|Charlie|  HR| 50000|      3|
|  4|  David|  IT| 70000|      1|
|  5|    Eve|  IT| 65000|      2|
+---+-------+----+------+-------+



row_number

Definition: Assigns a unique sequential number to rows within a partition.

Interview points:

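Always unique within each partition

No ties in the output: rows with equal ordering values still receive different numbers, in an arbitrary order unless the window ordering includes a tie-breaker column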

Used for deduplication / top-N problems

rank

Definition: Assigns a rank to each row within a partition. Tied rows share the same rank, and the ranks that follow are skipped, leaving gaps (e.g. 1, 1, 3).

In [7]:
# Keep every existing column and append the rank computed over `w`.
# Rows tied on salary get the same rank, and the next rank is skipped.
df.select("*", rank().over(w).alias("rank")).show()

+---+-------+----+------+----+
| id|   name|dept|salary|rank|
+---+-------+----+------+----+
|  1|  Alice|  HR| 60000|   1|
|  2|    Bob|  HR| 60000|   1|
|  3|Charlie|  HR| 50000|   3|
|  4|  David|  IT| 70000|   1|
|  5|    Eve|  IT| 65000|   2|
+---+-------+----+------+----+



dense_rank

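Definition: Assigns a rank to each row within a partition. Tied rows share the same rank, and the next distinct value gets the next consecutive rank, with no gaps (e.g. 1, 1, 2).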

In [8]:
# Append the dense rank computed over `w`: rows tied on salary share a rank
# and the following rank is not skipped.
(
    df
    .withColumn("dense_rank", dense_rank().over(w))
    .show()
)

+---+-------+----+------+----------+
| id|   name|dept|salary|dense_rank|
+---+-------+----+------+----------+
|  1|  Alice|  HR| 60000|         1|
|  2|    Bob|  HR| 60000|         1|
|  3|Charlie|  HR| 50000|         2|
|  4|  David|  IT| 70000|         1|
|  5|    Eve|  IT| 65000|         2|
+---+-------+----+------+----------+



lag

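Definition: Fetches the value from the previous row (by default, one row back) in the window's ordering within the same partition; returns NULL when there is no previous row.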

In [9]:
# lag() depends on the exact row order inside each partition. Ordering by
# salary alone leaves Alice and Bob (both 60000) tied, so which of them is
# "first" and gets NULL would be arbitrary. The unique `id` breaks the tie so
# the result is the same on every run.
w_lag = Window.partitionBy("dept").orderBy(df["salary"].desc(), df["id"])

# Offset 1 = the immediately preceding row; the first row of each partition
# has no predecessor and gets NULL.
df.withColumn("prev_salary", lag("salary", 1).over(w_lag)).show()

+---+-------+----+------+-----------+
| id|   name|dept|salary|prev_salary|
+---+-------+----+------+-----------+
|  1|  Alice|  HR| 60000|       NULL|
|  2|    Bob|  HR| 60000|      60000|
|  3|Charlie|  HR| 50000|      60000|
|  4|  David|  IT| 70000|       NULL|
|  5|    Eve|  IT| 65000|      70000|
+---+-------+----+------+-----------+




lead

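Definition: Fetches the value from the next row (by default, one row ahead) in the window's ordering within the same partition; returns NULL when there is no next row.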

In [10]:
# lead() depends on the exact row order inside each partition. Ordering by
# salary alone leaves Alice and Bob (both 60000) tied, so which of them sees
# 50000 as the "next" salary would be arbitrary. The unique `id` breaks the
# tie so the result is the same on every run.
w_lead = Window.partitionBy("dept").orderBy(df["salary"].desc(), df["id"])

# Offset 1 = the immediately following row; the last row of each partition
# has no successor and gets NULL.
df.withColumn("next_salary", lead("salary", 1).over(w_lead)).show()

+---+-------+----+------+-----------+
| id|   name|dept|salary|next_salary|
+---+-------+----+------+-----------+
|  1|  Alice|  HR| 60000|      60000|
|  2|    Bob|  HR| 60000|      50000|
|  3|Charlie|  HR| 50000|       NULL|
|  4|  David|  IT| 70000|      65000|
|  5|    Eve|  IT| 65000|       NULL|
+---+-------+----+------+-----------+

