<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/14_spark_sql_window_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark Window Functions**
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#window-functions

In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise create one registered
# under the 'spark-functions' application name.
session_builder = SparkSession.builder.appName('spark-functions')
spark = session_builder.getOrCreate()

In [None]:
from pyspark.sql.functions import to_date, desc, rank,col,dense_rank,expr
from pyspark.sql.types import StructType,StructField, IntegerType, StringType

In [None]:

# Monthly sales facts, grouped by month for readability and then flattened
# into (month, salesperson, department, sales) tuples in the same order.
# Frank's February row has no sales figure so later cells can show how
# NULLs are ordered inside a window.
monthly_sales = {
    "2023-01-01": [
        ("Alice", "Electronics", 1000),
        ("Bob", "Electronics", 1200),
        ("Charlie", "Clothing", 800),
        ("Diana", "Electronics", 1500),
        ("Eve", "Clothing", 600),
    ],
    "2023-02-01": [
        ("Alice", "Electronics", 1100),
        ("Bob", "Electronics", 900),
        ("Charlie", "Clothing", 950),
        ("Diana", "Electronics", 1300),
        ("Eve", "Clothing", 700),
        ("Frank", "Electronics", None),
    ],
    "2023-03-01": [
        ("Alice", "Electronics", 1400),
        ("Bob", "Electronics", 1000),
        ("Charlie", "Clothing", 1200),
        ("Diana", "Electronics", 1600),
    ],
}
data = [
    (month, person, dept, amount)
    for month, rows in monthly_sales.items()
    for person, dept, amount in rows
]

# Every column is nullable; `mn` is loaded as text and parsed into a date below.
schema = StructType(
    [
        StructField("mn", StringType(), True),
        StructField("salesperson", StringType(), True),
        StructField("department", StringType(), True),
        StructField("sales", IntegerType(), True),
    ]
)

sales_raw = spark.createDataFrame(data, schema)
dataframe = sales_raw.withColumn('mn', to_date(col('mn'), 'yyyy-MM-dd'))
dataframe.printSchema()
dataframe.show(truncate=False)

root
 |-- mn: date (nullable = true)
 |-- salesperson: string (nullable = true)
 |-- department: string (nullable = true)
 |-- sales: integer (nullable = true)

+----------+-----------+-----------+-----+
|mn        |salesperson|department |sales|
+----------+-----------+-----------+-----+
|2023-01-01|Alice      |Electronics|1000 |
|2023-01-01|Bob        |Electronics|1200 |
|2023-01-01|Charlie    |Clothing   |800  |
|2023-01-01|Diana      |Electronics|1500 |
|2023-01-01|Eve        |Clothing   |600  |
|2023-02-01|Alice      |Electronics|1100 |
|2023-02-01|Bob        |Electronics|900  |
|2023-02-01|Charlie    |Clothing   |950  |
|2023-02-01|Diana      |Electronics|1300 |
|2023-02-01|Eve        |Clothing   |700  |
|2023-02-01|Frank      |Electronics|NULL |
|2023-03-01|Alice      |Electronics|1400 |
|2023-03-01|Bob        |Electronics|1000 |
|2023-03-01|Charlie    |Clothing   |1200 |
|2023-03-01|Diana      |Electronics|1600 |
+----------+-----------+-----------+-----+



In [None]:
# Expose the sales frame to Spark SQL under the name used by the SQL cells below.
dataframe.createOrReplaceTempView("dataframe_view")

In [None]:
# rank() - Spark SQL
#
# Rank sales inside each department from highest to lowest (NULL sales are
# ordered after every real value) and keep only the rows ranked first.

rank_query = '''
with cte as
(
  select *, rank() over(partition by department order by sales desc NULLS LAST) as rn
  from dataframe_view
)
select *
from cte
where rn = 1
'''
top_ranked = spark.sql(rank_query)
top_ranked.show(truncate=False)

+----------+-----------+-----------+-----+---+
|mn        |salesperson|department |sales|rn |
+----------+-----------+-----------+-----+---+
|2023-03-01|Charlie    |Clothing   |1200 |1  |
|2023-03-01|Diana      |Electronics|1600 |1  |
+----------+-----------+-----------+-----+---+



In [None]:
from pyspark.sql.window import Window

In [None]:
# rank() - PySpark DataFrame API
#
# Same query as the SQL cell: rank sales per department, highest first,
# then keep the rows whose rank is 1.

rank_window = Window.partitionBy('department').orderBy(desc('sales'))
ranked = dataframe.withColumn('rn', rank().over(rank_window))
ranked.filter(col('rn') == 1).show()

+----------+-----------+-----------+-----+---+
|        mn|salesperson| department|sales| rn|
+----------+-----------+-----------+-----+---+
|2023-03-01|    Charlie|   Clothing| 1200|  1|
|2023-03-01|      Diana|Electronics| 1600|  1|
+----------+-----------+-----------+-----+---+



In [None]:
# dense_rank() - Spark SQL
#
# dense_rank() leaves no gaps after ties, so `rn = 2` selects the
# second-highest distinct sales value of each department.

dense_rank_query = '''
with cte as
(
  select *, dense_rank() over(partition by department order by sales desc NULLS LAST) as rn
  from dataframe_view
)
select *
from cte
where rn = 2
'''
second_highest = spark.sql(dense_rank_query)
second_highest.show(truncate=False)

+----------+-----------+-----------+-----+---+
|mn        |salesperson|department |sales|rn |
+----------+-----------+-----------+-----+---+
|2023-02-01|Charlie    |Clothing   |950  |2  |
|2023-01-01|Diana      |Electronics|1500 |2  |
+----------+-----------+-----------+-----+---+



In [None]:
# dense_rank() - PySpark DataFrame API
#
# Mirrors the SQL cell: second-highest distinct sales value per department,
# with NULL sales explicitly ordered last.

from pyspark.sql.functions import dense_rank, desc, desc_nulls_last, expr
from pyspark.sql.window import Window

dense_window = Window.partitionBy('department').orderBy(desc_nulls_last('sales'))
dense_ranked = dataframe.withColumn('dnsrk', dense_rank().over(dense_window))
dense_ranked.filter(col('dnsrk') == 2).show(truncate=False)

+----------+-----------+-----------+-----+-----+
|mn        |salesperson|department |sales|dnsrk|
+----------+-----------+-----------+-----+-----+
|2023-02-01|Charlie    |Clothing   |950  |2    |
|2023-01-01|Diana      |Electronics|1500 |2    |
+----------+-----------+-----------+-----+-----+



In [None]:
# row_number() - Spark SQL
#
# Number the rows of each department by descending sales (NULLs last) and
# keep the first row of every department.

row_number_query = '''
with cte as
(
  select *, row_number() over(partition by department order by sales desc NULLS LAST) as rn
  from dataframe_view
)
select *
from cte
where rn = 1
'''
first_rows = spark.sql(row_number_query)
first_rows.show(truncate=False)

+----------+-----------+-----------+-----+---+
|mn        |salesperson|department |sales|rn |
+----------+-----------+-----------+-----+---+
|2023-03-01|Charlie    |Clothing   |1200 |1  |
|2023-03-01|Diana      |Electronics|1600 |1  |
+----------+-----------+-----------+-----+---+



In [None]:
# row_number() - PySpark DataFrame API
#
# Two equivalent spellings of the same query:
#   1. a Window spec built with the DataFrame API, and
#   2. the same window written as a SQL fragment inside expr().
# Both keep the first row of each department by descending sales (NULLs last).

# `expr` (SQL expression parser) is what this cell uses; the original import
# listed `exp` (the exponential function) by mistake.
from pyspark.sql.functions import row_number, desc_nulls_last, col, expr
from pyspark.sql.window import Window

win = Window.partitionBy('department').orderBy(desc_nulls_last('sales'))
dataframe.withColumn('rwNum',row_number().over(win)).filter(expr('rwNum = 1')).show(truncate = False)

dataframe.withColumn('rwNum',expr('row_number() over(partition by department order by sales desc nulls last)')).filter(expr('rwNum = 1')).show(truncate = False)

+----------+-----------+-----------+-----+-----+
|mn        |salesperson|department |sales|rwNum|
+----------+-----------+-----------+-----+-----+
|2023-03-01|Charlie    |Clothing   |1200 |1    |
|2023-03-01|Diana      |Electronics|1600 |1    |
+----------+-----------+-----------+-----+-----+

+----------+-----------+-----------+-----+-----+
|mn        |salesperson|department |sales|rwNum|
+----------+-----------+-----------+-----+-----+
|2023-03-01|Charlie    |Clothing   |1200 |1    |
|2023-03-01|Diana      |Electronics|1600 |1    |
+----------+-----------+-----------+-----+-----+



In [None]:
# percent_rank() and cume_dist() - Spark SQL
#
# Both functions are evaluated over the same window: rows of a department
# ordered by descending sales with NULLs last.

distribution_query = '''
select *,
    percent_rank() over(partition by department order by sales desc nulls last) as perRank,
    cume_dist() over(partition by department order by sales desc nulls last) as CumDist
from dataframe_view
'''
distribution = spark.sql(distribution_query)
distribution.show(truncate=False)

+----------+-----------+-----------+-----+------------------+-------+
|mn        |salesperson|department |sales|perRank           |CumDist|
+----------+-----------+-----------+-----+------------------+-------+
|2023-03-01|Charlie    |Clothing   |1200 |0.0               |0.2    |
|2023-02-01|Charlie    |Clothing   |950  |0.25              |0.4    |
|2023-01-01|Charlie    |Clothing   |800  |0.5               |0.6    |
|2023-02-01|Eve        |Clothing   |700  |0.75              |0.8    |
|2023-01-01|Eve        |Clothing   |600  |1.0               |1.0    |
|2023-03-01|Diana      |Electronics|1600 |0.0               |0.1    |
|2023-01-01|Diana      |Electronics|1500 |0.1111111111111111|0.2    |
|2023-03-01|Alice      |Electronics|1400 |0.2222222222222222|0.3    |
|2023-02-01|Diana      |Electronics|1300 |0.3333333333333333|0.4    |
|2023-01-01|Bob        |Electronics|1200 |0.4444444444444444|0.5    |
|2023-02-01|Alice      |Electronics|1100 |0.5555555555555556|0.6    |
|2023-01-01|Alice   

In [None]:
# cume_dist() vs percent_rank() - DataFrame API with SQL fragments in expr()
#
# Here NULL sales are ordered FIRST, so Frank's NULL row leads the
# Electronics partition. Results are rounded to two decimals.

# NOTE: this `round` is pyspark's column function and shadows Python's
# builtin `round` from this cell onwards.
from pyspark.sql.functions import expr, round

cume_dist_col = expr('cume_dist() over(partition by department order by sales desc nulls first)')
percent_rank_col = expr('percent_rank() over(partition by department order by sales desc nulls first)')

with_cume_dist = dataframe.withColumn('cumDist', round(cume_dist_col.cast('double'), 2))
with_both = with_cume_dist.withColumn('percRank', round(percent_rank_col.cast('double'), 2))
with_both.show(truncate=False)

+----------+-----------+-----------+-----+-------+--------+
|mn        |salesperson|department |sales|cumDist|percRank|
+----------+-----------+-----------+-----+-------+--------+
|2023-03-01|Charlie    |Clothing   |1200 |0.2    |0.0     |
|2023-02-01|Charlie    |Clothing   |950  |0.4    |0.25    |
|2023-01-01|Charlie    |Clothing   |800  |0.6    |0.5     |
|2023-02-01|Eve        |Clothing   |700  |0.8    |0.75    |
|2023-01-01|Eve        |Clothing   |600  |1.0    |1.0     |
|2023-02-01|Frank      |Electronics|NULL |0.1    |0.0     |
|2023-03-01|Diana      |Electronics|1600 |0.2    |0.11    |
|2023-01-01|Diana      |Electronics|1500 |0.3    |0.22    |
|2023-03-01|Alice      |Electronics|1400 |0.4    |0.33    |
|2023-02-01|Diana      |Electronics|1300 |0.5    |0.44    |
|2023-01-01|Bob        |Electronics|1200 |0.6    |0.56    |
|2023-02-01|Alice      |Electronics|1100 |0.7    |0.67    |
|2023-01-01|Alice      |Electronics|1000 |0.9    |0.78    |
|2023-03-01|Bob        |Electronics|1000

In [None]:
# percent_rank() : relative standing of a row within its partition
# cume_dist()    : cumulative distribution of a row within its partition
#
# Pure PySpark DataFrame API version (no SQL fragments).

from pyspark.sql.window import Window
# `round` is imported here explicitly: the cell applies it to Column objects,
# so it must be pyspark's function, not Python's builtin. Previously this only
# worked when an earlier cell had already shadowed the builtin.
from pyspark.sql.functions import percent_rank, cume_dist, desc_nulls_first, col, round

# One shared window spec: per department, highest sales first, NULLs first.
win = Window.partitionBy(col('department')).orderBy(desc_nulls_first(col('sales')))

dataframe.withColumn('cumDist',  round(cume_dist().over(win).cast('double'), 2))\
         .withColumn('percRank', round(percent_rank().over(win).cast('double'), 2))\
         .show(truncate = False)

+----------+-----------+-----------+-----+-------+--------+
|mn        |salesperson|department |sales|cumDist|percRank|
+----------+-----------+-----------+-----+-------+--------+
|2023-03-01|Charlie    |Clothing   |1200 |0.2    |0.0     |
|2023-02-01|Charlie    |Clothing   |950  |0.4    |0.25    |
|2023-01-01|Charlie    |Clothing   |800  |0.6    |0.5     |
|2023-02-01|Eve        |Clothing   |700  |0.8    |0.75    |
|2023-01-01|Eve        |Clothing   |600  |1.0    |1.0     |
|2023-02-01|Frank      |Electronics|NULL |0.1    |0.0     |
|2023-03-01|Diana      |Electronics|1600 |0.2    |0.11    |
|2023-01-01|Diana      |Electronics|1500 |0.3    |0.22    |
|2023-03-01|Alice      |Electronics|1400 |0.4    |0.33    |
|2023-02-01|Diana      |Electronics|1300 |0.5    |0.44    |
|2023-01-01|Bob        |Electronics|1200 |0.6    |0.56    |
|2023-02-01|Alice      |Electronics|1100 |0.7    |0.67    |
|2023-01-01|Alice      |Electronics|1000 |0.9    |0.78    |
|2023-03-01|Bob        |Electronics|1000

##### **CUME_DIST vs PERCENT_RANK**
---
##### Key Difference:
- **CUME_DIST** = What % of ALL rows have values ≤ **current row's value**
- **PERCENT_RANK** = What % of OTHER rows have values < **current row's value**
---
##### Quick Formulas:
- **CUME_DIST** = `(Rows with value ≤ current row) / (Total rows)`
- **PERCENT_RANK** = `(Rows with value < current row) / (Total rows - 1)`
---
##### Example: Test Scores [55, 65, 75, 85, 85, 95]
---
| Student | Score | CUME_DIST | PERCENT_RANK |
|---------|-------|-----------|--------------|
| Frank   | 55    | 0.17      | 0.0          |
| Eve     | 65    | 0.33      | 0.2          |
| Diana   | 75    | 0.50      | 0.4          |
| Charlie | 85    | 0.83      | 0.6          |
| Bob     | 85    | 0.83      | 0.6          |
| Alice   | 95    | 1.00      | 1.0          |

---
##### When to Use:

##### Use CUME_DIST:
> *"What percentile am I in?"*
- Answers: "Where do I stand in the entire group?"
- Includes yourself in the calculation
---
##### Use PERCENT_RANK:
> *"What percentage of people did I beat?"*  
- Answers: "How do I rank against others?"
- Excludes yourself from comparison
---
##### Bottom Line:
> **CUME_DIST** includes you in the count  
> **PERCENT_RANK** compares you against others

##### **`asc_nulls_first` vs `asc_nulls_last`**

| Aspect | `asc_nulls_first` | `asc_nulls_last` |
|--------|-------------------|------------------|
| **NULL Position** | NULLs at start | NULLs at end |
| **Sort Order** | NULLs → Ascending values | Ascending values → NULLs |
| **Default** | Yes | No |
---
##### **Example:**
- `asc_nulls_first`: `[NULL, NULL, 10, 25, 50]`
- `asc_nulls_last`: `[10, 25, 50, NULL, NULL]`
---
**That's it.** Both sort ascending - only NULL placement differs.

In [None]:
# asc_nulls_first : this is actually default in the order by clause
#
# The plain ascending sort and the explicit asc_nulls_first sort are shown
# side by side; both place the NULL row first.

from pyspark.sql.functions import asc_nulls_first,col
from pyspark.sql.types import StructType,StructField,IntegerType

data = [[100],[-20],[30],[19],[None],[1],[20],[40]]
schema = StructType([StructField('nums',IntegerType())])

# Use a dedicated name for the numbers frame. Binding it to `dataframe` would
# replace the sales frame that the ntile / nth_value / lead / lag cells further
# down still read, breaking a top-to-bottom run of the notebook.
dataframeNums = spark.createDataFrame(data, schema)
dataframeNums.orderBy(col('nums')).show(truncate = False)
dataframeNums.orderBy(asc_nulls_first(col('nums'))).show(truncate = False)

+----+
|nums|
+----+
|NULL|
|-20 |
|1   |
|19  |
|20  |
|30  |
|40  |
|100 |
+----+

+----+
|nums|
+----+
|NULL|
|-20 |
|1   |
|19  |
|20  |
|30  |
|40  |
|100 |
+----+



In [None]:
# asc_nulls_last : NOT the default - a plain ascending sort puts NULLs first
#
# First table: default ascending order (NULL at the top).
# Second table: asc_nulls_last moves the NULL row to the bottom.

from pyspark.sql.functions import asc_nulls_last, col
from pyspark.sql.types import IntegerType, StructField, StructType

nums = [100, -20, 30, 19, None, 1, 20, 40]
data = [[n] for n in nums]
schema = StructType([StructField('nums', IntegerType())])

dataframeNums = spark.createDataFrame(data, schema)

default_sorted = dataframeNums.orderBy(col('nums'))
default_sorted.show(truncate=False)

nulls_last_sorted = dataframeNums.orderBy(asc_nulls_last(col('nums')))
nulls_last_sorted.show(truncate=False)

+----+
|nums|
+----+
|NULL|
|-20 |
|1   |
|19  |
|20  |
|30  |
|40  |
|100 |
+----+

+----+
|nums|
+----+
|-20 |
|1   |
|19  |
|20  |
|30  |
|40  |
|100 |
|NULL|
+----+



In [None]:
# desc_nulls_first : NOT the default - a plain descending sort puts NULLs last
#
# First table: default descending order (NULL at the bottom).
# Second table: desc_nulls_first moves the NULL row to the top.

from pyspark.sql.functions import col, desc, desc_nulls_first, expr
from pyspark.sql.types import IntegerType, StructField, StructType

nums = [100, -20, 30, 19, None, 1, 20, 40]
data = [[n] for n in nums]
schema = StructType([StructField('nums', IntegerType())])

dataframeNums = spark.createDataFrame(data, schema)

default_sorted = dataframeNums.orderBy(desc(col('nums')))
default_sorted.show(truncate=False)

nulls_first_sorted = dataframeNums.orderBy(desc_nulls_first(col('nums')))
nulls_first_sorted.show(truncate=False)

+----+
|nums|
+----+
|100 |
|40  |
|30  |
|20  |
|19  |
|1   |
|-20 |
|NULL|
+----+

+----+
|nums|
+----+
|NULL|
|100 |
|40  |
|30  |
|20  |
|19  |
|1   |
|-20 |
+----+



In [None]:
# desc_nulls_last : this is actually default in the order by clause
#
# The plain descending sort and the explicit desc_nulls_last sort are shown
# side by side; both place the NULL row last.

from pyspark.sql.functions import desc_nulls_last,col, expr, desc
from pyspark.sql.types import StructType,StructField,IntegerType

data = [[100],[-20],[30],[19],[None],[1],[20],[40]]
schema = StructType([StructField('nums',IntegerType())])

# Use a dedicated name for the numbers frame. Binding it to `dataframe` would
# replace the sales frame that the ntile / nth_value / lead / lag cells below
# still read, breaking a top-to-bottom run of the notebook.
dataframeNums = spark.createDataFrame(data, schema)
dataframeNums.orderBy(desc(col('nums'))).show(truncate = False)
dataframeNums.orderBy(desc_nulls_last(col('nums'))).show(truncate = False)

+----+
|nums|
+----+
|100 |
|40  |
|30  |
|20  |
|19  |
|1   |
|-20 |
|NULL|
+----+

+----+
|nums|
+----+
|100 |
|40  |
|30  |
|20  |
|19  |
|1   |
|-20 |
|NULL|
+----+



##### **PySpark NULL Ordering Defaults:**

✅ **ASC + NULLs first** = Default behavior  
✅ **DESC + NULLs last** = Default behavior  
✅ **asc_nulls_first()** and **desc_nulls_last()** make default behavior explicit

In [None]:
# ntile : simple bucketing of the dataset - Spark SQL
#
# Split each department's rows, ordered by ascending sales, into 2 buckets.
# The bucket column is named `ntile_col` (it was previously mislabelled
# `perRank`, a leftover from the percent_rank cell). Re-run the cell to
# refresh the displayed output.

sql = '''
select *, ntile(2) over(partition by department order by sales) as ntile_col
from dataframe_view
'''
spark.sql(sql).show(truncate = False)

+----------+-----------+-----------+-----+-------+
|mn        |salesperson|department |sales|perRank|
+----------+-----------+-----------+-----+-------+
|2023-01-01|Eve        |Clothing   |600  |1      |
|2023-02-01|Eve        |Clothing   |700  |1      |
|2023-01-01|Charlie    |Clothing   |800  |1      |
|2023-02-01|Charlie    |Clothing   |950  |2      |
|2023-03-01|Charlie    |Clothing   |1200 |2      |
|2023-02-01|Frank      |Electronics|NULL |1      |
|2023-02-01|Bob        |Electronics|900  |1      |
|2023-01-01|Alice      |Electronics|1000 |1      |
|2023-03-01|Bob        |Electronics|1000 |1      |
|2023-02-01|Alice      |Electronics|1100 |1      |
|2023-01-01|Bob        |Electronics|1200 |2      |
|2023-02-01|Diana      |Electronics|1300 |2      |
|2023-03-01|Alice      |Electronics|1400 |2      |
|2023-01-01|Diana      |Electronics|1500 |2      |
|2023-03-01|Diana      |Electronics|1600 |2      |
+----------+-----------+-----------+-----+-------+



In [None]:
# ntile : simple bucketing of the dataset
#
# DataFrame API with the window written as a SQL fragment inside expr():
# split each department's rows, ordered by descending sales (NULLs last),
# into 2 buckets.

# Import the one name this cell actually uses so it does not depend on an
# earlier cell having imported `expr`.
from pyspark.sql.functions import expr

# Column name fixed from the typo `nthile_col`; re-run the cell to refresh
# the displayed output.
dataframe.withColumn('ntile_col', expr('''ntile(2) over(partition by department order by sales desc nulls last)'''))\
         .show(truncate = False)

+----------+-----------+-----------+-----+----------+
|mn        |salesperson|department |sales|nthile_col|
+----------+-----------+-----------+-----+----------+
|2023-03-01|Charlie    |Clothing   |1200 |1         |
|2023-02-01|Charlie    |Clothing   |950  |1         |
|2023-01-01|Charlie    |Clothing   |800  |1         |
|2023-02-01|Eve        |Clothing   |700  |2         |
|2023-01-01|Eve        |Clothing   |600  |2         |
|2023-03-01|Diana      |Electronics|1600 |1         |
|2023-01-01|Diana      |Electronics|1500 |1         |
|2023-03-01|Alice      |Electronics|1400 |1         |
|2023-02-01|Diana      |Electronics|1300 |1         |
|2023-01-01|Bob        |Electronics|1200 |1         |
|2023-02-01|Alice      |Electronics|1100 |2         |
|2023-01-01|Alice      |Electronics|1000 |2         |
|2023-03-01|Bob        |Electronics|1000 |2         |
|2023-02-01|Bob        |Electronics|900  |2         |
|2023-02-01|Frank      |Electronics|NULL |2         |
+----------+-----------+----

In [None]:
# ntile : simple bucketing of the dataset
#
# Pure PySpark DataFrame API: split each department's rows, ordered by
# descending sales (NULLs last), into 2 buckets.

from pyspark.sql.functions import col, desc_nulls_last, ntile
from pyspark.sql.window import Window

halves_window = Window.partitionBy('department').orderBy(desc_nulls_last(col('sales')))
bucketed = dataframe.withColumn('ntile_col', ntile(2).over(halves_window))
bucketed.show(truncate=False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|ntile_col|
+----------+-----------+-----------+-----+---------+
|2023-03-01|Charlie    |Clothing   |1200 |1        |
|2023-02-01|Charlie    |Clothing   |950  |1        |
|2023-01-01|Charlie    |Clothing   |800  |1        |
|2023-02-01|Eve        |Clothing   |700  |2        |
|2023-01-01|Eve        |Clothing   |600  |2        |
|2023-03-01|Diana      |Electronics|1600 |1        |
|2023-01-01|Diana      |Electronics|1500 |1        |
|2023-03-01|Alice      |Electronics|1400 |1        |
|2023-02-01|Diana      |Electronics|1300 |1        |
|2023-01-01|Bob        |Electronics|1200 |1        |
|2023-02-01|Alice      |Electronics|1100 |2        |
|2023-01-01|Alice      |Electronics|1000 |2        |
|2023-03-01|Bob        |Electronics|1000 |2        |
|2023-02-01|Bob        |Electronics|900  |2        |
|2023-02-01|Frank      |Electronics|NULL |2        |
+----------+-----------+-----------+-----+----

In [None]:
# nth_value - Spark SQL
#
# Second-highest sales value of each department. With an ORDER BY and no
# explicit frame, the window frame ends at the current row, so the first row
# of every partition shows NULL: a second row is not inside its frame yet.

nth_value_query = '''
select *,
nth_value(sales,2) over(partition by department order by sales desc nulls last) as nth_valueValue
from dataframe_view
'''
second_values = spark.sql(nth_value_query)
second_values.show(truncate=False)

+----------+-----------+-----------+-----+--------------+
|mn        |salesperson|department |sales|nth_valueValue|
+----------+-----------+-----------+-----+--------------+
|2023-03-01|Charlie    |Clothing   |1200 |NULL          |
|2023-02-01|Charlie    |Clothing   |950  |950           |
|2023-01-01|Charlie    |Clothing   |800  |950           |
|2023-02-01|Eve        |Clothing   |700  |950           |
|2023-01-01|Eve        |Clothing   |600  |950           |
|2023-03-01|Diana      |Electronics|1600 |NULL          |
|2023-01-01|Diana      |Electronics|1500 |1500          |
|2023-03-01|Alice      |Electronics|1400 |1500          |
|2023-02-01|Diana      |Electronics|1300 |1500          |
|2023-01-01|Bob        |Electronics|1200 |1500          |
|2023-02-01|Alice      |Electronics|1100 |1500          |
|2023-01-01|Alice      |Electronics|1000 |1500          |
|2023-03-01|Bob        |Electronics|1000 |1500          |
|2023-02-01|Bob        |Electronics|900  |1500          |
|2023-02-01|Fr

In [None]:
# nth_value
#
# DataFrame API with the window written as a SQL fragment inside expr():
# second-highest sales value per department (descending sales, NULLs last).

# Import the one name this cell actually uses so it does not depend on an
# earlier cell having imported `expr`.
from pyspark.sql.functions import expr

dataframe.withColumn('nthValue', expr('nth_value(sales,2) over(partition by department order by sales desc nulls last)'))\
         .show(truncate = False)

+----------+-----------+-----------+-----+--------+
|mn        |salesperson|department |sales|nthValue|
+----------+-----------+-----------+-----+--------+
|2023-03-01|Charlie    |Clothing   |1200 |NULL    |
|2023-02-01|Charlie    |Clothing   |950  |950     |
|2023-01-01|Charlie    |Clothing   |800  |950     |
|2023-02-01|Eve        |Clothing   |700  |950     |
|2023-01-01|Eve        |Clothing   |600  |950     |
|2023-03-01|Diana      |Electronics|1600 |NULL    |
|2023-01-01|Diana      |Electronics|1500 |1500    |
|2023-03-01|Alice      |Electronics|1400 |1500    |
|2023-02-01|Diana      |Electronics|1300 |1500    |
|2023-01-01|Bob        |Electronics|1200 |1500    |
|2023-02-01|Alice      |Electronics|1100 |1500    |
|2023-01-01|Alice      |Electronics|1000 |1500    |
|2023-03-01|Bob        |Electronics|1000 |1500    |
|2023-02-01|Bob        |Electronics|900  |1500    |
|2023-02-01|Frank      |Electronics|NULL |1500    |
+----------+-----------+-----------+-----+--------+



In [None]:
# nth_value
#
# Pure PySpark DataFrame API (no expr): second-highest sales value per
# department, ordered by descending sales with NULLs last.

from pyspark.sql.functions import col, desc_nulls_last, nth_value
from pyspark.sql.window import Window

nth_window = Window.partitionBy('department').orderBy(desc_nulls_last(col('sales')))
second_sale = nth_value(col('sales'), 2).over(nth_window)
dataframe.withColumn('nthValue', second_sale).show(truncate=False)

+----------+-----------+-----------+-----+--------+
|mn        |salesperson|department |sales|nthValue|
+----------+-----------+-----------+-----+--------+
|2023-03-01|Charlie    |Clothing   |1200 |NULL    |
|2023-02-01|Charlie    |Clothing   |950  |950     |
|2023-01-01|Charlie    |Clothing   |800  |950     |
|2023-02-01|Eve        |Clothing   |700  |950     |
|2023-01-01|Eve        |Clothing   |600  |950     |
|2023-03-01|Diana      |Electronics|1600 |NULL    |
|2023-01-01|Diana      |Electronics|1500 |1500    |
|2023-03-01|Alice      |Electronics|1400 |1500    |
|2023-02-01|Diana      |Electronics|1300 |1500    |
|2023-01-01|Bob        |Electronics|1200 |1500    |
|2023-02-01|Alice      |Electronics|1100 |1500    |
|2023-01-01|Alice      |Electronics|1000 |1500    |
|2023-03-01|Bob        |Electronics|1000 |1500    |
|2023-02-01|Bob        |Electronics|900  |1500    |
|2023-02-01|Frank      |Electronics|NULL |1500    |
+----------+-----------+-----------+-----+--------+



In [None]:
# lead - Spark SQL
#
# For every salesperson (within a department), fetch the sales value of the
# following month; the last month of each person has no successor.

lead_query = '''
select *,
lead(sales,1) over(partition by department,salesperson order by mn asc) as nextSales
from dataframe_view
'''
with_next_sales = spark.sql(lead_query)
with_next_sales.show(truncate=False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |950      |
|2023-02-01|Charlie    |Clothing   |950  |1200     |
|2023-03-01|Charlie    |Clothing   |1200 |NULL     |
|2023-01-01|Eve        |Clothing   |600  |700      |
|2023-02-01|Eve        |Clothing   |700  |NULL     |
|2023-01-01|Alice      |Electronics|1000 |1100     |
|2023-02-01|Alice      |Electronics|1100 |1400     |
|2023-03-01|Alice      |Electronics|1400 |NULL     |
|2023-01-01|Bob        |Electronics|1200 |900      |
|2023-02-01|Bob        |Electronics|900  |1000     |
|2023-03-01|Bob        |Electronics|1000 |NULL     |
|2023-01-01|Diana      |Electronics|1500 |1300     |
|2023-02-01|Diana      |Electronics|1300 |1600     |
|2023-03-01|Diana      |Electronics|1600 |NULL     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----

In [None]:
# lead
# pyspark (with expr)
#
# Next month's sales for each salesperson, with the window written as a SQL
# fragment. The final orderBy is for display only; it now includes `mn` so the
# rows of each salesperson are shown chronologically. Sorting on department
# and salesperson alone leaves the order inside each group unspecified.

dataframe.withColumn('nextSales', expr('''lead(sales,1) over(partition by department,salesperson order by mn asc)'''))\
         .orderBy(['department','salesperson','mn'])\
         .show(truncate = False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |950      |
|2023-02-01|Charlie    |Clothing   |950  |1200     |
|2023-03-01|Charlie    |Clothing   |1200 |NULL     |
|2023-01-01|Eve        |Clothing   |600  |700      |
|2023-02-01|Eve        |Clothing   |700  |NULL     |
|2023-01-01|Alice      |Electronics|1000 |1100     |
|2023-02-01|Alice      |Electronics|1100 |1400     |
|2023-03-01|Alice      |Electronics|1400 |NULL     |
|2023-01-01|Bob        |Electronics|1200 |900      |
|2023-02-01|Bob        |Electronics|900  |1000     |
|2023-03-01|Bob        |Electronics|1000 |NULL     |
|2023-01-01|Diana      |Electronics|1500 |1300     |
|2023-02-01|Diana      |Electronics|1300 |1600     |
|2023-03-01|Diana      |Electronics|1600 |NULL     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----

In [None]:
# lead
# pure PySpark DataFrame API
#
# Next month's sales for each salesperson; the result is sorted by
# department, salesperson and month for display.

from pyspark.sql.functions import asc_nulls_last, col, lead

per_person_by_month = Window.partitionBy('department', 'salesperson').orderBy(asc_nulls_last(col('mn')))
with_next_sales = dataframe.withColumn('nextSales', lead(col('sales'), 1).over(per_person_by_month))
with_next_sales.orderBy(['department', 'salesperson', 'mn']).show(truncate=False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |950      |
|2023-02-01|Charlie    |Clothing   |950  |1200     |
|2023-03-01|Charlie    |Clothing   |1200 |NULL     |
|2023-01-01|Eve        |Clothing   |600  |700      |
|2023-02-01|Eve        |Clothing   |700  |NULL     |
|2023-01-01|Alice      |Electronics|1000 |1100     |
|2023-02-01|Alice      |Electronics|1100 |1400     |
|2023-03-01|Alice      |Electronics|1400 |NULL     |
|2023-01-01|Bob        |Electronics|1200 |900      |
|2023-02-01|Bob        |Electronics|900  |1000     |
|2023-03-01|Bob        |Electronics|1000 |NULL     |
|2023-01-01|Diana      |Electronics|1500 |1300     |
|2023-02-01|Diana      |Electronics|1300 |1600     |
|2023-03-01|Diana      |Electronics|1600 |NULL     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----

In [None]:
# lag - Spark SQL
#
# For every salesperson (within a department), fetch the sales value of the
# PREVIOUS month; the first month of each person has no predecessor.
# The column is named `prevSales` (it was mislabelled `nextSales`, copied from
# the lead cell). Re-run the cell to refresh the displayed output.

sql = '''
select *,
lag(sales,1) over(partition by department,salesperson order by mn asc) as prevSales
from dataframe_view
'''
spark.sql(sql).show(truncate = False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |NULL     |
|2023-02-01|Charlie    |Clothing   |950  |800      |
|2023-03-01|Charlie    |Clothing   |1200 |950      |
|2023-01-01|Eve        |Clothing   |600  |NULL     |
|2023-02-01|Eve        |Clothing   |700  |600      |
|2023-01-01|Alice      |Electronics|1000 |NULL     |
|2023-02-01|Alice      |Electronics|1100 |1000     |
|2023-03-01|Alice      |Electronics|1400 |1100     |
|2023-01-01|Bob        |Electronics|1200 |NULL     |
|2023-02-01|Bob        |Electronics|900  |1200     |
|2023-03-01|Bob        |Electronics|1000 |900      |
|2023-01-01|Diana      |Electronics|1500 |NULL     |
|2023-02-01|Diana      |Electronics|1300 |1500     |
|2023-03-01|Diana      |Electronics|1600 |1300     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----

In [None]:
# lag
# pyspark (with expr)
#
# Previous month's sales for each salesperson, with the window written as a
# SQL fragment. The column is named `prevSales` (it was mislabelled
# `nextSales`, copied from the lead cell). The display sort includes `mn` so
# each salesperson's rows appear chronologically. Re-run the cell to refresh
# the displayed output.

dataframe.withColumn('prevSales', expr('''lag(sales,1) over(partition by department,salesperson order by mn asc)'''))\
         .orderBy(['department','salesperson','mn'])\
         .show(truncate = False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |NULL     |
|2023-02-01|Charlie    |Clothing   |950  |800      |
|2023-03-01|Charlie    |Clothing   |1200 |950      |
|2023-01-01|Eve        |Clothing   |600  |NULL     |
|2023-02-01|Eve        |Clothing   |700  |600      |
|2023-01-01|Alice      |Electronics|1000 |NULL     |
|2023-02-01|Alice      |Electronics|1100 |1000     |
|2023-03-01|Alice      |Electronics|1400 |1100     |
|2023-01-01|Bob        |Electronics|1200 |NULL     |
|2023-02-01|Bob        |Electronics|900  |1200     |
|2023-03-01|Bob        |Electronics|1000 |900      |
|2023-01-01|Diana      |Electronics|1500 |NULL     |
|2023-02-01|Diana      |Electronics|1300 |1500     |
|2023-03-01|Diana      |Electronics|1600 |1300     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----

In [None]:
# lag
# pure PySpark DataFrame API
#
# Previous month's sales for each salesperson. The column is named
# `prevSales` (it was mislabelled `nextSales`, copied from the lead cell).
# Re-run the cell to refresh the displayed output.

from pyspark.sql.functions import lag,col, asc_nulls_last

dataframe.withColumn('prevSales',lag(col('sales'),1).over(Window.partitionBy('department','salesperson').orderBy(asc_nulls_last(col('mn')))))\
         .orderBy(['department','salesperson','mn'])\
         .show(truncate = False)

+----------+-----------+-----------+-----+---------+
|mn        |salesperson|department |sales|nextSales|
+----------+-----------+-----------+-----+---------+
|2023-01-01|Charlie    |Clothing   |800  |NULL     |
|2023-02-01|Charlie    |Clothing   |950  |800      |
|2023-03-01|Charlie    |Clothing   |1200 |950      |
|2023-01-01|Eve        |Clothing   |600  |NULL     |
|2023-02-01|Eve        |Clothing   |700  |600      |
|2023-01-01|Alice      |Electronics|1000 |NULL     |
|2023-02-01|Alice      |Electronics|1100 |1000     |
|2023-03-01|Alice      |Electronics|1400 |1100     |
|2023-01-01|Bob        |Electronics|1200 |NULL     |
|2023-02-01|Bob        |Electronics|900  |1200     |
|2023-03-01|Bob        |Electronics|1000 |900      |
|2023-01-01|Diana      |Electronics|1500 |NULL     |
|2023-02-01|Diana      |Electronics|1300 |1500     |
|2023-03-01|Diana      |Electronics|1600 |1300     |
|2023-02-01|Frank      |Electronics|NULL |NULL     |
+----------+-----------+-----------+-----+----