In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName('window functions')
    .master('local[*]')
    .getOrCreate()
)

In [3]:
# Read the HR core dataset: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
emp_df = spark.read.csv(
    'file:///home/jovyan/work/HR-Dataset/core_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Expose the DataFrame to Spark SQL under the name used by the queries below.
emp_df.createOrReplaceTempView(name='emp_tbl')

In [23]:
# Rank employees by pay within each department and show the first rows.
# dense_rank() gives tied pay rates the same rank without leaving gaps
# (see the 1, 1, 2, 2, 3 run in the output below).
# The three commented-out SQL lines are company-wide variants (no PARTITION BY)
# of rank / dense_rank / row_number kept for comparison.
spark.sql('''
select `Employee Name`, `Employee Number`, `Pay Rate`, Department,
-- rank() over (order by `Pay Rate` desc) as pay_rank,
-- dense_rank() over (order by `Pay Rate` desc) as pay_dense_rank,
-- row_number() over (order by `Pay Rate` desc) as row_num,
dense_rank() over (partition by department order by `Pay Rate` desc) as dept_rank
from emp_tbl
order by department, `Pay Rate` desc
''').show()

+--------------------+---------------+--------+----------------+---------+
|       Employee Name|Employee Number|Pay Rate|      Department|dept_rank|
+--------------------+---------------+--------+----------------+---------+
|                NULL|           NULL|    NULL|            NULL|        1|
| LeBlanc, Brandon  R|     1102024115|    55.0|   Admin Offices|        1|
|         Quinn, Sean|     1206043417|    55.0|   Admin Offices|        1|
|   Boutwell, Bonalyn|     1307060188|   34.95|   Admin Offices|        2|
|   Foster-Baker, Amy|     1201031308|   34.95|   Admin Offices|        2|
|    Steans, Tyrone  |     1302053333|    29.0|   Admin Offices|        3|
|          Brown, Mia|     1103024456|    28.5|   Admin Offices|        4|
|LaRotonda, William  |     1106026572|    23.0|   Admin Offices|        5|
|     Howard, Estelle|     1211050782|    21.5|   Admin Offices|        6|
|    Smith, Leigh Ann|      711007713|    20.5|   Admin Offices|        7|
|         Singh, Nan |   

In [24]:
from pyspark.sql.window import Window

In [38]:
# Company-wide window: every row competes in one ordering, highest pay first.
window1 = Window.orderBy(
    col('Pay Rate').desc()
)

# Per-department window: the same ordering, restarted for each Department value.
window2 = (
    Window
    .partitionBy('Department')
    .orderBy(col('Pay Rate').desc())
)

In [40]:
# Attach three rankings to every employee and display the first rows:
#   dense_pay_rank       - company-wide dense rank (ties share a rank, no gaps)
#   pay_rank             - company-wide rank (ties share a rank, gaps follow)
#   dept_dense_pay_rank  - dense rank restarted inside each department
(
    emp_df
    .withColumn('dense_pay_rank', dense_rank().over(window1))
    .withColumn('pay_rank', rank().over(window1))
    .withColumn('dept_dense_pay_rank', dense_rank().over(window2))
    .select(
        'Employee Name',
        'Department',
        'Pay Rate',
        'pay_rank',
        'dense_pay_rank',
        'dept_dense_pay_rank',
    )
    .show()
)

+--------------------+----------------+--------+--------+--------------+-------------------+
|       Employee Name|      Department|Pay Rate|pay_rank|dense_pay_rank|dept_dense_pay_rank|
+--------------------+----------------+--------+--------+--------------+-------------------+
|                NULL|            NULL|    NULL|     302|            92|                  1|
| LeBlanc, Brandon  R|   Admin Offices|    55.0|      27|            18|                  1|
|         Quinn, Sean|   Admin Offices|    55.0|      27|            18|                  1|
|   Boutwell, Bonalyn|   Admin Offices|   34.95|      89|            48|                  2|
|   Foster-Baker, Amy|   Admin Offices|   34.95|      89|            48|                  2|
|    Steans, Tyrone  |   Admin Offices|    29.0|      96|            53|                  3|
|          Brown, Mia|   Admin Offices|    28.5|     107|            56|                  4|
|LaRotonda, William  |   Admin Offices|    23.0|     159|            6

In [42]:
# Show the top earner(s) of each department: rank pay inside the department,
# then keep only the rows ranked first (ties all appear).
# NOTE(review): the output below includes a NULL department and two
# "Software Engineer..." rows, so Department values presumably differ by
# whitespace or blank source rows - confirm and clean the data if so.
(
    emp_df
    .withColumn('dept_dense_pay_rank', dense_rank().over(window2))
    .select('Employee Name', 'Department', 'Pay Rate', 'dept_dense_pay_rank')
    .filter(col('dept_dense_pay_rank') == 1)
    .show()
)

+-------------------+--------------------+--------+-------------------+
|      Employee Name|          Department|Pay Rate|dept_dense_pay_rank|
+-------------------+--------------------+--------+-------------------+
|               NULL|                NULL|    NULL|                  1|
|LeBlanc, Brandon  R|       Admin Offices|    55.0|                  1|
|        Quinn, Sean|       Admin Offices|    55.0|                  1|
|        King, Janet|    Executive Office|    80.0|                  1|
|   Zamora, Jennifer|               IT/IS|    65.0|                  1|
|        Foss, Jason|               IT/IS|    65.0|                  1|
|    Bramante, Elisa|   Production       |    60.0|                  1|
|Kampew, Donysha    |               Sales|   60.25|                  1|
|  Del Bosque, Keyla|Software Engineering|   57.12|                  1|
|   Sweetwater, Alex|Software Engineer...|    27.0|                  1|
+-------------------+--------------------+--------+-------------

In [49]:
# Compare each employee's pay with neighbouring rows and keep a running total.
#   last    - lag() over the company-wide descending pay order, i.e. the
#             Pay Rate of the previous row in that order (next higher or equal).
#   lead    - lead() over the same order, i.e. the Pay Rate of the following row.
#   cum_sum - sum() per department, accumulated in ascending pay order.
# NOTE(review): the sum window declares no frame; per the Spark SQL docs the
# default with ORDER BY is RANGE UNBOUNDED PRECEDING .. CURRENT ROW, so rows
# with equal Pay Rate would share one cum_sum value - confirm that is intended.
# The final ORDER BY is commented out, so the displayed row order is whatever
# the engine produces.
spark.sql('''
select `Employee Name`, `Employee Number`, `Pay Rate`, Department,
lag(`Pay Rate`) over (order by `Pay Rate` desc) as last,
lead(`Pay Rate`) over (order by `Pay Rate` desc) as lead,
sum(`Pay Rate`) over (partition by department order by `Pay Rate`) cum_sum
from emp_tbl
-- order by department, `Pay Rate` desc
''').show()

+--------------------+---------------+--------+----------------+-----+-----+------------------+
|       Employee Name|Employee Number|Pay Rate|      Department| last| lead|           cum_sum|
+--------------------+---------------+--------+----------------+-----+-----+------------------+
|                NULL|           NULL|    NULL|            NULL| 14.0| NULL|              NULL|
|         Singh, Nan |     1307059817|   16.56|   Admin Offices|16.75| 16.0|             16.56|
|    Smith, Leigh Ann|      711007713|    20.5|   Admin Offices| 21.0| 20.0|             37.06|
|     Howard, Estelle|     1211050782|    21.5|   Admin Offices| 22.0|21.25|             58.56|
|LaRotonda, William  |     1106026572|    23.0|   Admin Offices| 23.5| 23.0|             81.56|
|          Brown, Mia|     1103024456|    28.5|   Admin Offices|28.75| 28.0|            110.06|
|    Steans, Tyrone  |     1302053333|    29.0|   Admin Offices| 30.2| 29.0|            139.06|
|   Boutwell, Bonalyn|     1307060188|  