In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
# Create the notebook's Spark session, or reuse one that already exists,
# with master 'local[*]' and application name 'MySparkApp'.
spark = (
    SparkSession.builder
    .appName('MySparkApp')
    .master('local[*]')
    .getOrCreate()
)
# Bare last expression so the cell echoes the Spark version in use.
spark.version

'4.0.1'

In [4]:
# Sample rows, one tuple per record. The cell that builds the DataFrame names
# the three positions id, month and salary. Rows are grouped by month below;
# the order is the same as the original listing.
data = [
    (1, 1, 20), (2, 1, 20),               # month 1
    (1, 2, 30), (2, 2, 30), (3, 2, 40),   # month 2
    (1, 3, 40), (3, 3, 60),               # month 3
    (1, 4, 60), (3, 4, 70),               # month 4
]

In [5]:
# Turn the in-memory rows into a DataFrame whose columns are named
# id, month and salary, then print it.
employee_df = spark.createDataFrame(
    data,
    schema=['id', 'month', 'salary'],
)
employee_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+-----+------+
| id|month|salary|
+---+-----+------+
|  1|    1|    20|
|  2|    1|    20|
|  1|    2|    30|
|  2|    2|    30|
|  3|    2|    40|
|  1|    3|    40|
|  3|    3|    60|
|  1|    4|    60|
|  3|    4|    70|
+---+-----+------+



                                                                                

In [7]:
# Cumulative salary per id, accumulated in ascending month order.
# The frame is written out explicitly; it is the same growing frame Spark uses
# by default once a window has an ordering (partition start -> current row).
# Note: the total therefore covers every earlier month of the id, not a
# fixed-length trailing window.
w = (
    Window.partitionBy('id')
    .orderBy('month')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
# `sum` here is pyspark.sql.functions.sum (brought in by the wildcard import),
# not the Python builtin.
running_amount_col = sum('salary').over(w)
employee_df = employee_df.withColumn('running_amount', running_amount_col)
employee_df.show()

+---+-----+------+--------------+
| id|month|salary|running_amount|
+---+-----+------+--------------+
|  1|    1|    20|            20|
|  1|    2|    30|            50|
|  1|    3|    40|            90|
|  1|    4|    60|           150|
|  2|    1|    20|            20|
|  2|    2|    30|            50|
|  3|    2|    40|            40|
|  3|    3|    60|           100|
|  3|    4|    70|           170|
+---+-----+------+--------------+



In [8]:
# Number each id's rows from the latest month (rnk 1) back to the earliest.
# The ordering key is month itself rather than running_amount: the running
# total only follows month order while every salary is strictly positive.
# A zero salary makes two rows tie on running_amount (rank() then gives both
# the same rnk), and a negative salary reverses their order.
# NOTE(review): assumes rnk is meant to express recency, since rnk 1 is
# dropped afterwards -- confirm. For the sample data the result is unchanged.
w = Window.partitionBy('id').orderBy(col('month').desc())
employee_df = employee_df.withColumn('rnk', rank().over(w))
employee_df.show()

+---+-----+------+--------------+---+
| id|month|salary|running_amount|rnk|
+---+-----+------+--------------+---+
|  1|    4|    60|           150|  1|
|  1|    3|    40|            90|  2|
|  1|    2|    30|            50|  3|
|  1|    1|    20|            20|  4|
|  2|    2|    30|            50|  1|
|  2|    1|    20|            20|  2|
|  3|    4|    70|           170|  1|
|  3|    3|    60|           100|  2|
|  3|    2|    40|            40|  3|
+---+-----+------+--------------+---+



In [9]:
# Keep only rows whose rnk is not 1 and is at most 4, i.e. drop the top-ranked
# row of every id and retain ranks 2 through 4.
is_not_top = col('rnk') != 1
within_limit = col('rnk') <= 4
employee_df = employee_df.filter(is_not_top & within_limit)
employee_df.show()

+---+-----+------+--------------+---+
| id|month|salary|running_amount|rnk|
+---+-----+------+--------------+---+
|  1|    3|    40|            90|  2|
|  1|    2|    30|            50|  3|
|  1|    1|    20|            20|  4|
|  2|    1|    20|            20|  2|
|  3|    3|    60|           100|  2|
|  3|    2|    40|            40|  3|
+---+-----+------+--------------+---+



In [10]:
# Final projection: keep id and month, and expose the cumulative
# running_amount under the column name 'salary'.
employee_df = employee_df.select(
    'id',
    'month',
    col('running_amount').alias('salary'),
)
employee_df.show()

+---+-----+------+
| id|month|salary|
+---+-----+------+
|  1|    3|    90|
|  1|    2|    50|
|  1|    1|    20|
|  2|    1|    20|
|  3|    3|   100|
|  3|    2|    40|
+---+-----+------+

