In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

data = [("Anderson","Sales","NY",90000),
    ("Kenedy","Sales","CA",86000),
    ("Kenny","Sales","CA",86000),
    ("Billy","Sales","NY",81000),
    ("Andy","Finance","CA",90000),
    ("Mary","Finance","NY",99000),
    ("Eduardo","Finance","NY",83000),
    ("Mendes","Finance","CA",79000),
    ("Keyth","Marketing","CA",80000),
    ("Truman","Marketing","NY",91000)
  ]

schema = ["name","dep_name","state","salary"]
df = spark.createDataFrame(data=data, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- dep_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)

+--------+---------+-----+------+
|name    |dep_name |state|salary|
+--------+---------+-----+------+
|Anderson|Sales    |NY   |90000 |
|Kenedy  |Sales    |CA   |86000 |
|Kenny   |Sales    |CA   |86000 |
|Billy   |Sales    |NY   |81000 |
|Andy    |Finance  |CA   |90000 |
|Mary    |Finance  |NY   |99000 |
|Eduardo |Finance  |NY   |83000 |
|Mendes  |Finance  |CA   |79000 |
|Keyth   |Marketing|CA   |80000 |
|Truman  |Marketing|NY   |91000 |
+--------+---------+-----+------+



In [0]:
w0  = Window.partitionBy(F.col("dep_name")).orderBy(F.col("salary"))
df.withColumn("row_number",F.row_number().over(w0)).show(truncate=False)

+--------+---------+-----+------+----------+
|name    |dep_name |state|salary|row_number|
+--------+---------+-----+------+----------+
|Mendes  |Finance  |CA   |79000 |1         |
|Eduardo |Finance  |NY   |83000 |2         |
|Andy    |Finance  |CA   |90000 |3         |
|Mary    |Finance  |NY   |99000 |4         |
|Keyth   |Marketing|CA   |80000 |1         |
|Truman  |Marketing|NY   |91000 |2         |
|Billy   |Sales    |NY   |81000 |1         |
|Kenedy  |Sales    |CA   |86000 |2         |
|Kenny   |Sales    |CA   |86000 |3         |
|Anderson|Sales    |NY   |90000 |4         |
+--------+---------+-----+------+----------+



In [0]:
df.withColumn("rank",F.rank().over(w0)).show(truncate=False)

+--------+---------+-----+------+----+
|name    |dep_name |state|salary|rank|
+--------+---------+-----+------+----+
|Mendes  |Finance  |CA   |79000 |1   |
|Eduardo |Finance  |NY   |83000 |2   |
|Andy    |Finance  |CA   |90000 |3   |
|Mary    |Finance  |NY   |99000 |4   |
|Keyth   |Marketing|CA   |80000 |1   |
|Truman  |Marketing|NY   |91000 |2   |
|Billy   |Sales    |NY   |81000 |1   |
|Kenedy  |Sales    |CA   |86000 |2   |
|Kenny   |Sales    |CA   |86000 |2   |
|Anderson|Sales    |NY   |90000 |4   |
+--------+---------+-----+------+----+



In [0]:
df.withColumn("dense_rank",F.dense_rank().over(w0)).show(truncate=False)

+--------+---------+-----+------+----------+
|name    |dep_name |state|salary|dense_rank|
+--------+---------+-----+------+----------+
|Mendes  |Finance  |CA   |79000 |1         |
|Eduardo |Finance  |NY   |83000 |2         |
|Andy    |Finance  |CA   |90000 |3         |
|Mary    |Finance  |NY   |99000 |4         |
|Keyth   |Marketing|CA   |80000 |1         |
|Truman  |Marketing|NY   |91000 |2         |
|Billy   |Sales    |NY   |81000 |1         |
|Kenedy  |Sales    |CA   |86000 |2         |
|Kenny   |Sales    |CA   |86000 |2         |
|Anderson|Sales    |NY   |90000 |3         |
+--------+---------+-----+------+----------+



In [0]:
df.withColumn("percent_rank",F.percent_rank().over(w0)).show(truncate=False)

+--------+---------+-----+------+------------------+
|name    |dep_name |state|salary|percent_rank      |
+--------+---------+-----+------+------------------+
|Mendes  |Finance  |CA   |79000 |0.0               |
|Eduardo |Finance  |NY   |83000 |0.3333333333333333|
|Andy    |Finance  |CA   |90000 |0.6666666666666666|
|Mary    |Finance  |NY   |99000 |1.0               |
|Keyth   |Marketing|CA   |80000 |0.0               |
|Truman  |Marketing|NY   |91000 |1.0               |
|Billy   |Sales    |NY   |81000 |0.0               |
|Kenedy  |Sales    |CA   |86000 |0.3333333333333333|
|Kenny   |Sales    |CA   |86000 |0.3333333333333333|
|Anderson|Sales    |NY   |90000 |1.0               |
+--------+---------+-----+------+------------------+



In [0]:
df.withColumn("ntile",F.ntile(2).over(w0)).show(truncate=False)

+--------+---------+-----+------+-----+
|name    |dep_name |state|salary|ntile|
+--------+---------+-----+------+-----+
|Mendes  |Finance  |CA   |79000 |1    |
|Eduardo |Finance  |NY   |83000 |1    |
|Andy    |Finance  |CA   |90000 |2    |
|Mary    |Finance  |NY   |99000 |2    |
|Keyth   |Marketing|CA   |80000 |1    |
|Truman  |Marketing|NY   |91000 |2    |
|Billy   |Sales    |NY   |81000 |1    |
|Kenedy  |Sales    |CA   |86000 |1    |
|Kenny   |Sales    |CA   |86000 |2    |
|Anderson|Sales    |NY   |90000 |2    |
+--------+---------+-----+------+-----+



In [0]:
df.withColumn("cume_dist",F.cume_dist().over(w0)).show(truncate=False)

+--------+---------+-----+------+---------+
|name    |dep_name |state|salary|cume_dist|
+--------+---------+-----+------+---------+
|Mendes  |Finance  |CA   |79000 |0.25     |
|Eduardo |Finance  |NY   |83000 |0.5      |
|Andy    |Finance  |CA   |90000 |0.75     |
|Mary    |Finance  |NY   |99000 |1.0      |
|Keyth   |Marketing|CA   |80000 |0.5      |
|Truman  |Marketing|NY   |91000 |1.0      |
|Billy   |Sales    |NY   |81000 |0.25     |
|Kenedy  |Sales    |CA   |86000 |0.75     |
|Kenny   |Sales    |CA   |86000 |0.75     |
|Anderson|Sales    |NY   |90000 |1.0      |
+--------+---------+-----+------+---------+



In [0]:
df.withColumn("lag",F.lag("salary",1).over(w0)).show(truncate=False)

+--------+---------+-----+------+-----+
|name    |dep_name |state|salary|lag  |
+--------+---------+-----+------+-----+
|Mendes  |Finance  |CA   |79000 |null |
|Eduardo |Finance  |NY   |83000 |79000|
|Andy    |Finance  |CA   |90000 |83000|
|Mary    |Finance  |NY   |99000 |90000|
|Keyth   |Marketing|CA   |80000 |null |
|Truman  |Marketing|NY   |91000 |80000|
|Billy   |Sales    |NY   |81000 |null |
|Kenedy  |Sales    |CA   |86000 |81000|
|Kenny   |Sales    |CA   |86000 |86000|
|Anderson|Sales    |NY   |90000 |86000|
+--------+---------+-----+------+-----+



In [0]:
df.withColumn("lead",F.lead("salary",2).over(w0)).show(truncate=False)

+--------+---------+-----+------+-----+
|name    |dep_name |state|salary|lead |
+--------+---------+-----+------+-----+
|Mendes  |Finance  |CA   |79000 |90000|
|Eduardo |Finance  |NY   |83000 |99000|
|Andy    |Finance  |CA   |90000 |null |
|Mary    |Finance  |NY   |99000 |null |
|Keyth   |Marketing|CA   |80000 |null |
|Truman  |Marketing|NY   |91000 |null |
|Billy   |Sales    |NY   |81000 |86000|
|Kenedy  |Sales    |CA   |86000 |90000|
|Kenny   |Sales    |CA   |86000 |null |
|Anderson|Sales    |NY   |90000 |null |
+--------+---------+-----+------+-----+



In [0]:
w01 = Window.partitionBy(F.col("dep_name")).orderBy(F.col("salary"))

df.withColumn("row",F.row_number().over(w01)) \
  .withColumn("avg", F.avg(F.col("salary")).over(w01)) \
  .withColumn("sum", F.sum(F.col("salary")).over(w01)) \
  .withColumn("min", F.min(F.col("salary")).over(w01)) \
  .withColumn("max", F.max(F.col("salary")).over(w01)) \
  .where(F.col("row")==1).select("dep_name","avg","sum","min","max") \
  .show(truncate=False)

+---------+-------+-----+-----+-----+
|dep_name |avg    |sum  |min  |max  |
+---------+-------+-----+-----+-----+
|Finance  |79000.0|79000|79000|79000|
|Marketing|80000.0|80000|80000|80000|
|Sales    |81000.0|81000|81000|81000|
+---------+-------+-----+-----+-----+

