# Window Functions (PySpark)

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

In [2]:
# Obtain the SparkSession for this notebook: reuse an existing one or create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [13]:
# Read emp.csv, taking column names from the header row and letting
# Spark infer each column's type from the data.
df = spark.read.csv(
    "emp.csv",
    header=True,
    inferSchema=True,
)

In [14]:
# Preview the first 20 rows (show()'s default row limit, stated explicitly).
df.show(n=20)

+----+--------+--------+
|year|    dept|  salary|
+----+--------+--------+
|2004|      IT| 3324172|
|2004|      IT| 3324172|
|2004|Accounts| 4409580|
|2004|      HR|  211648|
|2004|   Sales|  902053|
|2004|      IT| 1005417|
|2004|Accounts|    3645|
|2005|      IT| 2974005|
|2005|      IT| 2974005|
|2005|Accounts| 4239440|
|2005|      HR|  114120|
|2005|   Sales| 1215112|
|2005|      IT|  773217|
|2005|Accounts|    3101|
|2006|      IT| 3285138|
|2006|      IT|65932248|
|2006|Accounts|    3642|
|2006|      HR|   67236|
|2006|   Sales|  836424|
|2006|      IT| 1095922|
+----+--------+--------+
only showing top 20 rows



In [15]:
# Total number of rows in df (the preview above shows only the first 20).
df.count()

119

In [16]:
# Window definition shared by the ranking cells below: one partition per
# department, rows ordered by ascending salary within each partition.
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy("salary")
)

In [17]:
# rank(): rows with equal salary share a rank, and the ranks that the ties
# "used up" are skipped afterwards (1, 1, 3, ...).
(
    df
    .withColumn("rank", F.rank().over(windowSpec))
    .show()
)

+----+-----+-------+----+
|year| dept| salary|rank|
+----+-----+-------+----+
|2015|Sales| 164874|   1|
|2016|Sales| 164874|   1|
|2014|Sales| 170952|   3|
|2019|Sales| 192922|   4|
|2020|Sales| 192922|   4|
|2018|Sales| 196338|   6|
|2017|Sales| 203907|   7|
|2010|Sales| 211866|   8|
|2013|Sales| 217037|   9|
|2012|Sales| 257114|  10|
|2011|Sales| 285572|  11|
|2007|Sales| 772173|  12|
|2008|Sales| 772173|  12|
|2009|Sales| 772173|  12|
|2006|Sales| 836424|  15|
|2004|Sales| 902053|  16|
|2005|Sales|1215112|  17|
|2017|   HR|  51834|   1|
|2018|   HR|  51834|   1|
|2020|   HR|  53407|   3|
+----+-----+-------+----+
only showing top 20 rows



In [18]:
# dense_rank(): rows with equal salary share a rank, and the next distinct
# salary gets the very next rank with no gaps (1, 1, 2, ...).
(
    df
    .withColumn("dense_rank", F.dense_rank().over(windowSpec))
    .show()
)

+----+-----+-------+----------+
|year| dept| salary|dense_rank|
+----+-----+-------+----------+
|2015|Sales| 164874|         1|
|2016|Sales| 164874|         1|
|2014|Sales| 170952|         2|
|2019|Sales| 192922|         3|
|2020|Sales| 192922|         3|
|2018|Sales| 196338|         4|
|2017|Sales| 203907|         5|
|2010|Sales| 211866|         6|
|2013|Sales| 217037|         7|
|2012|Sales| 257114|         8|
|2011|Sales| 285572|         9|
|2007|Sales| 772173|        10|
|2008|Sales| 772173|        10|
|2009|Sales| 772173|        10|
|2006|Sales| 836424|        11|
|2004|Sales| 902053|        12|
|2005|Sales|1215112|        13|
|2017|   HR|  51834|         1|
|2018|   HR|  51834|         1|
|2020|   HR|  53407|         2|
+----+-----+-------+----------+
only showing top 20 rows



In [19]:
# percent_rank(): relative rank within the partition, computed as
# (rank - 1) / (rows in partition - 1), so it runs from 0.0 to 1.0.
(
    df
    .withColumn("percent_rank", F.percent_rank().over(windowSpec))
    .show()
)

+----+-----+-------+------------+
|year| dept| salary|percent_rank|
+----+-----+-------+------------+
|2015|Sales| 164874|         0.0|
|2016|Sales| 164874|         0.0|
|2014|Sales| 170952|       0.125|
|2019|Sales| 192922|      0.1875|
|2020|Sales| 192922|      0.1875|
|2018|Sales| 196338|      0.3125|
|2017|Sales| 203907|       0.375|
|2010|Sales| 211866|      0.4375|
|2013|Sales| 217037|         0.5|
|2012|Sales| 257114|      0.5625|
|2011|Sales| 285572|       0.625|
|2007|Sales| 772173|      0.6875|
|2008|Sales| 772173|      0.6875|
|2009|Sales| 772173|      0.6875|
|2006|Sales| 836424|       0.875|
|2004|Sales| 902053|      0.9375|
|2005|Sales|1215112|         1.0|
|2017|   HR|  51834|         0.0|
|2018|   HR|  51834|         0.0|
|2020|   HR|  53407|       0.125|
+----+-----+-------+------------+
only showing top 20 rows

