# Window Functions (PySpark)

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

In [2]:
# Reuse the active SparkSession if one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Load the employee CSV: the first line is the header, and column types are
# inferred by Spark from the data rather than declared up front.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("emp.csv")
)

In [4]:
# Preview the first 20 rows (the defaults of show(), spelled out explicitly).
df.show(n=20, truncate=True)

+----+--------+--------+
|year|    dept|  salary|
+----+--------+--------+
|2004|      IT| 3324172|
|2004|      IT| 3324172|
|2004|Accounts| 4409580|
|2004|      HR|  211648|
|2004|   Sales|  902053|
|2004|      IT| 1005417|
|2004|Accounts|    3645|
|2005|      IT| 2974005|
|2005|      IT| 2974005|
|2005|Accounts| 4239440|
|2005|      HR|  114120|
|2005|   Sales| 1215112|
|2005|      IT|  773217|
|2005|Accounts|    3101|
|2006|      IT| 3285138|
|2006|      IT|65932248|
|2006|Accounts|    3642|
|2006|      HR|   67236|
|2006|   Sales|  836424|
|2006|      IT| 1095922|
+----+--------+--------+
only showing top 20 rows



In [5]:
# Total number of rows in the loaded DataFrame.
df.count()

119

In [6]:
# One window per department, rows ordered by ascending salary.
# NOTE: salary is not unique within a department, so functions that depend on
# an exact row order (row_number, ntile) see ties in arbitrary order.
windowSpec = (
    Window
    .partitionBy(F.col("dept"))
    .orderBy(F.col("salary"))
)

In [10]:
# Number the rows of each department by ascending salary.
# Salaries repeat within a department, and row_number() hands out numbers to
# tied rows in arbitrary order, so the result could differ between runs.
# orderBy() on an existing WindowSpec keeps its partitioning and replaces the
# ordering; adding "year" as a secondary key makes the numbering reproducible.
df.withColumn(
    "row_number",
    F.row_number().over(windowSpec.orderBy("salary", "year")),
).show()

+----+-----+-------+----------+
|year| dept| salary|row_number|
+----+-----+-------+----------+
|2015|Sales| 164874|         1|
|2016|Sales| 164874|         2|
|2014|Sales| 170952|         3|
|2019|Sales| 192922|         4|
|2020|Sales| 192922|         5|
|2018|Sales| 196338|         6|
|2017|Sales| 203907|         7|
|2010|Sales| 211866|         8|
|2013|Sales| 217037|         9|
|2012|Sales| 257114|        10|
|2011|Sales| 285572|        11|
|2007|Sales| 772173|        12|
|2008|Sales| 772173|        13|
|2009|Sales| 772173|        14|
|2006|Sales| 836424|        15|
|2004|Sales| 902053|        16|
|2005|Sales|1215112|        17|
|2017|   HR|  51834|         1|
|2018|   HR|  51834|         2|
|2020|   HR|  53407|         3|
+----+-----+-------+----------+
only showing top 20 rows



In [8]:
# Split each department into 2 salary buckets (lower half -> 1, upper -> 2).
# ntile() assigns buckets by row position, so tied salaries that straddle the
# bucket boundary would be split arbitrarily. orderBy() on an existing
# WindowSpec keeps its partitioning and replaces the ordering; adding "year"
# as a secondary key makes the bucket assignment reproducible.
df.withColumn(
    "ntile",
    F.ntile(2).over(windowSpec.orderBy("salary", "year")),
).show()

+----+-----+-------+-----+
|year| dept| salary|ntile|
+----+-----+-------+-----+
|2015|Sales| 164874|    1|
|2016|Sales| 164874|    1|
|2014|Sales| 170952|    1|
|2019|Sales| 192922|    1|
|2020|Sales| 192922|    1|
|2018|Sales| 196338|    1|
|2017|Sales| 203907|    1|
|2010|Sales| 211866|    1|
|2013|Sales| 217037|    1|
|2012|Sales| 257114|    2|
|2011|Sales| 285572|    2|
|2007|Sales| 772173|    2|
|2008|Sales| 772173|    2|
|2009|Sales| 772173|    2|
|2006|Sales| 836424|    2|
|2004|Sales| 902053|    2|
|2005|Sales|1215112|    2|
|2017|   HR|  51834|    1|
|2018|   HR|  51834|    1|
|2020|   HR|  53407|    1|
+----+-----+-------+-----+
only showing top 20 rows



In [9]:
# Cumulative distribution of salary within each department: the fraction of
# rows in the partition whose salary is <= the current row's salary (tied
# salaries therefore share the same value).
df.select(
    "*",
    F.cume_dist().over(windowSpec).alias("cume_dist"),
).show()

+----+-----+-------+-------------------+
|year| dept| salary|          cume_dist|
+----+-----+-------+-------------------+
|2015|Sales| 164874|0.11764705882352941|
|2016|Sales| 164874|0.11764705882352941|
|2014|Sales| 170952|0.17647058823529413|
|2019|Sales| 192922|0.29411764705882354|
|2020|Sales| 192922|0.29411764705882354|
|2018|Sales| 196338|0.35294117647058826|
|2017|Sales| 203907| 0.4117647058823529|
|2010|Sales| 211866|0.47058823529411764|
|2013|Sales| 217037| 0.5294117647058824|
|2012|Sales| 257114| 0.5882352941176471|
|2011|Sales| 285572| 0.6470588235294118|
|2007|Sales| 772173| 0.8235294117647058|
|2008|Sales| 772173| 0.8235294117647058|
|2009|Sales| 772173| 0.8235294117647058|
|2006|Sales| 836424| 0.8823529411764706|
|2004|Sales| 902053| 0.9411764705882353|
|2005|Sales|1215112|                1.0|
|2017|   HR|  51834|0.11764705882352941|
|2018|   HR|  51834|0.11764705882352941|
|2020|   HR|  53407|0.17647058823529413|
+----+-----+-------+-------------------+
only showing top