In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
config = SparkConf()
# config.set("property", "value")
config.setMaster("local[4]").setAppName("WindowFunction")

from pyspark.sql import SparkSession
# spark Session, entry point for Spark SQL, DataFrame
spark = SparkSession.builder\
                    .config(conf=config)\
                    .getOrCreate()

sc = spark.sparkContext

22/03/08 19:50:36 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/08 19:50:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/08 19:50:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 19:50:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/08 19:50:38 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/03/08 19:50:38 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/03/08 19:50:38 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22

In [4]:
data = [ ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
   ]

empDf = spark.createDataFrame(data=data, schema=['name', 'dept', 'salary'])
empDf.printSchema()
empDf.show()

empDf.rdd.getNumPartitions()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)

+-------+---------+------+
|   name|     dept|salary|
+-------+---------+------+
|  James|    Sales|  3000|
|Michael|    Sales|  4600|
| Robert|    Sales|  4100|
|  Maria|  Finance|  3000|
|  James|    Sales|  3000|
|  Scott|  Finance|  3300|
|    Jen|  Finance|  3900|
|   Jeff|Marketing|  3000|
|  Kumar|Marketing|  2000|
|   Saif|    Sales|  4100|
+-------+---------+------+



4

In [6]:
empDf.rdd.glom().collect()

[[Row(name='James', dept='Sales', salary=3000),
  Row(name='Michael', dept='Sales', salary=4600)],
 [Row(name='Robert', dept='Sales', salary=4100),
  Row(name='Maria', dept='Finance', salary=3000)],
 [Row(name='James', dept='Sales', salary=3000),
  Row(name='Scott', dept='Finance', salary=3300)],
 [Row(name='Jen', dept='Finance', salary=3900),
  Row(name='Jeff', dept='Marketing', salary=3000),
  Row(name='Kumar', dept='Marketing', salary=2000),
  Row(name='Saif', dept='Sales', salary=4100)]]

In [7]:
empDf.write.option("header", True)\
        .partitionBy("dept")\
        .csv("/home/ubuntu/employees")

                                                                                

In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("dept").orderBy("salary")

df = empDf.withColumn("row_number", row_number().over(windowSpec))

df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- row_number: integer (nullable = true)



                                                                                

+-------+---------+------+----------+
|   name|     dept|salary|row_number|
+-------+---------+------+----------+
|  James|    Sales|  3000|         1|
|  James|    Sales|  3000|         2|
| Robert|    Sales|  4100|         3|
|   Saif|    Sales|  4100|         4|
|Michael|    Sales|  4600|         5|
|  Maria|  Finance|  3000|         1|
|  Scott|  Finance|  3300|         2|
|    Jen|  Finance|  3900|         3|
|  Kumar|Marketing|  2000|         1|
|   Jeff|Marketing|  3000|         2|
+-------+---------+------+----------+



In [10]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

windowSpec = Window.partitionBy("dept").orderBy("salary")

df =empDf.withColumn("rank", rank().over(windowSpec))
#spec =rank().over(windowSpec)

df.show()

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|  James|    Sales|  3000|   1|
|  James|    Sales|  3000|   1|
| Robert|    Sales|  4100|   3|
|   Saif|    Sales|  4100|   3|
|Michael|    Sales|  4600|   5|
|  Maria|  Finance|  3000|   1|
|  Scott|  Finance|  3300|   2|
|    Jen|  Finance|  3900|   3|
|  Kumar|Marketing|  2000|   1|
|   Jeff|Marketing|  3000|   2|
+-------+---------+------+----+



In [13]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, desc

windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df =empDf.withColumn("rank", dense_rank().over(windowSpec))
df.show()

                                                                                

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600|   1|
| Robert|    Sales|  4100|   2|
|   Saif|    Sales|  4100|   2|
|  James|    Sales|  3000|   3|
|  James|    Sales|  3000|   3|
|    Jen|  Finance|  3900|   1|
|  Scott|  Finance|  3300|   2|
|  Maria|  Finance|  3000|   3|
|   Jeff|Marketing|  3000|   1|
|  Kumar|Marketing|  2000|   2|
+-------+---------+------+----+



In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank, desc

windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df =empDf.withColumn("rank", percent_rank().over(windowSpec))
df.show()

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600| 0.0|
| Robert|    Sales|  4100|0.25|
|   Saif|    Sales|  4100|0.25|
|  James|    Sales|  3000|0.75|
|  James|    Sales|  3000|0.75|
|    Jen|  Finance|  3900| 0.0|
|  Scott|  Finance|  3300| 0.5|
|  Maria|  Finance|  3000| 1.0|
|   Jeff|Marketing|  3000| 0.0|
|  Kumar|Marketing|  2000| 1.0|
+-------+---------+------+----+



In [15]:
from pyspark.sql.window import Window
from pyspark.sql.functions import ntile, desc

windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df =empDf.withColumn("rank", ntile(5).over(windowSpec))
df.show()

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600|   1|
| Robert|    Sales|  4100|   2|
|   Saif|    Sales|  4100|   3|
|  James|    Sales|  3000|   4|
|  James|    Sales|  3000|   5|
|    Jen|  Finance|  3900|   1|
|  Scott|  Finance|  3300|   2|
|  Maria|  Finance|  3000|   3|
|   Jeff|Marketing|  3000|   1|
|  Kumar|Marketing|  2000|   2|
+-------+---------+------+----+



In [16]:
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist, desc

windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df =empDf.withColumn("cume_dist", cume_dist().over(windowSpec))
df.show()

+-------+---------+------+------------------+
|   name|     dept|salary|         cume_dist|
+-------+---------+------+------------------+
|Michael|    Sales|  4600|               0.2|
| Robert|    Sales|  4100|               0.6|
|   Saif|    Sales|  4100|               0.6|
|  James|    Sales|  3000|               1.0|
|  James|    Sales|  3000|               1.0|
|    Jen|  Finance|  3900|0.3333333333333333|
|  Scott|  Finance|  3300|0.6666666666666666|
|  Maria|  Finance|  3000|               1.0|
|   Jeff|Marketing|  3000|               0.5|
|  Kumar|Marketing|  2000|               1.0|
+-------+---------+------+------------------+



In [17]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, desc

windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df =empDf.withColumn("lag", lag("salary",1).over(windowSpec))
df.show()

+-------+---------+------+----+
|   name|     dept|salary| lag|
+-------+---------+------+----+
|Michael|    Sales|  4600|null|
| Robert|    Sales|  4100|4600|
|   Saif|    Sales|  4100|4100|
|  James|    Sales|  3000|4100|
|  James|    Sales|  3000|3000|
|    Jen|  Finance|  3900|null|
|  Scott|  Finance|  3300|3900|
|  Maria|  Finance|  3000|3300|
|   Jeff|Marketing|  3000|null|
|  Kumar|Marketing|  2000|3000|
+-------+---------+------+----+



In [19]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead, desc

windowSpec = Window.partitionBy("dept").orderBy("salary")

df =empDf.withColumn("lead", lead("salary",1).over(windowSpec))
df.show()

+-------+---------+------+----+
|   name|     dept|salary|lead|
+-------+---------+------+----+
|  James|    Sales|  3000|3000|
|  James|    Sales|  3000|4100|
| Robert|    Sales|  4100|4100|
|   Saif|    Sales|  4100|4600|
|Michael|    Sales|  4600|null|
|  Maria|  Finance|  3000|3300|
|  Scott|  Finance|  3300|3900|
|    Jen|  Finance|  3900|null|
|  Kumar|Marketing|  2000|3000|
|   Jeff|Marketing|  3000|null|
+-------+---------+------+----+



In [20]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, sum, min, max, count, col

windowSpec = Window.partitionBy("dept")

df =empDf.withColumn("min", min(col("salary")).over(windowSpec))\
            .withColumn("max", max(col("salary")).over(windowSpec))\
            .withColumn("avg", avg(col("salary")).over(windowSpec))\
            .withColumn("count", count(col("salary")).over(windowSpec))\
            .withColumn("sum", sum(col("salary")).over(windowSpec))\

df.show()

                                                                                

+-------+---------+------+----+----+------+-----+-----+
|   name|     dept|salary| min| max|   avg|count|  sum|
+-------+---------+------+----+----+------+-----+-----+
|  James|    Sales|  3000|3000|4600|3760.0|    5|18800|
|Michael|    Sales|  4600|3000|4600|3760.0|    5|18800|
| Robert|    Sales|  4100|3000|4600|3760.0|    5|18800|
|  James|    Sales|  3000|3000|4600|3760.0|    5|18800|
|   Saif|    Sales|  4100|3000|4600|3760.0|    5|18800|
|  Maria|  Finance|  3000|3000|3900|3400.0|    3|10200|
|  Scott|  Finance|  3300|3000|3900|3400.0|    3|10200|
|    Jen|  Finance|  3900|3000|3900|3400.0|    3|10200|
|   Jeff|Marketing|  3000|2000|3000|2500.0|    2| 5000|
|  Kumar|Marketing|  2000|2000|3000|2500.0|    2| 5000|
+-------+---------+------+----+----+------+-----+-----+



In [22]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, sum, min, max, count, col
# FIXME
windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

df = empDf.drop("name")\
          .withColumn("row_number",  row_number().over(windowSpec))\
          .withColumn("min", min(col("salary")).over(windowSpec))\
          .withColumn("max", max(col("salary")).over(windowSpec))\
          .withColumn("avg", avg(col("salary")).over(windowSpec))\
          .withColumn("count", count(col("salary")).over(windowSpec))\
          .withColumn("sum", sum(col("salary")).over(windowSpec))\
          

df.show()

                                                                                

+---------+------+----------+----+----+-----------------+-----+-----+
|     dept|salary|row_number| min| max|              avg|count|  sum|
+---------+------+----------+----+----+-----------------+-----+-----+
|    Sales|  4600|         1|4600|4600|           4600.0|    1| 4600|
|    Sales|  4100|         2|4100|4600|4266.666666666667|    3|12800|
|    Sales|  4100|         3|4100|4600|4266.666666666667|    3|12800|
|    Sales|  3000|         4|3000|4600|           3760.0|    5|18800|
|    Sales|  3000|         5|3000|4600|           3760.0|    5|18800|
|  Finance|  3900|         1|3900|3900|           3900.0|    1| 3900|
|  Finance|  3300|         2|3300|3900|           3600.0|    2| 7200|
|  Finance|  3000|         3|3000|3900|           3400.0|    3|10200|
|Marketing|  3000|         1|3000|3000|           3000.0|    1| 3000|
|Marketing|  2000|         2|2000|3000|           2500.0|    2| 5000|
+---------+------+----------+----+----+-----------------+-----+-----+

