In [1]:
from pyspark.sql import SparkSession

import getpass
# Current OS login name; it is embedded in the warehouse path and the app name below.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session with master 'yarn'.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

### Define the window specification

```python
from pyspark.sql.window import Window

# Defines partitioning specification and ordering specification.
windowSpec = \
  Window \
    .partitionBy(...) \
    .orderBy(...)

# Defines a Window Specification with a ROW frame.
windowSpec.rowsBetween(start, end)

# Defines a Window Specification with a RANGE frame.
windowSpec.rangeBetween(start, end)
```

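Here, the frame type is either ROWS (a ROW frame, created with `rowsBetween`) or RANGE (a RANGE frame, created with `rangeBetween`).

`start` can be any of `UNBOUNDED PRECEDING`, `CURRENT ROW`, `<value> PRECEDING`, and `<value> FOLLOWING`; `end` can be any of `UNBOUNDED FOLLOWING`, `CURRENT ROW`, `<value> PRECEDING`, and `<value> FOLLOWING`. In PySpark these boundaries are written as `Window.unboundedPreceding`, `Window.unboundedFollowing`, `Window.currentRow`, or an integer offset (negative for preceding, positive for following).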

In [2]:
# Sample employee records; each tuple is (deptName, empID, salary).
employeeList = [
    ("sales", 1, 5000),
    ("personnel", 2, 3900),
    ("sales", 3, 4800),
    ("sales", 4, 4800),
    ("personnel", 5, 3500),
    ("develop", 7, 4200),
    ("develop", 8, 6000),
    ("develop", 9, 4500),
    ("develop", 10, 5200),
    ("develop", 11, 5200),
]

In [3]:
# Turn the employee tuples into a DataFrame with named columns.
empDf = spark.createDataFrame(
    employeeList,
    schema=["deptName", "empID", "salary"],
)

In [4]:
# Print the employee DataFrame as a text table to check what was loaded.
empDf.show()

+---------+-----+------+
| deptName|empID|salary|
+---------+-----+------+
|    sales|    1|  5000|
|personnel|    2|  3900|
|    sales|    3|  4800|
|    sales|    4|  4800|
|personnel|    5|  3500|
|  develop|    7|  4200|
|  develop|    8|  6000|
|  develop|    9|  4500|
|  develop|   10|  5200|
|  develop|   11|  5200|
+---------+-----+------+



In [5]:
from pyspark.sql.window import Window

### Get dept wise avg salary

___Using normal groupBy___

In [6]:
# Average salary per department with a plain groupBy aggregation:
# the result has one row per deptName.
avgSalDf = (
    empDf
    .groupBy("deptName")
    .avg("salary")
)

avgSalDf.show()

+---------+-----------------+
| deptName|      avg(salary)|
+---------+-----------------+
|  develop|           5020.0|
|    sales|4866.666666666667|
|personnel|           3700.0|
+---------+-----------------+



___Using window aggregation___

In [7]:
from pyspark.sql.functions import *
# Same per-department average, computed as a window aggregate. Every employee
# row gets its department's average attached, so after projecting to
# (deptName, avg sal) the repeated rows are collapsed with distinct().
deptWindow = Window.partitionBy("deptName")
avgSalDf2 = (
    empDf
    .withColumn("avg sal", avg("salary").over(deptWindow))
    .select("deptName", "avg sal")
    .distinct()
)

avgSalDf2.show()

+---------+-----------------+
| deptName|          avg sal|
+---------+-----------------+
|  develop|           5020.0|
|    sales|4866.666666666667|
|personnel|           3700.0|
+---------+-----------------+



### Top N per Group
___Top N per Group is useful when you need to compute, for example, the first and second best-sellers in each category.___

In [8]:
# Sample product records; each tuple is (product, category, revenue).
prdList = [
    ("Thin", "cell phone", 6000),
    ("Normal", "tablet", 1500),
    ("Mini", "tablet", 5500),
    ("Ultra thin", "cell phone", 5000),
    ("Very thin", "cell phone", 6000),
    ("Big", "tablet", 2500),
    ("Bendable", "cell phone", 3000),
    ("Foldable", "cell phone", 3000),
    ("Pro", "tablet", 4500),
    ("Pro2", "tablet", 6500),
]

In [9]:
# Turn the product tuples into a DataFrame with named columns, then preview it.
prdDf = spark.createDataFrame(
    prdList,
    schema=["product", "category", "revenue"],
)

prdDf.show()

+----------+----------+-------+
|   product|  category|revenue|
+----------+----------+-------+
|      Thin|cell phone|   6000|
|    Normal|    tablet|   1500|
|      Mini|    tablet|   5500|
|Ultra thin|cell phone|   5000|
| Very thin|cell phone|   6000|
|       Big|    tablet|   2500|
|  Bendable|cell phone|   3000|
|  Foldable|cell phone|   3000|
|       Pro|    tablet|   4500|
|      Pro2|    tablet|   6500|
+----------+----------+-------+



___What are the best-selling and the second best-selling products in every category?___

In [10]:
# One partition per category, rows ordered from highest to lowest revenue.
best_selling_window = (
    Window
    .partitionBy("category")
    .orderBy(col("revenue").desc())
)

In [11]:
# Keep products whose dense rank within their category is 1 or 2. Ties share
# a rank, so a category can contribute more than two rows. The helper "rank"
# column is dropped before display.
(
    prdDf
    .withColumn("rank", dense_rank().over(best_selling_window))
    .filter(col("rank").isin(1, 2))
    .drop("rank")
    .show()
)

+----------+----------+-------+
|   product|  category|revenue|
+----------+----------+-------+
|      Pro2|    tablet|   6500|
|      Mini|    tablet|   5500|
|      Thin|cell phone|   6000|
| Very thin|cell phone|   6000|
|Ultra thin|cell phone|   5000|
+----------+----------+-------+



In [12]:
# Dense rank of each product inside its category, exposed under a temporary
# column name so it can be filtered on and then discarded.
bestSellingRank = dense_rank().over(best_selling_window).alias("best selling product")

# Only the top-ranked product(s) of every category survive the filter.
(
    prdDf
    .select("product", "category", "revenue", bestSellingRank)
    .filter(col("best selling product") == 1)
    .drop("best selling product")
    .show()
)

+---------+----------+-------+
|  product|  category|revenue|
+---------+----------+-------+
|     Pro2|    tablet|   6500|
|     Thin|cell phone|   6000|
|Very thin|cell phone|   6000|
+---------+----------+-------+



___What is the difference between the revenue of each product and the revenue of the best-selling product in the same category of that product?___

In [13]:
# Revenue gap between each product and the best seller of its category.
#
# The frame is pinned to the whole partition so that the aggregate is the
# category-wide maximum by construction. An ordered window with no explicit
# frame only aggregates from the start of the partition up to the current row;
# that happened to give the right answer before solely because
# best_selling_window sorts by revenue in descending order.
#
# NOTE: `max` here is pyspark.sql.functions.max (brought in by the star
# import), not the Python builtin.
whole_category_window = best_selling_window.rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)
revenue_difference = max(prdDf["revenue"]).over(whole_category_window) - prdDf["revenue"]
prdDf.select("product", "category", "revenue", revenue_difference.alias("difference in revenue")).show()

+----------+----------+-------+---------------------+
|   product|  category|revenue|difference in revenue|
+----------+----------+-------+---------------------+
|      Pro2|    tablet|   6500|                    0|
|      Mini|    tablet|   5500|                 1000|
|       Pro|    tablet|   4500|                 2000|
|       Big|    tablet|   2500|                 4000|
|    Normal|    tablet|   1500|                 5000|
|      Thin|cell phone|   6000|                    0|
| Very thin|cell phone|   6000|                    0|
|Ultra thin|cell phone|   5000|                 1000|
|  Bendable|cell phone|   3000|                 3000|
|  Foldable|cell phone|   3000|                 3000|
+----------+----------+-------+---------------------+



### Creating boundary for the window

There are five types of boundaries, which are:
`UNBOUNDED PRECEDING, UNBOUNDED FOLLOWING, CURRENT ROW, <value> PRECEDING, and <value> FOLLOWING`

There are two types of frames: the `ROW` frame and the `RANGE` frame.

If `CURRENT ROW` is used as a boundary, it represents the current input row. `<value> PRECEDING` and `<value> FOLLOWING` describe the number of rows that appear before and after the current input row, respectively.

### RANGE frame

RANGE frames are based on logical offsets from the position of the current input row.

A logical offset is the difference between the value of the ordering expression of the current input row and the value of that same expression of the boundary row of the frame.

Because of this definition, when a RANGE frame is used, only a single ordering expression is allowed. Also, for a RANGE frame, all rows that have the same ordering-expression value as the current input row are treated as the same row as far as the boundary calculation is concerned.

<img src="RangePartition/r1.png">
<img src="RangePartition/r2.png">
<img src="RangePartition/r3.png">
<img src="RangePartition/r4.png">
<img src="RangePartition/r5.png">

In [14]:
# Re-display the product DataFrame for reference before the frame examples below.
prdDf.show()

+----------+----------+-------+
|   product|  category|revenue|
+----------+----------+-------+
|      Thin|cell phone|   6000|
|    Normal|    tablet|   1500|
|      Mini|    tablet|   5500|
|Ultra thin|cell phone|   5000|
| Very thin|cell phone|   6000|
|       Big|    tablet|   2500|
|  Bendable|cell phone|   3000|
|  Foldable|cell phone|   3000|
|       Pro|    tablet|   4500|
|      Pro2|    tablet|   6500|
+----------+----------+-------+



In [18]:
# Both frames share the same partitioning (category) and ordering (revenue, ascending).
revenueOrderedWindow = Window.partitionBy("category").orderBy("revenue")

# RANGE frame: logical offsets on the ordering value, i.e. rows whose revenue
# lies between (current revenue + 1000) and (current revenue + 3000).
rangeWindow = revenueOrderedWindow.rangeBetween(1000, 3000)

# ROW frame: physical offsets, i.e. the next row and the one after it.
rowWindow = revenueOrderedWindow.rowsBetween(1, 2)

In [20]:
# Sum revenue over each frame so the RANGE and ROW results can be compared
# side by side. `sum` is pyspark.sql.functions.sum (star import), not the builtin.
rangeRevenue = sum("revenue").over(rangeWindow)
rowRevenue = sum("revenue").over(rowWindow)

(
    prdDf
    .withColumn("range revenue", rangeRevenue)
    .withColumn("row revenue", rowRevenue)
    .show()
)

+----------+----------+-------+-------------+-----------+
|   product|  category|revenue|range revenue|row revenue|
+----------+----------+-------+-------------+-----------+
|    Normal|    tablet|   1500|         7000|       7000|
|       Big|    tablet|   2500|        10000|      10000|
|       Pro|    tablet|   4500|        12000|      12000|
|      Mini|    tablet|   5500|         6500|       6500|
|      Pro2|    tablet|   6500|         null|       null|
|  Bendable|cell phone|   3000|        17000|       8000|
|  Foldable|cell phone|   3000|        17000|      11000|
|Ultra thin|cell phone|   5000|        12000|      12000|
|      Thin|cell phone|   6000|         null|       6000|
| Very thin|cell phone|   6000|         null|       null|
+----------+----------+-------+-------------+-----------+

