## Section 13 SQL Windowing functions

In [1]:
import org.apache.spark.sql.SparkSession

// Build (or reuse) the Hive-enabled SparkSession that every cell below runs against.
val spark = SparkSession.
    builder().
    appName("Spark SQL - Windowing Functions").
    master("yarn").
    config("spark.ui.port", 0).
    config("spark.sql.warehouse.dir", "/user/itversity/warehouse").
    enableHiveSupport().
    getOrCreate()

spark = org.apache.spark.sql.SparkSession@3ed274


org.apache.spark.sql.SparkSession@3ed274

In [3]:
import sys.process._

// Capture the JVM's `user.name` system property (the account running this kernel).
// NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
val username = System.getProperty("user.name")

username = itversity


itversity

In [4]:
%%sql

-- Set the number of shuffle partitions used by Spark SQL to 2 for this session.
SET spark.sql.shuffle.partitions=2

Waiting for a Spark session to start...

+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.shuffle...|    2|
+--------------------+-----+



### 149 Prepare HR Database 

In [6]:
%%sql

-- Start from a clean slate; CASCADE also removes any tables still inside the database.
DROP DATABASE IF EXISTS itversity_hr CASCADE

++
||
++
++



In [7]:
%%sql

-- Create the HR database that the following cells populate and query.
CREATE DATABASE itversity_hr

++
||
++
++



In [8]:
%%sql

-- Make itversity_hr the current database so unqualified table names resolve to it.
USE itversity_hr

++
||
++
++



In [9]:
%%sql

-- Verify which database is currently active.
select current_database()

+------------------+
|current_database()|
+------------------+
|      itversity_hr|
+------------------+



In [12]:
%%sql

-- Employee master table: one row per employee, stored as tab-delimited text.
CREATE TABLE employees (
    employee_id     INT,
    first_name      VARCHAR(20),
    last_name       VARCHAR(25),
    email           VARCHAR(25),
    phone_number    VARCHAR(20),
    hire_date       DATE,
    job_id          VARCHAR(10),
    salary          DECIMAL(8,2),
    commission_pct  DECIMAL(2,2),
    manager_id      INT,
    department_id   INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'

++
||
++
++



In [13]:
%%sql

-- Load the employee file from the local file system into the table.
-- OVERWRITE replaces the table's existing contents, so re-running this cell
-- does not append a second copy of every row.
LOAD DATA LOCAL INPATH '/data/hr_db/employees'
OVERWRITE INTO TABLE employees

++
||
++
++



In [14]:
%%sql 

-- Preview a few rows to sanity-check the load.
SELECT *
FROM employees
LIMIT 10

|        156|   Janette|     King|   JKING|011.44.1345.429268|1996-...


+-----------+----------+---------+--------+------------------+----------+------+--------+--------------+----------+-------------+
|employee_id|first_name|last_name|   email|      phone_number| hire_date|job_id|  salary|commission_pct|manager_id|department_id|
+-----------+----------+---------+--------+------------------+----------+------+--------+--------------+----------+-------------+
|        154|   Nanette|Cambrault|NCAMBRAU|011.44.1344.987668|1998-12-09|SA_REP| 7500.00|          0.20|       145|           80|
|        155|    Oliver|  Tuvault|OTUVAULT|011.44.1344.486508|1999-11-23|SA_REP| 7000.00|          0.15|       145|           80|
|        156|   Janette|     King|   JKING|011.44.1345.429268|1996-01-30|SA_REP|10000.00|          0.35|       146|           80|
|        157|   Patrick|    Sully|  PSULLY|011.44.1345.929268|1996-03-04|SA_REP| 9500.00|          0.35|       146|           80|
|        158|     Allan|   McEwen| AMCEWEN|011.44.1345.829268|1996-08-01|SA_REP| 9000.00| 

In [15]:
%%sql

-- Total number of rows in employees.
SELECT COUNT(1)
FROM employees

+--------+
|count(1)|
+--------+
|     107|
+--------+



In [16]:
%%sql

-- The three columns the windowing examples below work with.
SELECT
    employee_id,
    department_id,
    salary
FROM employees
LIMIT 10

+-----------+-------------+--------+
|employee_id|department_id|  salary|
+-----------+-------------+--------+
|        154|           80| 7500.00|
|        155|           80| 7000.00|
|        156|           80|10000.00|
|        157|           80| 9500.00|
|        158|           80| 9000.00|
|        159|           80| 8000.00|
|        160|           80| 7500.00|
|        161|           80| 7000.00|
|        162|           80|10500.00|
|        163|           80| 9500.00|
+-----------+-------------+--------+



### Overview of Windowing functions

In [18]:
%%sql

-- One row per employee, annotated with several window-function results:
--   employee_count : number of rows in the employee's department
--   rnk            : salary rank over all employees (no PARTITION BY, so it is global)
--   lead_emp_id,
--   lead_emp_sal   : the next employee in the same department when sorted by salary descending
-- employee_id is used as a tie-breaker in the LEAD windows so the "next" row is
-- deterministic when several employees in a department share the same salary.
SELECT
    employee_id,
    department_id,
    salary,
    count(1) OVER (PARTITION BY department_id) AS employee_count,
    rank() OVER (ORDER BY salary DESC) AS rnk,
    lead(employee_id) OVER (
        PARTITION BY department_id
        ORDER BY salary DESC, employee_id
    ) AS lead_emp_id,
    lead(salary) OVER (
        PARTITION BY department_id
        ORDER BY salary DESC, employee_id
    ) AS lead_emp_sal
FROM employees
ORDER BY employee_id

|        105|           60| 4800.00|             5| 59|        106|  ...


+-----------+-------------+--------+--------------+---+-----------+------------+
|employee_id|department_id|  salary|employee_count|rnk|lead_emp_id|lead_emp_sal|
+-----------+-------------+--------+--------------+---+-----------+------------+
|        100|           90|24000.00|             3|  1|        101|    17000.00|
|        101|           90|17000.00|             3|  2|        102|    17000.00|
|        102|           90|17000.00|             3|  2|       null|        null|
|        103|           60| 9000.00|             5| 24|        104|     6000.00|
|        104|           60| 6000.00|             5| 56|        105|     4800.00|
|        105|           60| 4800.00|             5| 59|        106|     4800.00|
|        106|           60| 4800.00|             5| 59|        107|     4200.00|
|        107|           60| 4200.00|             5| 62|       null|        null|
|        108|          100|12000.00|             6|  7|        109|     9000.00|
|        109|          100| 

### Aggregations using Windowing functions

In [19]:
%%sql

-- Employees listed department by department, lowest salary first within each.
SELECT
    employee_id,
    department_id,
    salary
FROM employees
ORDER BY
    department_id,
    salary
LIMIT 10

+-----------+-------------+--------+
|employee_id|department_id|  salary|
+-----------+-------------+--------+
|        178|         null| 7000.00|
|        200|           10| 4400.00|
|        202|           20| 6000.00|
|        201|           20|13000.00|
|        119|           30| 2500.00|
|        118|           30| 2600.00|
|        117|           30| 2800.00|
|        116|           30| 2900.00|
|        115|           30| 3100.00|
|        114|           30|11000.00|
+-----------+-------------+--------+



In [21]:
%%sql

-- Total salary per department using a plain GROUP BY (one row per department).
SELECT
    department_id,
    sum(salary) AS department_salary_expense
FROM employees
GROUP BY department_id
ORDER BY department_id

+-------------+-------------------------+
|department_id|department_salary_expense|
+-------------+-------------------------+
|         null|                  7000.00|
|           10|                  4400.00|
|           20|                 19000.00|
|           30|                 24900.00|
|           40|                  6500.00|
|           50|                156400.00|
|           60|                 28800.00|
|           70|                 10000.00|
|           80|                304500.00|
|           90|                 58000.00|
+-------------+-------------------------+
only showing top 10 rows



In [25]:
%%sql

-- Attach each department's total and average salary to every employee row by
-- joining employees against a per-department aggregate.
-- There is no ORDER BY inside the derived table: only the outer ORDER BY
-- determines the order of the final result.
-- NOTE: the equi-join never matches a NULL department_id, so an employee without
-- a department does not appear in this result.
SELECT
    e.employee_id,
    e.department_id,
    e.salary,
    ae.department_salary_expense,
    ae.avg_salary_expense
FROM employees e
INNER JOIN (
    SELECT
        department_id,
        sum(salary) AS department_salary_expense,
        avg(salary) AS avg_salary_expense
    FROM employees
    GROUP BY department_id
) ae
    ON e.department_id = ae.department_id
ORDER BY e.department_id

|        119|           30| 2500.00|                 24900.00...


+-----------+-------------+--------+-------------------------+------------------+
|employee_id|department_id|  salary|department_salary_expense|avg_salary_expense|
+-----------+-------------+--------+-------------------------+------------------+
|        200|           10| 4400.00|                  4400.00|       4400.000000|
|        201|           20|13000.00|                 19000.00|       9500.000000|
|        202|           20| 6000.00|                 19000.00|       9500.000000|
|        117|           30| 2800.00|                 24900.00|       4150.000000|
|        115|           30| 3100.00|                 24900.00|       4150.000000|
|        119|           30| 2500.00|                 24900.00|       4150.000000|
|        116|           30| 2900.00|                 24900.00|       4150.000000|
|        114|           30|11000.00|                 24900.00|       4150.000000|
|        118|           30| 2600.00|                 24900.00|       4150.000000|
|        203|   

In [27]:
%%sql

-- Same per-department total as the join version, computed with a window
-- function so every employee row is kept and no self-join is needed.
SELECT
    e.employee_id,
    e.department_id,
    e.salary,
    sum(e.salary) OVER (PARTITION BY e.department_id) AS department_salary_expense
FROM employees e
ORDER BY e.department_id

|        117|           ...


+-----------+-------------+--------+-------------------------+
|employee_id|department_id|  salary|department_salary_expense|
+-----------+-------------+--------+-------------------------+
|        178|         null| 7000.00|                  7000.00|
|        200|           10| 4400.00|                  4400.00|
|        202|           20| 6000.00|                 19000.00|
|        201|           20|13000.00|                 19000.00|
|        116|           30| 2900.00|                 24900.00|
|        119|           30| 2500.00|                 24900.00|
|        115|           30| 3100.00|                 24900.00|
|        118|           30| 2600.00|                 24900.00|
|        117|           30| 2800.00|                 24900.00|
|        114|           30|11000.00|                 24900.00|
+-----------+-------------+--------+-------------------------+
only showing top 10 rows



In [32]:
%%sql

-- Five aggregates (sum, avg, min, max, count) over the same per-department window,
-- repeated on every employee row of that department.
SELECT
    e.employee_id,
    e.department_id,
    e.salary,
    sum(e.salary)   OVER (PARTITION BY e.department_id) AS sum_sal_expense,
    avg(e.salary)   OVER (PARTITION BY e.department_id) AS avg_sal_expense,
    min(e.salary)   OVER (PARTITION BY e.department_id) AS min_sal_expense,
    max(e.salary)   OVER (PARTITION BY e.department_id) AS max_sal_expense,
    count(e.salary) OVER (PARTITION BY e.department_id) AS cnt_sal_expense
FROM employees e
ORDER BY e.department_id

|        202| ...


+-----------+-------------+--------+---------------+---------------+---------------+---------------+---------------+
|employee_id|department_id|  salary|sum_sal_expense|avg_sal_expense|min_sal_expense|max_sal_expense|cnt_sal_expense|
+-----------+-------------+--------+---------------+---------------+---------------+---------------+---------------+
|        178|         null| 7000.00|        7000.00|    7000.000000|        7000.00|        7000.00|              1|
|        200|           10| 4400.00|        4400.00|    4400.000000|        4400.00|        4400.00|              1|
|        201|           20|13000.00|       19000.00|    9500.000000|        6000.00|       13000.00|              2|
|        202|           20| 6000.00|       19000.00|    9500.000000|        6000.00|       13000.00|              2|
|        115|           30| 3100.00|       24900.00|    4150.000000|        2500.00|       11000.00|              6|
|        117|           30| 2800.00|       24900.00|    4150.000

In [33]:
%%sql

-- Switch to the retail database; the cells below read its orders and order_items tables.
-- NOTE(review): this notebook never creates itversity_retail — it is assumed to exist already.
USE itversity_retail

++
||
++
++



In [34]:
%%sql

-- Remove any earlier copy so the CREATE TABLE ... AS that follows can be re-run.
DROP TABLE IF EXISTS daily_revenue

++
||
++
++



In [39]:
%%sql

-- Materialize revenue per order date, counting only COMPLETE and CLOSED orders.
CREATE TABLE daily_revenue AS
SELECT
    o.order_date,
    round(sum(oi.order_item_subtotal), 2) AS revenue
FROM orders o
JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date

++
||
++
++



In [40]:
%%sql

-- Earliest ten days of revenue.
SELECT *
FROM daily_revenue
ORDER BY order_date
LIMIT 10

+--------------------+--------+
|          order_date| revenue|
+--------------------+--------+
|2013-07-25 00:00:...|31547.23|
|2013-07-26 00:00:...|54713.23|
|2013-07-27 00:00:...|48411.48|
|2013-07-28 00:00:...|35672.03|
|2013-07-29 00:00:...| 54579.7|
|2013-07-30 00:00:...|49329.29|
|2013-07-31 00:00:...|59212.49|
|2013-08-01 00:00:...|49160.08|
|2013-08-02 00:00:...|50688.58|
|2013-08-03 00:00:...|43416.74|
+--------------------+--------+



In [42]:
%%sql

-- Revenue per (order date, product), counting only COMPLETE and CLOSED orders.
SELECT
    o.order_date,
    oi.order_item_product_id,
    round(sum(oi.order_item_subtotal), 2) AS revenue
FROM orders o
JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY
    o.order_date,
    oi.order_item_product_id

+--------------------+-----...


+--------------------+---------------------+-------+
|          order_date|order_item_product_id|revenue|
+--------------------+---------------------+-------+
|2013-07-25 00:00:...|                  957| 4499.7|
|2013-07-25 00:00:...|                 1014|2798.88|
|2013-07-25 00:00:...|                  365|3359.44|
|2013-07-25 00:00:...|                  926|  79.95|
|2013-07-25 00:00:...|                 1004|5599.72|
|2013-07-25 00:00:...|                  828|  95.97|
|2013-07-25 00:00:...|                   93|  74.97|
|2013-07-25 00:00:...|                  810|  79.96|
|2013-07-25 00:00:...|                  906|  99.96|
|2013-07-25 00:00:...|                  835|  63.98|
+--------------------+---------------------+-------+
only showing top 10 rows



In [43]:
%%sql

-- Materialize revenue per (order date, product), counting only COMPLETE and
-- CLOSED orders.
-- IF NOT EXISTS lets this cell be re-run without failing on "table already
-- exists"; an existing table is left untouched, so drop it first to rebuild it.
CREATE TABLE IF NOT EXISTS daily_product_revenue
AS
SELECT
    o.order_date,
    oi.order_item_product_id,
    round(sum(oi.order_item_subtotal), 2) AS revenue
FROM orders o
JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY
    o.order_date,
    oi.order_item_product_id

++
||
++
++



In [44]:
%%sql

-- Browse the per-product daily revenue in date, then product, order.
SELECT *
FROM daily_product_revenue
ORDER BY
    order_date,
    order_item_product_id

+--------------------+-----...


+--------------------+---------------------+-------+
|          order_date|order_item_product_id|revenue|
+--------------------+---------------------+-------+
|2013-07-25 00:00:...|                   24| 319.96|
|2013-07-25 00:00:...|                   93|  74.97|
|2013-07-25 00:00:...|                  134|  100.0|
|2013-07-25 00:00:...|                  191|5099.49|
|2013-07-25 00:00:...|                  226| 599.99|
|2013-07-25 00:00:...|                  365|3359.44|
|2013-07-25 00:00:...|                  403|1949.85|
|2013-07-25 00:00:...|                  502| 1650.0|
|2013-07-25 00:00:...|                  572| 119.97|
|2013-07-25 00:00:...|                  625| 199.99|
+--------------------+---------------------+-------+
only showing top 10 rows



### 152 SQL LEAD and LAG 

In [46]:
%%sql

-- Most recent ten days of revenue.
SELECT *
FROM daily_revenue
ORDER BY order_date DESC
LIMIT 10

+--------------------+--------+
|          order_date| revenue|
+--------------------+--------+
|2014-07-24 00:00:...|50885.19|
|2014-07-23 00:00:...|38795.23|
|2014-07-22 00:00:...|36717.24|
|2014-07-21 00:00:...| 51427.7|
|2014-07-20 00:00:...|60047.45|
|2014-07-19 00:00:...|38420.99|
|2014-07-18 00:00:...| 43856.6|
|2014-07-17 00:00:...|36384.77|
|2014-07-16 00:00:...|43011.92|
|2014-07-15 00:00:...|53480.23|
+--------------------+--------+



In [54]:
%%sql

-- Two equivalent ways to fetch the previous row's date and revenue:
--   LEAD over a descending order_date window, and
--   LAG  over an ascending  order_date window.
SELECT
    dr.*,
    LEAD(order_date) OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue)    OVER (ORDER BY order_date DESC) AS PRIOR_revenue,
    LAG(order_date)  OVER (ORDER BY order_date)      AS lag_prior_date,
    LAG(revenue)     OVER (ORDER BY order_date)      AS lag_prior_revenue
FROM daily_revenue AS dr
ORDER BY order_date DESC
LIMIT 10

|2014-07-21 00:00:...| 51427.7|2014-07-20 00:00:...|     60047.45|2014-07-20 00:...


+--------------------+--------+--------------------+-------------+--------------------+-----------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|      lag_prior_date|lag_prior_revenue|
+--------------------+--------+--------------------+-------------+--------------------+-----------------+
|2014-07-24 00:00:...|50885.19|2014-07-23 00:00:...|     38795.23|2014-07-23 00:00:...|         38795.23|
|2014-07-23 00:00:...|38795.23|2014-07-22 00:00:...|     36717.24|2014-07-22 00:00:...|         36717.24|
|2014-07-22 00:00:...|36717.24|2014-07-21 00:00:...|      51427.7|2014-07-21 00:00:...|          51427.7|
|2014-07-21 00:00:...| 51427.7|2014-07-20 00:00:...|     60047.45|2014-07-20 00:00:...|         60047.45|
|2014-07-20 00:00:...|60047.45|2014-07-19 00:00:...|     38420.99|2014-07-19 00:00:...|         38420.99|
|2014-07-19 00:00:...|38420.99|2014-07-18 00:00:...|      43856.6|2014-07-18 00:00:...|          43856.6|
|2014-07-18 00:00:...| 43856.6|2014-07-17 00:0

In [55]:
%%sql

-- Previous row's date and revenue, displayed oldest first; the earliest date
-- has no previous row, so its prior_* columns are NULL.
SELECT
    dr.*,
    LEAD(order_date) OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue)    OVER (ORDER BY order_date DESC) AS PRIOR_revenue
FROM daily_revenue AS dr
ORDER BY order_date
LIMIT 10

|2013-08-01 00:00:...|49160.08|2013-07-31 00:0...


+--------------------+--------+--------------------+-------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|
+--------------------+--------+--------------------+-------------+
|2013-07-25 00:00:...|31547.23|                null|         null|
|2013-07-26 00:00:...|54713.23|2013-07-25 00:00:...|     31547.23|
|2013-07-27 00:00:...|48411.48|2013-07-26 00:00:...|     54713.23|
|2013-07-28 00:00:...|35672.03|2013-07-27 00:00:...|     48411.48|
|2013-07-29 00:00:...| 54579.7|2013-07-28 00:00:...|     35672.03|
|2013-07-30 00:00:...|49329.29|2013-07-29 00:00:...|      54579.7|
|2013-07-31 00:00:...|59212.49|2013-07-30 00:00:...|     49329.29|
|2013-08-01 00:00:...|49160.08|2013-07-31 00:00:...|     59212.49|
|2013-08-02 00:00:...|50688.58|2013-08-01 00:00:...|     49160.08|
|2013-08-03 00:00:...|43416.74|2013-08-02 00:00:...|     50688.58|
+--------------------+--------+--------------------+-------------+



In [57]:
%%sql

-- LEAD with an explicit offset: look 7 rows ahead in the descending window,
-- i.e. at the row 7 positions earlier by order_date. Newest dates shown first.
SELECT
    dr.*,
    LEAD(order_date, 7) OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue, 7)    OVER (ORDER BY order_date DESC) AS PRIOR_revenue
FROM daily_revenue AS dr
ORDER BY order_date DESC
LIMIT 10

|2014-07-17 00:00:...|36384.77|2014-07-10 00:0...


+--------------------+--------+--------------------+-------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|
+--------------------+--------+--------------------+-------------+
|2014-07-24 00:00:...|50885.19|2014-07-17 00:00:...|     36384.77|
|2014-07-23 00:00:...|38795.23|2014-07-16 00:00:...|     43011.92|
|2014-07-22 00:00:...|36717.24|2014-07-15 00:00:...|     53480.23|
|2014-07-21 00:00:...| 51427.7|2014-07-14 00:00:...|     29937.52|
|2014-07-20 00:00:...|60047.45|2014-07-13 00:00:...|     40410.99|
|2014-07-19 00:00:...|38420.99|2014-07-12 00:00:...|     38449.77|
|2014-07-18 00:00:...| 43856.6|2014-07-11 00:00:...|     29596.32|
|2014-07-17 00:00:...|36384.77|2014-07-10 00:00:...|     47826.02|
|2014-07-16 00:00:...|43011.92|2014-07-09 00:00:...|     36929.91|
|2014-07-15 00:00:...|53480.23|2014-07-08 00:00:...|     50434.81|
+--------------------+--------+--------------------+-------------+



In [58]:
%%sql

-- Same 7-row look-back, displayed oldest first: the first seven dates have no
-- row 7 positions earlier, so their prior_* columns are NULL.
SELECT
    dr.*,
    LEAD(order_date, 7) OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue, 7)    OVER (ORDER BY order_date DESC) AS PRIOR_revenue
FROM daily_revenue AS dr
ORDER BY order_date
LIMIT 10

|2013-08-01 00:00:...|49160.08|2013-07-25 00:0...


+--------------------+--------+--------------------+-------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|
+--------------------+--------+--------------------+-------------+
|2013-07-25 00:00:...|31547.23|                null|         null|
|2013-07-26 00:00:...|54713.23|                null|         null|
|2013-07-27 00:00:...|48411.48|                null|         null|
|2013-07-28 00:00:...|35672.03|                null|         null|
|2013-07-29 00:00:...| 54579.7|                null|         null|
|2013-07-30 00:00:...|49329.29|                null|         null|
|2013-07-31 00:00:...|59212.49|                null|         null|
|2013-08-01 00:00:...|49160.08|2013-07-25 00:00:...|     31547.23|
|2013-08-02 00:00:...|50688.58|2013-07-26 00:00:...|     54713.23|
|2013-08-03 00:00:...|43416.74|2013-07-27 00:00:...|     48411.48|
+--------------------+--------+--------------------+-------------+



In [59]:
%%sql

-- Third argument of LEAD supplies a default: revenue falls back to 0 when there
-- is no row 7 positions earlier; prior_date has no default and stays NULL.
SELECT
    dr.*,
    LEAD(order_date, 7) OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue, 7, 0) OVER (ORDER BY order_date DESC) AS PRIOR_revenue
FROM daily_revenue AS dr
ORDER BY order_date
LIMIT 10

|2013-08-01 00:00:...|49160.08|2013-07-25 00:0...


+--------------------+--------+--------------------+-------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|
+--------------------+--------+--------------------+-------------+
|2013-07-25 00:00:...|31547.23|                null|          0.0|
|2013-07-26 00:00:...|54713.23|                null|          0.0|
|2013-07-27 00:00:...|48411.48|                null|          0.0|
|2013-07-28 00:00:...|35672.03|                null|          0.0|
|2013-07-29 00:00:...| 54579.7|                null|          0.0|
|2013-07-30 00:00:...|49329.29|                null|          0.0|
|2013-07-31 00:00:...|59212.49|                null|          0.0|
|2013-08-01 00:00:...|49160.08|2013-07-25 00:00:...|     31547.23|
|2013-08-02 00:00:...|50688.58|2013-07-26 00:00:...|     54713.23|
|2013-08-03 00:00:...|43416.74|2013-07-27 00:00:...|     48411.48|
+--------------------+--------+--------------------+-------------+



In [60]:
%%sql

-- Defaults on both columns: 'na' for the missing prior date and 0 for the
-- missing prior revenue when there is no row 7 positions earlier.
SELECT
    dr.*,
    LEAD(order_date, 7, 'na') OVER (ORDER BY order_date DESC) AS prior_date,
    LEAD(revenue, 7, 0)       OVER (ORDER BY order_date DESC) AS PRIOR_revenue
FROM daily_revenue AS dr
ORDER BY order_date
LIMIT 10

|2013-08-01 00:00:...|49160.08|2013-07-25 00:0...


+--------------------+--------+--------------------+-------------+
|          order_date| revenue|          prior_date|PRIOR_revenue|
+--------------------+--------+--------------------+-------------+
|2013-07-25 00:00:...|31547.23|                  na|          0.0|
|2013-07-26 00:00:...|54713.23|                  na|          0.0|
|2013-07-27 00:00:...|48411.48|                  na|          0.0|
|2013-07-28 00:00:...|35672.03|                  na|          0.0|
|2013-07-29 00:00:...| 54579.7|                  na|          0.0|
|2013-07-30 00:00:...|49329.29|                  na|          0.0|
|2013-07-31 00:00:...|59212.49|                  na|          0.0|
|2013-08-01 00:00:...|49160.08|2013-07-25 00:00:...|     31547.23|
|2013-08-02 00:00:...|50688.58|2013-07-26 00:00:...|     54713.23|
|2013-08-03 00:00:...|43416.74|2013-07-27 00:00:...|     48411.48|
+--------------------+--------+--------------------+-------------+

