In [None]:
# Dataset used throughout this notebook:
#   /public/trendytech/datasets/windowdatamodified.csv
# (Kept as a comment: a bare path is not valid Python and would stop "Run All".)

In [91]:
# Stop a session left over from an earlier run so the next cell builds a new one.
# On a fresh kernel `spark` has not been defined yet (it is created in the
# following cell), so tolerate that case instead of failing Restart & Run All.
try:
    spark.stop()
except NameError:
    pass

In [92]:
from pyspark.sql import SparkSession
import getpass

# The current OS user name decides where the SQL warehouse directory lives.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
# NOTE(review): spark.ui.port "0" presumably asks Spark to pick any free UI
# port — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .appName("itv023333")
    .master("yarn")
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

In [93]:
# Echo the session object as this cell's output.
spark

In [94]:
# Read the invoice CSV with the header and inferSchema options switched on,
# so columns get their names from the file and their types from the data.
invoice_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/public/trendytech/datasets/windowdatamodified.csv")
)

In [95]:
# Preview the first five rows to check the column names and values loaded.
invoice_df.show(5)

+---------+-------+-----------+-------------+------------+
|  country|weeknum|numinvoices|totalquantity|invoicevalue|
+---------+-------+-----------+-------------+------------+
|    Spain|     49|          1|           67|      174.72|
|  Germany|     48|         11|         1795|      1600.0|
|Lithuania|     48|          3|          622|     1598.06|
|  Germany|     49|         12|         1852|      1800.0|
|  Bahrain|     51|          1|           54|      205.74|
+---------+-------+-----------+-------------+------------+
only showing top 5 rows



Select a relevant dataset of your choice and write some queries to
demonstrate the following:
- Running total, grouping aggregates, and various window functions like
rank, dense_rank, row_number, lead, and lag.
- Also try creating a pivot view.

In [27]:
from pyspark.sql.functions import *

# Grouping aggregate: one output row per (country, weeknum, invoicevalue),
# listed by country and then week.
# `sum` is pyspark.sql.functions.sum (star-imported in this cell), not the builtin.
# NOTE(review): invoicevalue is part of the grouping key, so rows only merge
# when they share the same invoice value — confirm this is intended.
grouped_df = (
    invoice_df
    .groupBy("country", "weeknum", "invoicevalue")
    .agg(
        sum("totalquantity").alias("sum_quantity"),
        sum(expr("totalquantity * invoicevalue")).alias("total_price"),
    )
    .orderBy("country", "weeknum")
)


In [28]:
# Print the grouped aggregates (show() displays the first 20 rows by default).
grouped_df.show()

+---------------+-------+------------+------------+------------------+
|        country|weeknum|invoicevalue|sum_quantity|       total_price|
+---------------+-------+------------+------------+------------------+
|      Australia|     48|      358.25|         107|          38332.75|
|      Australia|     49|       258.9|         214|           55404.6|
|      Australia|     50|      387.95|         133|          51597.35|
|        Austria|     50|      257.04|           3| 771.1200000000001|
|        Bahrain|     51|      205.74|          54|11109.960000000001|
|        Belgium|     48|       800.0|         528|          422400.0|
|        Belgium|     50|      625.16|         285|178170.59999999998|
|        Belgium|     51|       800.0|         942|          753600.0|
|Channel Islands|     49|      363.53|          80|29082.399999999998|
|         Cyprus|     50|     1590.82|         917|        1458781.94|
|        Denmark|     49|      1281.5|         454|          581801.0|
|     

In [79]:
from pyspark.sql.window import *

In [80]:
# Running-total frame: within each country, ordered by week, covering every
# row from the start of the partition up to and including the current row.
my_window = (
    Window
    .partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
                

In [81]:
# Cumulative invoice value per country, accumulated over my_window.
# `sum` is the star-imported pyspark.sql.functions.sum, not Python's builtin.
result_df = invoice_df.withColumn(
    "running_total",
    sum("invoicevalue").over(my_window),
)

In [82]:
# Print the rows together with their per-country running total.
result_df.show()

+-------+-------+-----------+-------------+------------+------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|     running_total|
+-------+-------+-----------+-------------+------------+------------------+
| Sweden|     50|          3|         3714|      2646.3|            2646.3|
|Germany|     48|         11|         1795|      1600.0|            1600.0|
|Germany|     49|         12|         1852|      1800.0|            3400.0|
|Germany|     50|         15|         1973|      1800.0|            5200.0|
|Germany|     51|          5|         1103|      1600.0|            6800.0|
| France|     48|          4|         1299|       500.0|             500.0|
| France|     49|          9|         2303|       500.0|            1000.0|
| France|     50|          6|          529|      537.32|1537.3200000000002|
| France|     51|          5|          847|       500.0|2037.3200000000002|
|Belgium|     48|          1|          528|       800.0|             800.0|
|Belgium|   

In [86]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number
# Order each country's rows from the highest invoice value to the lowest.
my_window = Window.partitionBy("country").orderBy(desc("invoicevalue"))

# Append the three ranking columns in one projection:
#   rank        - tied rows share a rank and the following rank is skipped
#   dense_rank  - tied rows share a rank with no gap afterwards
#   row_number  - every row in the partition gets a distinct number
result_df = invoice_df.select(
    "*",
    rank().over(my_window).alias("rank"),
    dense_rank().over(my_window).alias("dense_rank"),
    row_number().over(my_window).alias("row_number"),
)
result_df.show()

+-------+-------+-----------+-------------+------------+----+----------+----------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|rank|dense_rank|row_number|
+-------+-------+-----------+-------------+------------+----+----------+----------+
| Sweden|     50|          3|         3714|      2646.3|   1|         1|         1|
|Germany|     49|         12|         1852|      1800.0|   1|         1|         1|
|Germany|     50|         15|         1973|      1800.0|   1|         1|         2|
|Germany|     48|         11|         1795|      1600.0|   3|         2|         3|
|Germany|     51|          5|         1103|      1600.0|   3|         2|         4|
| France|     50|          6|          529|      537.32|   1|         1|         1|
| France|     51|          5|          847|       500.0|   2|         2|         2|
| France|     49|          9|         2303|       500.0|   2|         2|         3|
| France|     48|          4|         1299|       500.0|   2|         2|    

In [96]:
# Per-country window ordered by week number; used by the lag/lead cells below.
my_window = Window.partitionBy("country").orderBy("weeknum")

In [98]:
# lag() fetches the invoice value of the preceding row in my_window; the first
# row of each partition has no predecessor, so both new columns are null there.
result_df = invoice_df.withColumn(
    "previous_invoice", lag("invoicevalue").over(my_window)
).withColumn(
    "diff_invoice", expr("invoicevalue - previous_invoice")
)
result_df.show()

+-------+-------+-----------+-------------+------------+----------------+-------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_invoice|       diff_invoice|
+-------+-------+-----------+-------------+------------+----------------+-------------------+
| Sweden|     50|          3|         3714|      2646.3|            null|               null|
|Germany|     48|         11|         1795|      1600.0|            null|               null|
|Germany|     49|         12|         1852|      1800.0|          1600.0|              200.0|
|Germany|     50|         15|         1973|      1800.0|          1800.0|                0.0|
|Germany|     51|          5|         1103|      1600.0|          1800.0|             -200.0|
| France|     48|          4|         1299|       500.0|            null|               null|
| France|     49|          9|         2303|       500.0|           500.0|                0.0|
| France|     50|          6|          529|      537.32|    

In [99]:
# lead() fetches the invoice value of the FOLLOWING row in my_window, so the
# column is named next_invoice (it was previously mislabelled
# "previous_invoice", copied from the lag() cell). The last row of each
# partition has no successor, so both new columns are null there.
result_df = (
    invoice_df
    .withColumn("next_invoice", lead("invoicevalue").over(my_window))
    # Same arithmetic as before: current value minus the following row's value.
    .withColumn("diff_invoice", expr("invoicevalue - next_invoice"))
)
result_df.show()

+-------+-------+-----------+-------------+------------+----------------+-------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_invoice|       diff_invoice|
+-------+-------+-----------+-------------+------------+----------------+-------------------+
| Sweden|     50|          3|         3714|      2646.3|            null|               null|
|Germany|     48|         11|         1795|      1600.0|          1800.0|             -200.0|
|Germany|     49|         12|         1852|      1800.0|          1800.0|                0.0|
|Germany|     50|         15|         1973|      1800.0|          1600.0|              200.0|
|Germany|     51|          5|         1103|      1600.0|            null|               null|
| France|     48|          4|         1299|       500.0|           500.0|                0.0|
| France|     49|          9|         2303|       500.0|          537.32| -37.32000000000005|
| France|     50|          6|          529|      537.32|    