In [2]:
// Load the weekly invoice CSV into a DataFrame. The first line of the file is
// treated as the header (column names) and Spark infers each column's type
// from the data instead of reading everything as string.
val orders_df = spark.read
  .format("csv")
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .load("/public/trendytech/datasets/windowdatamodified.csv")

Waiting for a Spark session to start...

orders_df = [country: string, weeknum: int ... 3 more fields]


[country: string, weeknum: int ... 3 more fields]

In [3]:
// Preview the first 10 rows; truncate = false prints full cell values
// rather than cutting long strings.
orders_df.show(numRows = 10, truncate = false)

+---------+-------+-----------+-------------+------------+
|country  |weeknum|numinvoices|totalquantity|invoicevalue|
+---------+-------+-----------+-------------+------------+
|Spain    |49     |1          |67           |174.72      |
|Germany  |48     |11         |1795         |1600.0      |
|Lithuania|48     |3          |622          |1598.06     |
|Germany  |49     |12         |1852         |1800.0      |
|Bahrain  |51     |1          |54           |205.74      |
|Iceland  |49     |1          |319          |711.79      |
|India    |51     |5          |95           |300.0       |
|Australia|50     |2          |133          |387.95      |
|Italy    |49     |1          |-2           |-17.0       |
|India    |49     |5          |1280         |3284.1      |
+---------+-------+-----------+-------------+------------+
only showing top 10 rows



# Running Total

In [10]:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Per-country window ordered by week number, latest week first. The frame
// spans from the first row of the partition up to the current row, so an
// aggregate over it yields a cumulative value.
// NOTE: this val shadows `window` from the functions._ wildcard import for
// the rest of the session.
val window = Window
  .partitionBy(col("country"))
  .orderBy(desc("weeknum"))
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

// Cumulative sum of invoicevalue within each country. Because the window is
// ordered descending, the total accumulates from the most recent week backwards.
val running_total = orders_df.withColumn(
  "running_total",
  sum(col("invoicevalue")).over(window)
)

running_total.show(numRows = 10, truncate = false)

+-------+-------+-----------+-------------+------------+------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|running_total     |
+-------+-------+-----------+-------------+------------+------------------+
|Sweden |50     |3          |3714         |2646.3      |2646.3            |
|Germany|51     |5          |1103         |1600.0      |1600.0            |
|Germany|50     |15         |1973         |1800.0      |3400.0            |
|Germany|49     |12         |1852         |1800.0      |5200.0            |
|Germany|48     |11         |1795         |1600.0      |6800.0            |
|France |51     |5          |847          |500.0       |500.0             |
|France |50     |6          |529          |537.32      |1037.3200000000002|
|France |49     |9          |2303         |500.0       |1537.3200000000002|
|France |48     |4          |1299         |500.0       |2037.3200000000002|
|Belgium|51     |2          |942          |800.0       |800.0             |
+-------+-------+-----------+-------------+------------+------------------+
only showing top 10 rows

window = org.apache.spark.sql.expressions.WindowSpec@5410180d
running_total = [country: string, weeknum: int ... 4 more fields]


[country: string, weeknum: int ... 4 more fields]

# Lag

In [16]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Per-country window ordered chronologically (ascending week number).
val windowAsc = Window
  .partitionBy(col("country"))
  .orderBy(asc("weeknum"))

// previous_week holds the invoicevalue of the row one position earlier in the
// window; the first row of each country has no predecessor and gets null.
val lag_df_explicit = orders_df.withColumn(
  "previous_week",
  lag(col("invoicevalue"), 1).over(windowAsc)
)

lag_df_explicit.show(numRows = 10, truncate = false)

+-------+-------+-----------+-------------+------------+-------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_week|
+-------+-------+-----------+-------------+------------+-------------+
|Sweden |50     |3          |3714         |2646.3      |null         |
|Germany|48     |11         |1795         |1600.0      |null         |
|Germany|49     |12         |1852         |1800.0      |1600.0       |
|Germany|50     |15         |1973         |1800.0      |1800.0       |
|Germany|51     |5          |1103         |1600.0      |1800.0       |
|France |48     |4          |1299         |500.0       |null         |
|France |49     |9          |2303         |500.0       |500.0        |
|France |50     |6          |529          |537.32      |500.0        |
|France |51     |5          |847          |500.0       |537.32       |
|Belgium|48     |1          |528          |800.0       |null         |
+-------+-------+-----------+-------------+------------+-------------+
only showing top 10 rows

windowAsc = org.apache.spark.sql.expressions.WindowSpec@7eec60c1
lag_df_explicit = [country: string, weeknum: int ... 4 more fields]


[country: string, weeknum: int ... 4 more fields]

# Dense Rank

In [18]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Per-country window ordered by week number, latest week first.
val windowDesc = Window
  .partitionBy(col("country"))
  .orderBy(desc("weeknum"))

// Dense rank of each row within its country: the latest week gets rank 1 and
// equal weeknum values would share a rank without leaving gaps.
// NOTE(review): the output column is spelled "dence_rank" (sic); kept as-is
// because it is the runtime column name anything downstream would reference.
val dense_rank_df = orders_df.withColumn(
  "dence_rank",
  dense_rank().over(windowDesc)
)

dense_rank_df.show(numRows = 10, truncate = false)

+-------+-------+-----------+-------------+------------+----------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|dence_rank|
+-------+-------+-----------+-------------+------------+----------+
|Sweden |50     |3          |3714         |2646.3      |1         |
|Germany|51     |5          |1103         |1600.0      |1         |
|Germany|50     |15         |1973         |1800.0      |2         |
|Germany|49     |12         |1852         |1800.0      |3         |
|Germany|48     |11         |1795         |1600.0      |4         |
|France |51     |5          |847          |500.0       |1         |
|France |50     |6          |529          |537.32      |2         |
|France |49     |9          |2303         |500.0       |3         |
|France |48     |4          |1299         |500.0       |4         |
|Belgium|51     |2          |942          |800.0       |1         |
+-------+-------+-----------+-------------+------------+----------+
only showing top 10 rows



windowDesc = org.apache.spark.sql.expressions.WindowSpec@1d101e23
dense_rank_df = [country: string, weeknum: int ... 4 more fields]


[country: string, weeknum: int ... 4 more fields]