In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from functools import reduce
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from time import sleep
from typing import Iterator, Sequence, Tuple
from sklearn.linear_model import LinearRegression

In this chapter, we continue to work with the NOAA weather data we worked with in Chapter 9: 10 years' (2010 to 2020) worth of NOAA weather data located in Google BigQuery, which totals over 40 million records. The `bigquery-public-data` is a project available to all.

Here, we read in a large amount of data from a warehouse and assemble a single data frame representing weather information across the globe for a period of 10 years.

In [3]:
# Start (or reuse) a Spark session that downloads the BigQuery connector from
# Maven; pick the connector build closest to the installed Spark version (3.4.1).
# NOTE(review): the artifact below is the Spark 3.3 build — confirm whether a
# spark-3.4 build of the connector is available for this release.
spark = (
    SparkSession.builder
    .config(
        "spark.jars.packages",
        "com.google.cloud.spark:spark-3.3-bigquery:0.32.0",
    )
    # Project passed to the connector as "parentProject"
    .config("parentProject", "cool-wharf-393713")
    .getOrCreate()
)

In [4]:
# Abstract the table reading routine into a reusable function, returning the resulting data frame
def read_df_from_bq(
    year,
    credentials_file="../../../../cool-wharf-393713-73800a184f10.json",
):
    """Read one year of NOAA GSOD data from BigQuery into a Spark data frame.

    Parameters
    ----------
    year
        Year suffix of the table to read; it is interpolated into the table
        name ``bigquery-public-data.noaa_gsod.gsod{year}``.
    credentials_file
        Path to the service-account key file handed to the connector through
        its ``credentialsFile`` option. Defaults to the path this notebook
        originally hard-coded, so existing ``read_df_from_bq(year)`` calls
        behave exactly as before.

    Returns
    -------
    The data frame produced by ``spark.read.format("bigquery")...load()``.
    """
    return (
        spark.read.format("bigquery")
        .option("table", f"bigquery-public-data.noaa_gsod.gsod{year}")
        # Keep the key location configurable so it is not baked into the code path
        .option("credentialsFile", credentials_file)
        .load()
    )

In [6]:
# The full pipeline (kept for reference) reads one BigQuery table per year
# and unions them into a single data frame:
# gsod = (
#     reduce(
#         # use a lambda function over a list comprehension of data frames to union them all
#         lambda x, y: x.unionByName(y, allowMissingColumns=True),
#         [read_df_from_bq(year) for year in range(2010, 2021)],
#     )
#     .dropna(subset=["year", "mo", "da", "temp"])
#     .where(F.col("temp") != 9999.9)
#     .drop("date")
# )

# Instead, we work locally with three years' worth of data in Parquet format:
gsod = spark.read.format("parquet").load("../../data/window/gsod.parquet")


### Identifying the coldest day of each year, the long way

We want a data frame containing three records, one for each year, showing the station, the date (year, month, day), and the temperature of the coldest day recorded for that year.

In [7]:
# Aggregate with groupBy() to get the lowest temperature recorded in each year;
# the result keeps the column name "temp" so it can be joined back on later
coldest_temp = gsod.groupBy("year").agg(
    F.min(F.col("temp")).alias("temp")
)
coldest_temp.sort("temp").show()

+----+------+
|year|  temp|
+----+------+
|2017|-114.7|
|2019|-114.7|
|2018|-113.5|
+----+------+



In [8]:
# A left-semi equi-join keeps only the rows of the original table that match
# a (year, temp) pair in coldest_temp, which recovers the station, month and
# day columns of each year's coldest record
coldest_when = (
    gsod
    .join(coldest_temp, on=["year", "temp"], how="left_semi")
    .select("stn", "year", "mo", "da", "temp")
)

coldest_when.sort("year", "mo", "da").show()

+------+----+---+---+------+
|   stn|year| mo| da|  temp|
+------+----+---+---+------+
|896250|2017| 06| 20|-114.7|
|896060|2018| 08| 27|-113.5|
|895770|2019| 06| 15|-114.7|
+------+----+---+---+------+



Let's do this instead using a window function:

In [11]:
# The Window builder class returns a WindowSpec object: a blueprint that
# only takes effect once a window function is applied over it.
each_year = Window.partitionBy(F.col("year"))

# Printing the spec shows it is just an object; no computation has happened yet
print(each_year)

<pyspark.sql.window.WindowSpec object at 0x000001AE0FBAA040>


In [15]:
# Attach each year's minimum temperature to every record through the window,
# then keep only the records whose temperature equals that minimum
(
    gsod
    .withColumn("min_temp", F.min(F.col("temp")).over(each_year))
    .filter(F.col("temp") == F.col("min_temp"))
    .select("year", "mo", "da", "stn", "temp")
    .sort("year", "mo", "da")
    .show()
)

+----+---+---+------+------+
|year| mo| da|   stn|  temp|
+----+---+---+------+------+
|2017| 06| 20|896250|-114.7|
|2018| 08| 27|896060|-113.5|
|2019| 06| 15|895770|-114.7|
+----+---+---+------+------+



In [5]:
# Load gsod_light, a 10-record data frame that is small enough
# to be displayed in its entirety by show()
gsod_light = spark.read.format("parquet").load(
    "../../data/window/gsod_light.parquet"
)
gsod_light.show()

+------+----+---+---+----+----------+
|   stn|year| mo| da|temp|count_temp|
+------+----+---+---+----+----------+
|994979|2017| 12| 11|21.3|        21|
|998012|2017| 03| 02|31.4|        24|
|719200|2017| 10| 09|60.5|        11|
|917350|2018| 04| 21|82.6|         9|
|076470|2018| 06| 07|65.0|        24|
|996470|2018| 03| 12|55.6|        12|
|041680|2019| 02| 19|16.1|        15|
|949110|2019| 11| 23|54.9|        14|
|998252|2019| 04| 18|44.7|        11|
|998166|2019| 03| 20|34.8|        12|
+------+----+---+---+----+----------+



In [6]:
# Window that partitions the data frame by month and orders
# each partition by the count_temp column (ascending)
temp_per_month_asc = (
    Window
    .partitionBy("mo")
    .orderBy(F.col("count_temp").asc())
)

In [7]:
# Rank each record within its month according to the count_temp column.
# Tied values share the same rank, and the rank that follows a tie is skipped.
(
    gsod_light
    .withColumn("rank_tpm", F.rank().over(temp_per_month_asc))
    .show()
)

+------+----+---+---+----+----------+--------+
|   stn|year| mo| da|temp|count_temp|rank_tpm|
+------+----+---+---+----+----------+--------+
|041680|2019| 02| 19|16.1|        15|       1|
|996470|2018| 03| 12|55.6|        12|       1|
|998166|2019| 03| 20|34.8|        12|       1|
|998012|2017| 03| 02|31.4|        24|       3|
|917350|2018| 04| 21|82.6|         9|       1|
|998252|2019| 04| 18|44.7|        11|       2|
|076470|2018| 06| 07|65.0|        24|       1|
|719200|2017| 10| 09|60.5|        11|       1|
|949110|2019| 11| 23|54.9|        14|       1|
|994979|2017| 12| 11|21.3|        21|       1|
+------+----+---+---+----+----------+--------+



In [9]:
# dense_rank() also gives tied values the same rank,
# but leaves no gap in the ranking after a tie
(
    gsod_light
    .withColumn("rank_tpm", F.dense_rank().over(temp_per_month_asc))
    .show()
)

+------+----+---+---+----+----------+--------+
|   stn|year| mo| da|temp|count_temp|rank_tpm|
+------+----+---+---+----+----------+--------+
|041680|2019| 02| 19|16.1|        15|       1|
|996470|2018| 03| 12|55.6|        12|       1|
|998166|2019| 03| 20|34.8|        12|       1|
|998012|2017| 03| 02|31.4|        24|       2|
|917350|2018| 04| 21|82.6|         9|       1|
|998252|2019| 04| 18|44.7|        11|       2|
|076470|2018| 06| 07|65.0|        24|       1|
|719200|2017| 10| 09|60.5|        11|       1|
|949110|2019| 11| 23|54.9|        14|       1|
|994979|2017| 12| 11|21.3|        21|       1|
+------+----+---+---+----+----------+--------+



In [12]:
# Order each year's partition by temperature, then compute the percent rank of
# every record within its year (0.0 for the lowest temp, 1.0 for the highest)
temp_each_year = each_year.orderBy(F.col("temp"))

(
    gsod_light
    .withColumn("rank_tpm", F.percent_rank().over(temp_each_year))
    .show()
)

+------+----+---+---+----+----------+------------------+
|   stn|year| mo| da|temp|count_temp|          rank_tpm|
+------+----+---+---+----+----------+------------------+
|994979|2017| 12| 11|21.3|        21|               0.0|
|998012|2017| 03| 02|31.4|        24|               0.5|
|719200|2017| 10| 09|60.5|        11|               1.0|
|996470|2018| 03| 12|55.6|        12|               0.0|
|076470|2018| 06| 07|65.0|        24|               0.5|
|917350|2018| 04| 21|82.6|         9|               1.0|
|041680|2019| 02| 19|16.1|        15|               0.0|
|998166|2019| 03| 20|34.8|        12|0.3333333333333333|
|998252|2019| 04| 18|44.7|        11|0.6666666666666666|
|949110|2019| 11| 23|54.9|        14|               1.0|
+------+----+---+---+----+----------+------------------+



In [14]:
# Split each year's temperature-ordered records into two tiles with ntile(2).
# When the records do not divide evenly, the lower tile receives the extra one.
(
    gsod_light
    .withColumn("rank_tpm", F.ntile(2).over(temp_each_year))
    .show()
)

+------+----+---+---+----+----------+--------+
|   stn|year| mo| da|temp|count_temp|rank_tpm|
+------+----+---+---+----+----------+--------+
|994979|2017| 12| 11|21.3|        21|       1|
|998012|2017| 03| 02|31.4|        24|       1|
|719200|2017| 10| 09|60.5|        11|       2|
|996470|2018| 03| 12|55.6|        12|       1|
|076470|2018| 06| 07|65.0|        24|       1|
|917350|2018| 04| 21|82.6|         9|       2|
|041680|2019| 02| 19|16.1|        15|       1|
|998166|2019| 03| 20|34.8|        12|       1|
|998252|2019| 04| 18|44.7|        11|       2|
|949110|2019| 11| 23|54.9|        14|       2|
+------+----+---+---+----+----------+--------+



In [15]:
# row_number() assigns consecutive numbers (1, 2, 3, ...) to the records
# of each window partition, following the window's ordering
(
    gsod_light
    .withColumn("rank_tpm", F.row_number().over(temp_each_year))
    .show()
)

+------+----+---+---+----+----------+--------+
|   stn|year| mo| da|temp|count_temp|rank_tpm|
+------+----+---+---+----+----------+--------+
|994979|2017| 12| 11|21.3|        21|       1|
|998012|2017| 03| 02|31.4|        24|       2|
|719200|2017| 10| 09|60.5|        11|       3|
|996470|2018| 03| 12|55.6|        12|       1|
|076470|2018| 06| 07|65.0|        24|       2|
|917350|2018| 04| 21|82.6|         9|       3|
|041680|2019| 02| 19|16.1|        15|       1|
|998166|2019| 03| 20|34.8|        12|       2|
|998252|2019| 04| 18|44.7|        11|       3|
|949110|2019| 11| 23|54.9|        14|       4|
+------+----+---+---+----+----------+--------+



In [16]:
# Window partitioned by month whose records are ordered by count_temp
# from highest to lowest (descending)
temp_per_month_desc = (
    Window
    .partitionBy("mo")
    .orderBy(F.desc("count_temp"))
)

(
    gsod_light
    .withColumn("row_number", F.row_number().over(temp_per_month_desc))
    .show()
)

+------+----+---+---+----+----------+----------+
|   stn|year| mo| da|temp|count_temp|row_number|
+------+----+---+---+----+----------+----------+
|041680|2019| 02| 19|16.1|        15|         1|
|998012|2017| 03| 02|31.4|        24|         1|
|996470|2018| 03| 12|55.6|        12|         2|
|998166|2019| 03| 20|34.8|        12|         3|
|998252|2019| 04| 18|44.7|        11|         1|
|917350|2018| 04| 21|82.6|         9|         2|
|076470|2018| 06| 07|65.0|        24|         1|
|719200|2017| 10| 09|60.5|        11|         1|
|949110|2019| 11| 23|54.9|        14|         1|
|994979|2017| 12| 11|21.3|        21|         1|
+------+----+---+---+----+----------+----------+



### Analytic functions: looking back and ahead

We can access the records before or after the current one using `lag()` and `lead()`.

In [17]:
# lag() looks back within the window: one record back (the default offset)
# and two records back. Records with no predecessor that far back get null.
(
    gsod_light
    .withColumn("previous_temp", F.lag("temp", 1).over(temp_each_year))
    .withColumn("previous_temp_2", F.lag("temp", 2).over(temp_each_year))
    .show()
)

+------+----+---+---+----+----------+-------------+---------------+
|   stn|year| mo| da|temp|count_temp|previous_temp|previous_temp_2|
+------+----+---+---+----+----------+-------------+---------------+
|994979|2017| 12| 11|21.3|        21|         null|           null|
|998012|2017| 03| 02|31.4|        24|         21.3|           null|
|719200|2017| 10| 09|60.5|        11|         31.4|           21.3|
|996470|2018| 03| 12|55.6|        12|         null|           null|
|076470|2018| 06| 07|65.0|        24|         55.6|           null|
|917350|2018| 04| 21|82.6|         9|         65.0|           55.6|
|041680|2019| 02| 19|16.1|        15|         null|           null|
|998166|2019| 03| 20|34.8|        12|         16.1|           null|
|998252|2019| 04| 18|44.7|        11|         34.8|           16.1|
|949110|2019| 11| 23|54.9|        14|         44.7|           34.8|
+------+----+---+---+----+----------+-------------+---------------+



In [18]:
# Compare percent_rank() and cume_dist() side by side over the same window
(
    gsod_light
    .withColumn("percent_rank", F.percent_rank().over(temp_each_year))
    .withColumn("cume_dist", F.cume_dist().over(temp_each_year))
    .show()
)

+------+----+---+---+----+----------+------------------+------------------+
|   stn|year| mo| da|temp|count_temp|      percent_rank|         cume_dist|
+------+----+---+---+----+----------+------------------+------------------+
|994979|2017| 12| 11|21.3|        21|               0.0|0.3333333333333333|
|998012|2017| 03| 02|31.4|        24|               0.5|0.6666666666666666|
|719200|2017| 10| 09|60.5|        11|               1.0|               1.0|
|996470|2018| 03| 12|55.6|        12|               0.0|0.3333333333333333|
|076470|2018| 06| 07|65.0|        24|               0.5|0.6666666666666666|
|917350|2018| 04| 21|82.6|         9|               1.0|               1.0|
|041680|2019| 02| 19|16.1|        15|               0.0|              0.25|
|998166|2019| 03| 20|34.8|        12|0.3333333333333333|               0.5|
|998252|2019| 04| 18|44.7|        11|0.6666666666666666|              0.75|
|949110|2019| 11| 23|54.9|        14|               1.0|               1.0|
+------+----+---+---+----+----------+------------------+------------------+

### Exercise 10.2

If you have a window where all the ordered values are the same, what is the result of applying ntile() to the window?


In [24]:
# Build 1,001 rows whose "index" groups them four at a time (x // 4)
# and whose "value" is the same constant (2) on every row
ntile_example = spark.createDataFrame(
    [[i // 4, 2] for i in range(1001)],
    ["index", "value"],
)

ntile_example.show()

+-----+-----+
|index|value|
+-----+-----+
|    0|    2|
|    0|    2|
|    0|    2|
|    0|    2|
|    1|    2|
|    1|    2|
|    1|    2|
|    1|    2|
|    2|    2|
|    2|    2|
|    2|    2|
|    2|    2|
|    3|    2|
|    3|    2|
|    3|    2|
|    3|    2|
|    4|    2|
|    4|    2|
|    4|    2|
|    4|    2|
+-----+-----+
only showing top 20 rows



In [26]:
# Partition by "index" and order by "value", which is identical on every row,
# so the ordering cannot tell the records of a partition apart
ntile_window = (
    Window
    .partitionBy("index")
    .orderBy("value")
)

print(ntile_window)

<pyspark.sql.window.WindowSpec object at 0x000001AE152C15E0>


In [28]:
# Even though every ordered value is identical, ntile(3) still spreads the
# records of each partition across the tiles (see the 1, 1, 2, 3 pattern below)
(
    ntile_example
    .withColumn("3tile", F.ntile(3).over(ntile_window))
    .show(10)
)

+-----+-----+-----+
|index|value|3tile|
+-----+-----+-----+
|    0|    2|    1|
|    0|    2|    1|
|    0|    2|    2|
|    0|    2|    3|
|    1|    2|    1|
|    1|    2|    1|
|    1|    2|    2|
|    1|    2|    3|
|    2|    2|    1|
|    2|    2|    1|
+-----+-----+-----+
only showing top 10 rows



### Using row and range boundaries

In [30]:
# Averaging over an unordered window uses every record of the partition,
# whereas averaging over an ordered window yields a running average that
# grows record by record, so the two columns differ
not_ordered = Window.partitionBy("year")
ordered = not_ordered.orderBy("temp")

(
    gsod_light
    .withColumn("avg_NO", F.avg("temp").over(not_ordered))
    .withColumn("avg_O", F.avg("temp").over(ordered))
    .show()
)

+------+----+---+---+----+----------+------------------+------------------+
|   stn|year| mo| da|temp|count_temp|            avg_NO|             avg_O|
+------+----+---+---+----+----------+------------------+------------------+
|994979|2017| 12| 11|21.3|        21|37.733333333333334|              21.3|
|998012|2017| 03| 02|31.4|        24|37.733333333333334|             26.35|
|719200|2017| 10| 09|60.5|        11|37.733333333333334|37.733333333333334|
|996470|2018| 03| 12|55.6|        12| 67.73333333333333|              55.6|
|076470|2018| 06| 07|65.0|        24| 67.73333333333333|              60.3|
|917350|2018| 04| 21|82.6|         9| 67.73333333333333| 67.73333333333333|
|041680|2019| 02| 19|16.1|        15|            37.625|              16.1|
|998166|2019| 03| 20|34.8|        12|            37.625|             25.45|
|998252|2019| 04| 18|44.7|        11|            37.625|31.866666666666664|
|949110|2019| 11| 23|54.9|        14|            37.625|            37.625|
+------+----+---+---+----+----------+------------------+------------------+

In [31]:
# Use window specs with explicit window boundaries; these spell out the
# frames Spark otherwise applies implicitly.

# Unordered window: the frame spans the whole partition.
not_ordered = Window.partitionBy("year").rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)

# Ordered window: keep the "year" partitioning and ORDER BY temp.
# (Calling partitionBy("temp") here would replace the year partitioning
# rather than order the records within each year.)
# A range frame from the start of the partition up to the current row is the
# frame Spark uses by default once an ordering is defined.
ordered = not_ordered.orderBy("temp").rangeBetween(
    Window.unboundedPreceding, Window.currentRow
)

In [32]:
# Create a date column, plus a numeric version of it, to apply a range window on.
# Every record is forced into the same year (2019) so they share one partition.
# NOTE(review): unix_timestamp() presumably follows the session time zone, so
# dt_num values may be shifted by an hour across DST changes — confirm.
gsod_light_p = (
    gsod_light
    .withColumn("year", F.lit(2019))
    .withColumn("dt", F.to_date(F.concat_ws("-", "year", "mo", "da")))
    .withColumn("dt_num", F.unix_timestamp(F.col("dt")))
)

gsod_light_p.show()

+------+----+---+---+----+----------+----------+----------+
|   stn|year| mo| da|temp|count_temp|        dt|    dt_num|
+------+----+---+---+----+----------+----------+----------+
|994979|2019| 12| 11|21.3|        21|2019-12-11|1576022400|
|998012|2019| 03| 02|31.4|        24|2019-03-02|1551484800|
|719200|2019| 10| 09|60.5|        11|2019-10-09|1570575600|
|917350|2019| 04| 21|82.6|         9|2019-04-21|1555801200|
|076470|2019| 06| 07|65.0|        24|2019-06-07|1559862000|
|996470|2019| 03| 12|55.6|        12|2019-03-12|1552348800|
|041680|2019| 02| 19|16.1|        15|2019-02-19|1550534400|
|949110|2019| 11| 23|54.9|        14|2019-11-23|1574467200|
|998252|2019| 04| 18|44.7|        11|2019-04-18|1555542000|
|998166|2019| 03| 20|34.8|        12|2019-03-20|1553040000|
+------+----+---+---+----+----------+----------+----------+



In [34]:
# Average count_temp over a sliding window reaching roughly 30 days before and
# 30 days after each record (about 60 days in total).
# Note that rangeBetween() bounds apply to the ordering column's VALUES
# (here, seconds in dt_num) rather than to ROW positions.
ONE_MONTH_ISH = 30 * 24 * 60 * 60  # 30 days expressed in seconds
one_month_ish_before_after = (
    Window
    .partitionBy("year")
    .orderBy("dt_num")
    .rangeBetween(-ONE_MONTH_ISH, ONE_MONTH_ISH)
)

(
    gsod_light_p
    .withColumn("avg_count", F.avg("count_temp").over(one_month_ish_before_after))
    .show()
)

+------+----+---+---+----+----------+----------+----------+------------------+
|   stn|year| mo| da|temp|count_temp|        dt|    dt_num|         avg_count|
+------+----+---+---+----+----------+----------+----------+------------------+
|041680|2019| 02| 19|16.1|        15|2019-02-19|1550534400|             15.75|
|998012|2019| 03| 02|31.4|        24|2019-03-02|1551484800|             15.75|
|996470|2019| 03| 12|55.6|        12|2019-03-12|1552348800|             15.75|
|998166|2019| 03| 20|34.8|        12|2019-03-20|1553040000|              14.8|
|998252|2019| 04| 18|44.7|        11|2019-04-18|1555542000|10.666666666666666|
|917350|2019| 04| 21|82.6|         9|2019-04-21|1555801200|              10.0|
|076470|2019| 06| 07|65.0|        24|2019-06-07|1559862000|              24.0|
|719200|2019| 10| 09|60.5|        11|2019-10-09|1570575600|              11.0|
|949110|2019| 11| 23|54.9|        14|2019-11-23|1574467200|              17.5|
|994979|2019| 12| 11|21.3|        21|2019-12-11|1576022400|              17.5|
+------+----+---+---+----+----------+----------+----------+------------------+

### Exercise 10.3

If you have a data frame with 1,000,001 rows, where the ordered column ord is defined by `F.lit(10)`, what is the result of the following window functions?

1) `F.count("ord").over(Window.partitionBy().orderBy("ord").rowsBetween(-2, 2))`

2) `F.count("ord").over(Window.partitionBy().orderBy("ord").rangeBetween(-2, 2))`



In [37]:
# Build a single-column data frame of 1,000,001 rows that all hold the value 10
count_example = spark.createDataFrame(
    [[10] for _ in range(1000001)],
    ["ord"],
)

count_example.show(10)

+---+
|ord|
+---+
| 10|
| 10|
| 10|
| 10|
| 10|
| 10|
| 10|
| 10|
| 10|
| 10|
+---+
only showing top 10 rows



In [38]:
# Count "ord" over two frames of the same ordered window:
#   row   -> rowsBetween(-2, 2) counts by row position: at most the two rows
#            before and the two rows after the current one
#   range -> rangeBetween(-2, 2) counts by value: every row whose "ord" is
#            within 2 of the current value, which here is every row
count_example.select(
    F.col("ord"),
    F.count("ord").over(
        Window.partitionBy().orderBy("ord").rowsBetween(-2, 2)
    ).alias("row"),
    F.count("ord").over(
        Window.partitionBy().orderBy("ord").rangeBetween(-2, 2)
    ).alias("range"),
).show(10)

+---+---+-------+
|ord|row|  range|
+---+---+-------+
| 10|  3|1000001|
| 10|  4|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
| 10|  5|1000001|
+---+---+-------+
only showing top 10 rows

