In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from functools import reduce
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from time import sleep
from typing import Iterator, Sequence, Tuple
from sklearn.linear_model import LinearRegression

In this chapter, we continue working with the NOAA weather data used in Chapter 9: 10 years' (2010 to 2020) worth of NOAA weather data located in Google BigQuery, which totals over 40 million records. The `bigquery-public-data` project is available to all.

Here, we read in a large amount of data from a warehouse and assemble a single data frame representing weather information across the globe for a period of 10 years.

In [3]:
# The BigQuery connector artifact is the one closest to our installed
# version of Spark (3.4.1).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "com.google.cloud.spark:spark-3.3-bigquery:0.32.0")
    .config("parentProject", "cool-wharf-393713")
    .getOrCreate()
)

In [4]:
# Abstract the table reading routine into a reusable function, returning the resulting data frame
def read_df_from_bq(
    year: int,
    credentials_file: str = "../../../../cool-wharf-393713-73800a184f10.json",
):
    """Load one year of NOAA GSOD data from BigQuery as a data frame.

    Reads the table ``bigquery-public-data.noaa_gsod.gsod{year}`` through the
    module-level ``spark`` session.

    Args:
        year: Year suffix of the table to read (e.g. ``2015``); it is
            interpolated into the table name as-is.
        credentials_file: Path handed to the connector's ``credentialsFile``
            option. Defaults to the key file this notebook was written
            against, so existing one-argument calls behave exactly as before.

    Returns:
        Whatever the BigQuery reader's ``load()`` returns (a Spark data frame).
    """
    table_name = f"bigquery-public-data.noaa_gsod.gsod{year}"
    return (
        spark.read.format("bigquery")
        .option("table", table_name)
        .option("credentialsFile", credentials_file)
        .load()
    )

In [6]:
# Full BigQuery pipeline, intentionally left commented out and kept for
# reference only: it reads the yearly tables for 2010 through 2020, unions
# them by column name, drops rows missing year/mo/da/temp, filters out the
# 9999.9 temperature value, and drops the "date" column.
# gsod = (
#     reduce(
#         # use a lambda function over a list comprehension of data frames to union them all
#         lambda x, y: x.unionByName(y, allowMissingColumns=True),
#         [read_df_from_bq(year) for year in range(2010, 2021)],
#     )
#     .dropna(subset=["year", "mo", "da", "temp"])
#     .where(F.col("temp") != 9999.9)
#     .drop("date")
# )

# Instead, we work locally with three years' worth of data in Parquet format:
gsod = spark.read.parquet("../../data/window/gsod.parquet")


### Identifying the coldest day of each year, the long way

We want a data frame containing three records, one for each year, showing the station, the date (year, month, day), and the temperature of the coldest day recorded for that year.

In [7]:
# Group by year and keep the minimum temperature seen in each group,
# exposing it under the original column name "temp".
coldest_temp = (
    gsod
    .groupBy("year")
    .agg(F.min(F.col("temp")).alias("temp"))
)
coldest_temp.sort("temp").show()

+----+------+
|year|  temp|
+----+------+
|2017|-114.7|
|2019|-114.7|
|2018|-113.5|
+----+------+



In [8]:
# A left-semi equi-join keeps only the rows of the original table whose
# (year, temp) pair appears in coldest_temp, which recovers the station,
# month and day columns for each yearly minimum.
coldest_when = (
    gsod
    .join(coldest_temp, on=["year", "temp"], how="left_semi")
    .select("stn", "year", "mo", "da", "temp")
)

coldest_when.sort("year", "mo", "da").show()

+------+----+---+---+------+
|   stn|year| mo| da|  temp|
+------+----+---+---+------+
|896250|2017| 06| 20|-114.7|
|896060|2018| 08| 27|-113.5|
|895770|2019| 06| 15|-114.7|
+------+----+---+---+------+



Let's do this instead using a window function:

In [12]:
# The Window builder class returns a WindowSpec: a blueprint describing how
# rows are partitioned (here, by year) for an eventual window function.
each_year = Window.partitionBy(F.col("year"))

print(each_year)

<pyspark.sql.window.WindowSpec object at 0x0000020E3908EC70>


In [15]:
# Attach each year's minimum temperature to every row through the window,
# then keep only the rows whose temperature equals that minimum.
gsod.withColumn(
    "min_temp", F.min(F.col("temp")).over(each_year)
).filter(
    "temp = min_temp"
).select(
    "year", "mo", "da", "stn", "temp"
).sort(
    "year", "mo", "da"
).show()

+----+---+---+------+------+
|year| mo| da|   stn|  temp|
+----+---+---+------+------+
|2017| 06| 20|896250|-114.7|
|2018| 08| 27|896060|-113.5|
|2019| 06| 15|895770|-114.7|
+----+---+---+------+------+

