In [28]:
from pyspark.sql import SparkSession
from functools import reduce
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from time import sleep
from typing import Iterator, Tuple

In this chapter, we work with NOAA weather data covering the years 2010 through 2020, located in Google BigQuery, which totals over 40 million records. `bigquery-public-data` is a public project available to all.

Here, we read in a large amount of data from a warehouse and assemble a single data frame representing weather information across the globe for the years 2010 through 2020.

In [19]:
# Pull in the BigQuery connector build closest to our installed Spark version (3.4.1).
# NOTE(review): "parentProject" is presumably the GCP project the reads are billed to — confirm.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "com.google.cloud.spark:spark-3.3-bigquery:0.32.0")
    .config("parentProject", "cool-wharf-393713")
    .getOrCreate()
)

In [16]:
# Abstract the table reading routine into a reusable function, returning the resulting data frame
def read_df_from_bq(
    year,
    credentials_file: str = "../../../../cool-wharf-393713-73800a184f10.json",
):
    """Read one yearly NOAA GSOD table from BigQuery into a data frame.

    Parameters
    ----------
    year
        Year suffix of the table to read; it is interpolated into
        ``bigquery-public-data.noaa_gsod.gsod{year}``.
    credentials_file
        Path passed to the connector's ``credentialsFile`` option. It defaults
        to the path this notebook originally hard-coded, so existing
        ``read_df_from_bq(year)`` calls behave exactly as before; pass another
        path to run the notebook from a different directory or with another key.

    Returns
    -------
    The data frame returned by the BigQuery reader's ``load()`` call.
    """
    return (
        spark.read.format("bigquery")
        .option("table", f"bigquery-public-data.noaa_gsod.gsod{year}")
        .option("credentialsFile", credentials_file)
        .load()
    )

In [20]:
# Read one data frame per year (2010 through 2020) and fold them into a single
# frame with reduce(); unionByName matches columns by name and tolerates
# columns that are missing from some of the yearly tables.
gsod = reduce(
    lambda combined, yearly: combined.unionByName(yearly, allowMissingColumns=True),
    [read_df_from_bq(yr) for yr in range(2010, 2021)],
)
# Keep only rows with a complete year/month/day and a temperature, discard the
# 9999.9 temperature value (presumably a missing-data sentinel — confirm),
# and remove the "date" column.
gsod = (
    gsod.na.drop(subset=["year", "mo", "da", "temp"])
    .filter(F.col("temp") != 9999.9)
    .drop("date")
)

In [13]:
# Scalar pandas UDF (Series in, Series out): Fahrenheit to Celsius
@F.pandas_udf(T.DoubleType())
def f_to_c(degrees: pd.Series) -> pd.Series:
    """Convert a Series of Fahrenheit temperatures to Celsius."""
    # Same operation order as (degrees - 32) * 5 / 9, spelled with Series methods.
    return degrees.sub(32).mul(5).div(9)

In [21]:
# Peek at a few distinct values of the raw "temp" column.
(
    gsod
    .select("temp")
    .distinct()
    .show(5)
)

+----+
|temp|
+----+
|69.8|
|74.5|
|64.2|
|76.4|
|15.5|
+----+
only showing top 5 rows



In [22]:
# Add a "temp_c" column computed by the scalar UDF, then display a few distinct
# (temp, temp_c) pairs side by side.
gsod = gsod.withColumn(
    "temp_c",
    f_to_c(F.col("temp")),
)
(
    gsod
    .select("temp", "temp_c")
    .distinct()
    .show(5)
)

+----+-------------------+
|temp|             temp_c|
+----+-------------------+
|37.2| 2.8888888888888906|
|71.6| 21.999999999999996|
|70.4| 21.333333333333336|
|29.6|-1.3333333333333326|
|-1.1| -18.38888888888889|
+----+-------------------+
only showing top 5 rows



Iterator of Series UDFs are very useful when you have an expensive cold start operation you
need to perform. By cold start, we mean an operation we need to perform once at the
beginning of the processing step, before working through the data. Deserializing a
local ML model (fitted with scikit-learn or another Python modeling library) is an
example: we would need to unpack and read the model once for the whole data
frame, and then it could be used to process all records.

In [26]:
# Iterator of Series -> Iterator of Series pandas UDF
@F.pandas_udf(T.DoubleType())
def f_to_c2(degrees: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Convert batches of Fahrenheit temperatures to Celsius.

    Receives an iterator of Series (one per batch) and yields one converted
    Series for each batch it consumes.
    """
    # Stand-in for an expensive cold start: a five-second sleep that runs once,
    # before the first batch is pulled, rather than once per batch.
    sleep(5)
    # Lazily convert each batch; `yield from` keeps this a generator function.
    yield from (batch.sub(32).mul(5).div(9) for batch in degrees)

In [27]:
# Same preview as for the scalar UDF, this time computed through the
# iterator-based UDF and aliased to "temp_c".
(
    gsod
    .select("temp", f_to_c2(F.col("temp")).alias("temp_c"))
    .distinct()
    .show(5)
)

+----+-------------------+
|temp|             temp_c|
+----+-------------------+
|37.2| 2.8888888888888906|
|71.6| 21.999999999999996|
|70.4| 21.333333333333336|
|29.6|-1.3333333333333326|
|-1.1| -18.38888888888889|
+----+-------------------+
only showing top 5 rows



We can use an Iterator of multiple Series to Iterator of Series UDF to assemble the `year`, `mo`, and `da` columns (representing year, month, and day) into a single column:

1) `year_mo_da` is an Iterator of a tuple of Series, representing all the batches of values contained in the `year`, `mo`, and `da` columns.
2) To access each batch, we use a for loop over the iterator, the same principle as
for the Iterator of Series UDF.
3) To extract each individual series from the tuple, we use multiple assignments.
In this case, `year` will map to the first Series of the tuple, `mo` to the second, and
`da` to the third.
4) Since `pd.to_datetime` requests a data frame containing the year, month, and
day columns, we create the data frame via a dictionary, giving the keys the relevant column names. `pd.to_datetime` returns a Series.
5) Finally, we yield the answer to build the Iterator of Series, fulfilling our contract.

In [29]:
@F.pandas_udf(T.DateType())
def create_date(
    year_mo_da: Iterator[Tuple[pd.Series, pd.Series, pd.Series]]
) -> Iterator[pd.Series]:
    """Merge three columns (year, month, and day of a date) into one date column.

    Each element of ``year_mo_da`` is a tuple of three Series for one batch;
    one Series of assembled dates is yielded per batch.
    """
    for years, months, days in year_mo_da:
        # pd.to_datetime assembles dates from a frame whose columns are
        # named "year", "month" and "day".
        date_parts = pd.DataFrame({"year": years, "month": months, "day": days})
        yield pd.to_datetime(date_parts)

In [None]:
# Display a few distinct (year, mo, da) combinations next to the date that
# create_date assembles from them.
(
    gsod
    .select(
        "year",
        "mo",
        "da",
        create_date(F.col("year"), F.col("mo"), F.col("da")).alias("date"),
    )
    .distinct()
    .show(5)
)