In [6]:
from datetime import date

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import (
  DateType,
  IntegerType,
  StringType,
  StructField,
  StructType,
)
import plotly.express as px
import polars as pl


def df_generator(my_spark: SparkSession) -> DataFrame:
  """
  Build a small Spark DataFrame of daily airport temperatures.

  The frame holds one row per airport per day: high and low readings for
  BLI, PDX and SEA on 1-3 April 2021 (nine rows in total). Rows are emitted
  airport by airport, newest date first within each airport.

  Args:
      my_spark: SparkSession whose ``createDataFrame`` is used to
          materialise the rows.

  Returns:
      DataFrame with non-nullable columns AirportCode (string), Date (date),
      TempHighF (int) and TempLowF (int).
  """
  # (column name, Spark type) pairs; every column is declared non-nullable.
  columns = (
    ('AirportCode', StringType()),
    ('Date', DateType()),
    ('TempHighF', IntegerType()),
    ('TempLowF', IntegerType()),
  )
  schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in columns]
  )

  # airport code -> (day of April 2021, high, low), in the order rows are emitted.
  readings = {
    'BLI': ((3, 52, 43), (2, 50, 38), (1, 52, 41)),
    'PDX': ((3, 64, 45), (2, 61, 41), (1, 66, 39)),
    'SEA': ((3, 57, 43), (2, 54, 39), (1, 56, 41)),
  }

  rows = [
    [airport, date(2021, 4, day), high, low]
    for airport, daily in readings.items()
    for day, high, low in daily
  ]

  return my_spark.createDataFrame(rows, schema)

In [7]:
# Build the sample temperature DataFrame from the active Spark session
temps_df = df_generator(spark)

# Print column names, types and nullability
temps_df.printSchema()

# Print the rows as a text table
temps_df.show()

# Compute count / mean / stddev / min / max per column, then print them
temps_summary = temps_df.describe()
temps_summary.show()

root
 |-- AirportCode: string (nullable = false)
 |-- Date: date (nullable = false)
 |-- TempHighF: integer (nullable = false)
 |-- TempLowF: integer (nullable = false)

+-----------+----------+---------+--------+
|AirportCode|      Date|TempHighF|TempLowF|
+-----------+----------+---------+--------+
|        BLI|2021-04-03|       52|      43|
|        BLI|2021-04-02|       50|      38|
|        BLI|2021-04-01|       52|      41|
|        PDX|2021-04-03|       64|      45|
|        PDX|2021-04-02|       61|      41|
|        PDX|2021-04-01|       66|      39|
|        SEA|2021-04-03|       57|      43|
|        SEA|2021-04-02|       54|      39|
|        SEA|2021-04-01|       56|      41|
+-----------+----------+---------+--------+



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+-------+-----------+------------------+------------------+
|summary|AirportCode|         TempHighF|          TempLowF|
+-------+-----------+------------------+------------------+
|  count|          9|                 9|                 9|
|   mean|       NULL|56.888888888888886|41.111111111111114|
| stddev|       NULL| 5.644564740625367|2.2607766610417563|
|    min|        BLI|                50|                38|
|    max|        SEA|                66|                45|
+-------+-----------+------------------+------------------+



In [None]:
# Line chart of daily high temperature, one line per airport.
# The Spark DataFrame is collected to pandas and then wrapped as a Polars frame;
# rows are ordered by airport and then date before being handed to Plotly.
temps_polars = (
  pl.from_pandas(temps_df.toPandas())
  .sort(by=['AirportCode', 'Date'])
)

fig = px.line(
  data_frame=temps_polars,
  x='Date',
  y='TempHighF',
  color='AirportCode',
  markers=True,
  labels={'TempHighF': 'High Temperature (°F)', 'Date': 'Date'},
  title='Airport High Temperature Trends (April 2021)',
  template='plotly_white',
)

# One shared hover box per x position, and a readable legend heading.
fig.update_layout(
  legend_title_text='Airport Code',
  hovermode='x unified',
)

fig.show()