Filter weather measurements to keep only the ones matching the station for each selected measured element

In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [3]:
project_path = config["PATH"]["project"]
os.chdir(project_path)

Create spark session. Add driver postgress to enable to load from existing postgres DB


In [4]:
# add driver postgress to enable to load from existing postgres DB
spark = SparkSession \
    .builder \
    .appName("filter_weather") \
    .config("spark.jars", "postgresql-42.2.18.jar")\
    .config( "spark.driver.extraClassPath", "postgresql-42.2.18.jar")\
    .getOrCreate()

# Load data

## load weather data

In [5]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

CPU times: user 1.67 ms, sys: 4.3 ms, total: 5.97 ms
Wall time: 14.6 s


In [6]:
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)



In [None]:
%%time
weather_2020.select("measured").distinct().show()

## Load selected stations data

In [7]:
stations_path = os.path.join(project_path, "OUT_DATA", "closest_stations_filtered.csv")
filtered_stations = pd.read_csv(stations_path)
filtered_stations.rename(columns = {"Unnamed: 0" : "element"}, inplace = True)

In [8]:
filtered_stations

Unnamed: 0,element,Unnamed: 1,fips,county,state_gazeeter,station_id,station_name,state_station,latitude_station,longitude_station,distance,state_pair
0,SNOW,0,53061.0,Snohomish,Washington,USW00094290,SEATTLE SAND PT WSFO,WA,47.6872,-122.2553,27.427372,"('Washington', 'WA')"
1,SNOW,1,17031.0,Cook,Illinois,USC00111577,CHICAGO MIDWAY AP 3SW 72534,IL,41.7372,-87.7775,10.306315,"('Illinois', 'IL')"
2,SNOW,2,6059.0,Orange,California,US1CALA0092,REDONDO BEACH 2.1 SSW,CA,33.8274,-118.3888,29.503349,"('California', 'CA')"
3,SNOW,3,4013.0,Maricopa,Arizona,USW00023183,PHOENIX SKY HARBOR INTL AP GSN 72278,AZ,33.4278,-112.0039,23.435202,"('Arizona', 'AZ')"
4,SNOW,4,6037.0,Los Angeles,California,US1CALA0001,GLENDALE 2.4 WSW,CA,34.1689,-118.2947,2.149035,"('California', 'CA')"
...,...,...,...,...,...,...,...,...,...,...,...,...
19615,TAVG,3265,,Unknown,Idaho,USR0000IHPK,HORTON PEAK IDAHO,ID,43.9481,-114.7561,6.701040,"('Idaho', 'ID')"
19616,TAVG,3266,,Unknown,Montana,USS0009C01S,Crystal Lake,MT,46.7900,-109.5100,9.785520,"('Montana', 'MT')"
19617,TAVG,3267,,Unknown,Alaska,USR0000ANOR,NORUTAK LAKE ALASKA,AK,66.8333,-154.3333,43.082228,"('Alaska', 'AK')"
19618,TAVG,3268,,Unknown,Wyoming,USS0007F06S,Grave Springs,WY,43.4700,-107.2400,22.001426,"('Wyoming', 'WY')"


In [9]:
spark_stations = spark.read.load(stations_path, format = "csv", header = True, sep = ",", inferSchema = True)

In [10]:
spark_stations = spark_stations.withColumnRenamed("_c0", "element")

In [11]:
spark_stations.printSchema()

root
 |-- element: string (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- fips: double (nullable = true)
 |-- county: string (nullable = true)
 |-- state_gazeeter: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- state_station: string (nullable = true)
 |-- latitude_station: double (nullable = true)
 |-- longitude_station: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- state_pair: string (nullable = true)



## Load NYT data

In [12]:
nyt_path = os.path.join(project_path, "DATA", "us-counties.txt")

In [13]:
nyt = spark.read.load(nyt_path, format = "csv", header = True, sep = ",", inferSchema = True)

In [14]:
nyt.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: integer (nullable = true)
 |-- cases: integer (nullable = true)
 |-- deaths: integer (nullable = true)



# Filter weather data
Keep only rows which maps station_id and element in the station dataframe

In [15]:
weather_join_station = weather_2020.join( spark_stations, 
                  (spark_stations["station_id"] == weather_2020["station_id"] ) & (spark_stations["element"] == weather_2020["measured"] ))

In [14]:
%%time 
weather_join_station.count()

CPU times: user 7.25 ms, sys: 13.2 ms, total: 20.5 ms
Wall time: 1min 40s


7027907

In [16]:
weather_join_station.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- element: string (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- fips: double (nullable = true)
 |-- county: string (nullable = true)
 |-- state_gazeeter: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- state_station: string (nullable = true)
 |-- latitude_station: double (nullable = true)
 |-- longitude_station: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- state_pair: string (nullable = true)



In [17]:
weather_join_station = weather_join_station.drop(weather_2020["station_id"])
weather_join_station = weather_join_station.drop(weather_2020["measured"])

In [18]:
weather_join_station.printSchema()

root
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- element: string (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- fips: double (nullable = true)
 |-- county: string (nullable = true)
 |-- state_gazeeter: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- state_station: string (nullable = true)
 |-- latitude_station: double (nullable = true)
 |-- longitude_station: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- state_pair: string (nullable = true)



# Join with NYT covid data

## wide to  long NYT transform
Duplicate the rows for each weather element, to be able to join with weather data

In [None]:
l_elements = filtered_stations["element"].unique()
list(l_elements)

In [20]:
sp_elements = spark.createDataFrame( pd.DataFrame(l_elements, columns = ["element"] ) )

In [21]:
sp_elements.show()

+-------+
|element|
+-------+
|   SNOW|
|   SNWD|
|   PRCP|
|   TMAX|
|   TMIN|
|   TAVG|
+-------+



In [22]:
nyt_with_elements = nyt.crossJoin(sp_elements)

In [61]:
nyt_with_elements.take(10)

[Row(date=datetime.datetime(2020, 1, 21, 0, 0), county='Snohomish', state='Washington', fips=53061, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 22, 0, 0), county='Snohomish', state='Washington', fips=53061, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 23, 0, 0), county='Snohomish', state='Washington', fips=53061, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 24, 0, 0), county='Cook', state='Illinois', fips=17031, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 24, 0, 0), county='Snohomish', state='Washington', fips=53061, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 25, 0, 0), county='Orange', state='California', fips=6059, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 25, 0, 0), county='Cook', state='Illinois', fips=17031, cases=1, deaths=0, element='SNOW'),
 Row(date=datetime.datetime(2020, 1, 25, 0, 0), county='Snohomish'

In [26]:
%%time
nyt_with_elements.count()

Py4JJavaError: An error occurred while calling o96.count.
: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:150)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:144)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.doExecute(BroadcastNestedLoopJoinExec.scala:343)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:151)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:610)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:151)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:610)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:296)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2831)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2830)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2830)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:223)
	at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:227)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:220)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	... 63 more


## Join NYT with weather on element

In [23]:
nyt_weather = nyt_with_elements.join(weather_join_station, on = "element")

In [24]:
nyt_weather.printSchema()

root
 |-- element: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: integer (nullable = true)
 |-- cases: integer (nullable = true)
 |-- deaths: integer (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- fips: double (nullable = true)
 |-- county: string (nullable = true)
 |-- state_gazeeter: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- state_station: string (nullable = true)
 |-- latitude_station: double (nullable = true)
 |-- longitude_station: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- state_pair: string (nullable = true)



## Pivot : long to wide format

In [25]:
%%time
nyt_weather = nyt_weather.groupby("date", "state", "county", "fips")\
        .pivot("element")

KeyboardInterrupt: 