In [1]:
# mapping of covid locations (fips + GPS) to the closest weather station

In [37]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame, Window
from collections import OrderedDict
import pandas as pd
import numpy as np

In [3]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [4]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [5]:
spark = create_spark_session()

In [6]:
# Load relevant stations with weather element
path = os.path.join(project_path, "OUT_DATA", "filtered_stations")
selected_stations = spark.read.parquet(path)

In [7]:
selected_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)



In [8]:
selected_stations.select("measured").distinct().show()

+--------+
|measured|
+--------+
|    TMIN|
|    SNOW|
|    AWND|
|    PRCP|
+--------+



In [9]:
# load all stations, with GPS location
raw_stations = spark.read.csv( os.path.join(project_path, "DATA", "WEATHER", "ghcnd-stations.txt"))

In [10]:
# parse raw stations into columns
@udf(MapType( StringType(), StringType()))
def ParseStationsUDF(line):
    return{
        "station_id": line[0:11],
        "latitude" : line[13:20], 
        "longitude" : line[21:30], 
        "elevation" : line[31:38], 
        "state" : line[38:40], 
        "station_name" : line[41:]
        
    }

fields = OrderedDict( [
        ( "station_id" , "string"),
        ( "latitude" , "float"), 
        ("longitude" , "float"), 
        ("elevation" , "float"),
        ("state" , "string"), 
        ("station_name" , "string")
] )

#exprs = [ f"parsed['{field}'].cast({fld_type}) as {field}" for field, fld_type in fields.items() ]
exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

df_stations = raw_stations.withColumn("parsed", ParseStationsUDF("_c0")).selectExpr( *exprs)

In [11]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



In [12]:
df_stations = df_stations.join(selected_stations, ["station_id"])

In [13]:
df_stations.count()

23556

In [14]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [15]:
df_stations.show(10)

+-----------+--------+---------+---------+-----+--------------------+--------+
| station_id|latitude|longitude|elevation|state|        station_name|measured|
+-----------+--------+---------+---------+-----+--------------------+--------+
|AQC00914000| 14.3167|-170.7667|    408.4|   AS|AASUFOU          ...|    PRCP|
|AQC00914141| 14.2667|-170.6167|      4.6|   AS|FAGAITUA         ...|    PRCP|
|AQC00914594| 14.3333|-170.7667|     42.4|   AS|MALAELOA         ...|    PRCP|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    AWND|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    TMIN|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    PRCP|
|CQC00914080| 15.2136| 145.7497|    252.1|   MP|CAPITOL HILL 1   ...|    TMIN|
|CQC00914080| 15.2136| 145.7497|    252.1|   MP|CAPITOL HILL 1   ...|    PRCP|
|CQC00914801| 14.1717| 145.2428|    179.2|   MP|ROTA AP          ...|    TMIN|
|CQC00914801| 14.1717| 145.2428|    179.2|   MP|ROTA

In [16]:
# Load NYT locations (FIPS + GPS)
path = os.path.join(project_path, "OUT_DATA", "nyt_locations_geography")
df_locations = spark.read.parquet(path)

In [17]:
df_locations = df_locations.where( ( ~ F.isnan("latitude") ) | (~ F.isnan("longitude")) )

In [18]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)



In [19]:
def precompute_distance(l_ref : DataFrame) -> DataFrame:
    l_ref = l_ref.withColumnRenamed("latitude", "latitude_degrees")
    l_ref = l_ref.withColumnRenamed( "longitude", "longitude_degrees") 
    @udf( FloatType())
    def degree_to_radian(x):
        return  x* np.pi / 180.
    l_ref = l_ref.withColumn("latitude", degree_to_radian("latitude_degrees") )
    l_ref = l_ref.withColumn("longitude", degree_to_radian("longitude_degrees") )
    l_ref = l_ref.withColumn("cos_latitude", F.cos("latitude") )  
    print(type(l_ref))
    return l_ref

In [20]:
df_stations_precompute =precompute_distance(df_stations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [21]:
df_stations_precompute.count()

23556

In [22]:
df_stations_precompute.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude_degrees: float (nullable = true)
 |-- longitude_degrees: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- cos_latitude: double (nullable = true)



In [23]:
df_stations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-170.76669311523438, min(latitude_degrees)=13.389399528503418, min(latitude)=0.2336890995502472, min(longitude)=-2.980441093444824)]

In [24]:
df_stations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=145.74969482421875, max(latitude_degrees)=71.32140350341797, max(latitude)=1.2447932958602905, max(longitude)=2.5438120365142822)]

In [25]:
df_locations_precompute = precompute_distance(df_locations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [26]:
df_locations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-164.1889190673828, min(latitude_degrees)=13.444, min(latitude)=0.23464205861091614, min(longitude)=-2.8656373023986816)]

In [27]:
df_locations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=178.33880615234375, max(latitude_degrees)=69.4493408203125, max(latitude)=1.212119698524475, max(longitude)=3.1125993728637695)]

# closest station for all fips

In [48]:
sub_fips = df_locations_precompute\
    .select("fips", "latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_fips")\
    .withColumnRenamed("longitude", "longitude_fips")\
    .withColumnRenamed("cos_latitude", "cos_latitude_fips")

In [49]:
sub_stations = df_stations_precompute\
    .select("station_id", "measured","latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_station")\
    .withColumnRenamed("longitude", "longitude_station")\
    .withColumnRenamed("cos_latitude", "cos_latitude_station")

In [50]:
fips_cross_stations = sub_fips.crossJoin(sub_stations)

In [31]:
%%time
fips_cross_stations.count()

10000

In [51]:
fips_cross_stations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)



In [73]:
def delta_coord(col1, col2):
    return F.pow( F.sin(0.5* (col1 - col2 ) ), 2 )

In [74]:
def haversine( lat1, long1, cos_lat1, lat2, long2, cos_lat2):
    ''' computation of angular distance between 2 locations given by GPS coordinates,
    using haversine formula.
    exact formulas (maybe overkill), taken from :
    https://www.movable-type.co.uk/scripts/latlong.html
    '''    
    delta_lat = delta_coord(lat1, lat2)
    delta_long = delta_coord(long1, long2)
    a = delta_lat + delta_long * cos_lat1 * cos_lat2
    return  F.atan2( F.sqrt(a), F.sqrt( 1.-a ) )

In [75]:
#dist_col = "toto"
dist_col = "angle"
df_cross_distance = fips_cross_stations.withColumn(dist_col,                                  
        haversine(col("latitude_fips"), col("longitude_fips"),col("cos_latitude_fips"), 
                    col("latitude_station"), col("longitude_station"), col("cos_latitude_station")) )

In [40]:
df_cross_distance.show(10)

+-----+-------------+--------------+-------------------+-----------+--------+----------------+-----------------+--------------------+------------------+
| fips|latitude_fips|longitude_fips|  cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|              toto|
+-----+-------------+--------------+-------------------+-----------+--------+----------------+-----------------+--------------------+------------------+
|02198|    0.9718477|     -2.324122| 0.5637744149985534|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108| 0.853762920078671|
|02240|    1.1146545|    -2.4996367|0.44048764879312186|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108|0.7929826951260189|
|02261|    1.0669032|    -2.5544662| 0.4828384076125879|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108|0.8234070924966638|
|02090|    1.1288098|    -2.5577478| 0.4277359281117351|AQC00914000|    PRCP|     

In [76]:
window = Window.partitionBy("measured", "fips")

In [77]:
df_min_distance = df_cross_distance.withColumn("min_angle", F.min(dist_col).over(window))\
            .filter(col(dist_col) == col("min_angle"))\
            .drop("min_angle")

In [57]:
%%time
df_min_distance.show(10)

+-----+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+
| fips|latitude_fips|longitude_fips| cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|              angle|
+-----+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+
|01109|    0.5549912|     -1.499964|0.8499050848915212|GQW00041415|    AWND|       0.2353332|        2.5271688|  0.9724367030704606| 0.1302571691123501|
|08107|   0.70657325|    -1.8672876|0.7605910917793964|VQW00011640|    AWND|      0.31997293|       -1.1338828|  0.9492439323158217|0.17312217958096277|
|13165|    0.5723731|     -1.430673|0.8406180091627147|GQW00041415|    AWND|       0.2353332|        2.5271688|  0.9724367030704606|0.10449719309773617|
|16021|    0.8512517|     -2.033739|0.6590422353506066|VQW00011640|    AWND|      

In [78]:
R_earth = 6371

In [79]:
df_min_distance = df_min_distance.withColumn("distance", R_earth * col("angle"))

In [61]:
%%time
df_min_distance.show(10)

+-----+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+------------------+
| fips|latitude_fips|longitude_fips| cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|              angle|          distance|
+-----+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+------------------+
|01109|    0.5549912|     -1.499964|0.8499050848915212|GQW00041415|    AWND|       0.2353332|        2.5271688|  0.9724367030704606| 0.1302571691123501| 829.8684244147826|
|08107|   0.70657325|    -1.8672876|0.7605910917793964|VQW00011640|    AWND|      0.31997293|       -1.1338828|  0.9492439323158217|0.17312217958096277|1102.9614061103139|
|13165|    0.5723731|     -1.430673|0.8406180091627147|GQW00041415|    AWND|       0.2353332|        2.5271688|  0.9724367030704606|0.104497

In [63]:
%%time
df_min_distance.count()

CPU times: user 52.9 ms, sys: 12.3 ms, total: 65.2 ms
Wall time: 1min 48s


12868

In [80]:
df_min_distance.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)



In [81]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)



In [82]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [105]:
res = df_min_distance.alias("d").join( 
        df_stations.withColumnRenamed("state", "station_state").alias("st"), 
        "station_id" )\
    .join( df_locations.withColumnRenamed("state", "fips_state").alias("loc"), ["fips"])\
    .select( "fips", "county", "fips_state", "st.*","angle", "distance")


In [108]:
%%time
res.show(10)

+-----+--------+--------------+--------+---------+---------+-------------+--------------------+--------+--------------------+------------------+
| fips|  county|    fips_state|latitude|longitude|elevation|station_state|        station_name|measured|               angle|          distance|
+-----+--------+--------------+--------+---------+---------+-------------+--------------------+--------+--------------------+------------------+
|05055|  Greene|      Arkansas| 36.2086| -90.5347|    102.7|           AR|LAFE 1.8 W       ...|    PRCP|7.996087882014496E-4| 5.094307589631435|
|04013|Maricopa|       Arizona| 33.4066|-112.4485|    276.1|           AZ|GOODYEAR 5.9 WSW ...|    PRCP|6.499267869547607E-4| 4.140683559688781|
|20137|  Norton|        Kansas| 39.8163| -99.8978|    690.7|           KS|NORTON 1.3 SSW   ...|    SNOW| 2.83199104122919E-4|1.8042614923671167|
|20137|  Norton|        Kansas| 39.8163| -99.8978|    690.7|           KS|NORTON 1.3 SSW   ...|    PRCP| 2.83199104122919E-4|1.804

In [106]:
res.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- fips_state: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- station_state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)



In [None]:
out_path = os.path.join(project_path, "OUT_DATA", "stations_per_fips")
res.write.partitionBy("measured").format("parquet").save(out_path)