In [5]:
# mapping of covid locations (fips + GPS) to the closest weather station

In [97]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame
from collections import OrderedDict
import pandas as pd
import numpy as np

In [7]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [8]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [10]:
spark = create_spark_session()

In [83]:
# Load relevant stations with weather element
path = os.path.join(project_path, "OUT_DATA", "filtered_stations")
selected_stations = spark.read.parquet(path)

In [84]:
selected_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)



In [85]:
selected_stations.select("measured").distinct().show()

+--------+
|measured|
+--------+
|    TMIN|
|    SNOW|
|    AWND|
|    PRCP|
+--------+



In [86]:
# load all stations, with GPS location
raw_stations = spark.read.csv( os.path.join(project_path, "DATA", "WEATHER", "ghcnd-stations.txt"))

In [87]:
# parse raw stations into columns
@udf(MapType( StringType(), StringType()))
def ParseStationsUDF(line):
    return{
        "station_id": line[0:11],
        "latitude" : line[13:20], 
        "longitude" : line[21:30], 
        "elevation" : line[31:38], 
        "state" : line[38:40], 
        "station_name" : line[41:]
        
    }

fields = OrderedDict( [
        ( "station_id" , "string"),
        ( "latitude" , "float"), 
        ("longitude" , "float"), 
        ("elevation" , "float"),
        ("state" , "string"), 
        ("station_name" , "string")
] )

#exprs = [ f"parsed['{field}'].cast({fld_type}) as {field}" for field, fld_type in fields.items() ]
exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

df_stations = raw_stations.withColumn("parsed", ParseStationsUDF("_c0")).selectExpr( *exprs)

In [88]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



In [89]:
df_stations = df_stations.join(selected_stations, ["station_id"])

In [90]:
df_stations.count()

23556

In [91]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [92]:
df_stations.show(10)

+-----------+--------+---------+---------+-----+--------------------+--------+
| station_id|latitude|longitude|elevation|state|        station_name|measured|
+-----------+--------+---------+---------+-----+--------------------+--------+
|AQC00914000| 14.3167|-170.7667|    408.4|   AS|AASUFOU          ...|    PRCP|
|AQC00914141| 14.2667|-170.6167|      4.6|   AS|FAGAITUA         ...|    PRCP|
|AQC00914594| 14.3333|-170.7667|     42.4|   AS|MALAELOA         ...|    PRCP|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    AWND|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    TMIN|
|AQW00061705| 14.3306|-170.7136|      3.7|   AS|PAGO PAGO WSO AP ...|    PRCP|
|CQC00914080| 15.2136| 145.7497|    252.1|   MP|CAPITOL HILL 1   ...|    TMIN|
|CQC00914080| 15.2136| 145.7497|    252.1|   MP|CAPITOL HILL 1   ...|    PRCP|
|CQC00914801| 14.1717| 145.2428|    179.2|   MP|ROTA AP          ...|    TMIN|
|CQC00914801| 14.1717| 145.2428|    179.2|   MP|ROTA

In [164]:
# Load NYT locations (FIPS + GPS)
path = os.path.join(project_path, "OUT_DATA", "nyt_locations_geography")
df_locations = spark.read.parquet(path)

In [165]:
df_locations = df_locations.where( ( ~ F.isnan("latitude") ) | (~ F.isnan("longitude")) )

In [166]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)



In [120]:
def precompute_distance(l_ref : DataFrame) -> DataFrame:
    l_ref = l_ref.withColumnRenamed("latitude", "latitude_degrees")
    l_ref = l_ref.withColumnRenamed( "longitude", "longitude_degrees") 
    @udf( FloatType())
    def degree_to_radian(x):
        return  x* np.pi / 180.
    l_ref = l_ref.withColumn("latitude", degree_to_radian("latitude_degrees") )
    l_ref = l_ref.withColumn("longitude", degree_to_radian("longitude_degrees") )
    l_ref = l_ref.withColumn("cos_latitude", F.cos("latitude") )  
    print(type(l_ref))
    return l_ref

In [None]:
def closest_ref(l_ref, lati, longi):
        ''' computation of angular distance between 2 locations given by GPS coordinates
        exact formulas (maybe overkill), taken from :
        https://www.movable-type.co.uk/scripts/latlong.html
        '''    
        latitude = float(lati) * np.pi / 180.
        longitude = float(longi) * np.pi / 180.
        cos_lat = np.cos(latitude)
        #print( f'latitude : {latitude}, longitude : {longitude}, cos(latitude) : {cos_lat}')
        # Haversine formula
        l_ref["delta_lat_term"] = ( np.sin( (l_ref["latitude"] - latitude) * 0.5 ) )**2
        l_ref["delta_long_term"] = ( np.sin( (l_ref["longitude"] - longitude) * 0.5) )**2
        l_ref["a"] = l_ref["delta_lat_term"] + l_ref["delta_long_term"] \
                        * cos_lat * l_ref["cos_latitude"]
        l_ref["sqrt_a"] = l_ref["a"].apply( lambda x : np.sqrt(x) )
        l_ref["sqrt_1_a"] = l_ref["a"].apply(lambda x : np.sqrt(1.- x) )
        l_ref["angle"] = np.arctan2( np.sqrt(l_ref["a"]), np.sqrt( 1. - l_ref["a"] ) )
        closest = l_ref["angle"].idxmin()
        #print(closest)
        return closest, l_ref.iloc[closest]
    

In [121]:
df_stations_precompute =precompute_distance(df_stations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [101]:
df_stations_precompute.count()

23556

In [123]:
df_stations_precompute.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude_degrees: float (nullable = true)
 |-- longitude_degrees: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- cos_latitude: double (nullable = true)



In [124]:
df_stations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-170.76669311523438, min(latitude_degrees)=13.389399528503418, min(latitude)=0.2336890995502472, min(longitude)=-2.980441093444824)]

In [125]:
df_stations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=145.74969482421875, max(latitude_degrees)=71.32140350341797, max(latitude)=1.2447932958602905, max(longitude)=2.5438120365142822)]

In [167]:
df_locations_precompute = precompute_distance(df_locations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [168]:
df_locations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-164.1889190673828, min(latitude_degrees)=13.444, min(latitude)=0.23464205861091614, min(longitude)=-2.8656373023986816)]

In [169]:
df_locations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=178.33880615234375, max(latitude_degrees)=69.4493408203125, max(latitude)=1.212119698524475, max(longitude)=3.1125993728637695)]

In [137]:
df_locations.sort("latitude", ascending = False).show(10)

+-----+--------------------+------------------+-------------------+------+
| fips|              county|          latitude|          longitude| state|
+-----+--------------------+------------------+-------------------+------+
|02998|Yakutat plus Hoon...|               NaN|                NaN|Alaska|
|02997|Bristol Bay plus ...|               NaN|                NaN|Alaska|
|02185| North Slope Borough|  69.4493408203125| -153.4728240966797|Alaska|
|02188|Northwest Arctic ...| 67.00506591796875|-160.02108764648438|Alaska|
| null|             Unknown|         66.160507|-153.36914099999998|Alaska|
|02290|Yukon-Koyukuk Cen...| 65.37572479248047|-151.57785034179688|Alaska|
|02180|    Nome Census Area| 64.78368377685547| -164.1889190673828|Alaska|
|02090|Fairbanks North S...| 64.67604064941406|-146.54815673828125|Alaska|
|02240|Southeast Fairban...| 63.86499786376953| -143.2186279296875|Alaska|
|02068|      Denali Borough|63.682037353515625| -150.0270233154297|Alaska|
+-----+------------------

In [138]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)



In [140]:
df_locations.where(col("latitude") == "NaN").show()

+-----+--------------------+--------+---------+------+
| fips|              county|latitude|longitude| state|
+-----+--------------------+--------+---------+------+
|02997|Bristol Bay plus ...|     NaN|      NaN|Alaska|
|02998|Yakutat plus Hoon...|     NaN|      NaN|Alaska|
+-----+--------------------+--------+---------+------+



In [141]:
df_locations = df_locations.replace("NaN", None)

In [142]:
df_locations.sort("latitude", ascending = False).show(10)

+-----+--------------------+------------------+-------------------+------+
| fips|              county|          latitude|          longitude| state|
+-----+--------------------+------------------+-------------------+------+
|02998|Yakutat plus Hoon...|               NaN|                NaN|Alaska|
|02997|Bristol Bay plus ...|               NaN|                NaN|Alaska|
|02185| North Slope Borough|  69.4493408203125| -153.4728240966797|Alaska|
|02188|Northwest Arctic ...| 67.00506591796875|-160.02108764648438|Alaska|
| null|             Unknown|         66.160507|-153.36914099999998|Alaska|
|02290|Yukon-Koyukuk Cen...| 65.37572479248047|-151.57785034179688|Alaska|
|02180|    Nome Census Area| 64.78368377685547| -164.1889190673828|Alaska|
|02090|Fairbanks North S...| 64.67604064941406|-146.54815673828125|Alaska|
|02240|Southeast Fairban...| 63.86499786376953| -143.2186279296875|Alaska|
|02068|      Denali Borough|63.682037353515625| -150.0270233154297|Alaska|
+-----+------------------