In [1]:
# mapping of covid locations (fips + GPS) to the closest weather station

In [2]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame, Window
from collections import OrderedDict
import pandas as pd
import numpy as np

In [3]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [4]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [5]:
spark = create_spark_session()

In [6]:
# Load relevant stations with weather element
path = os.path.join(project_path, "OUT_DATA", "filtered_stations")
selected_stations = spark.read.parquet(path)

In [7]:
selected_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)



In [7]:
selected_stations.select("measured").distinct().show()

+--------+
|measured|
+--------+
|    TMIN|
|    SNOW|
|    AWND|
|    PRCP|
+--------+



In [8]:
# load all stations, with GPS location
raw_stations = spark.read.csv( os.path.join(project_path, "DATA", "WEATHER", "ghcnd-stations.txt"))

In [9]:
# parse raw stations into columns
@udf(MapType( StringType(), StringType()))
def ParseStationsUDF(line):
    return{
        "station_id": line[0:11],
        "latitude" : line[13:20], 
        "longitude" : line[21:30], 
        "elevation" : line[31:38], 
        "state" : line[38:40], 
        "station_name" : line[41:]
        
    }

fields = OrderedDict( [
        ( "station_id" , "string"),
        ( "latitude" , "float"), 
        ("longitude" , "float"), 
        ("elevation" , "float"),
        ("state" , "string"), 
        ("station_name" , "string")
] )

#exprs = [ f"parsed['{field}'].cast({fld_type}) as {field}" for field, fld_type in fields.items() ]
exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

df_stations = raw_stations.withColumn("parsed", ParseStationsUDF("_c0")).selectExpr( *exprs)

In [10]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



In [11]:
df_stations = df_stations.join(selected_stations, ["station_id"])

In [21]:
df_stations.count()

23556

In [22]:
selected_stations.count()

23556

In [23]:
raw_stations.count()

118492

In [17]:
wgps = Window.partitionBy("measured","latitude", "longitude").orderBy("station_id")
df_stations_filtered = df_stations.withColumn("n", F.rank().over(wgps)) \
    .where(col("n") == 1)

In [33]:
# find stations with same GPS coordinates
wgps = Window.partitionBy("measured","latitude", "longitude")
multi_stations = df_stations.withColumn("n", F.count("station_id").over(wgps)).where(col("n") > 1).cache()
multi_stations.count()

34

In [35]:
mulst_pd = multi_stations.toPandas()

In [36]:
# some stations have different elevations for same latitude, longitude => pylon ?
mulst_pd.sort_values("latitude")

Unnamed: 0,station_id,latitude,longitude,elevation,state,station_name,measured,n
4,USC00082008,29.633301,-83.105301,12.8,FL,CROSS CITY 1 E,TMIN,2
5,USW00012833,29.633301,-83.105301,11.6,FL,CROSS CITY AP 72212,TMIN,2
10,USC00082008,29.633301,-83.105301,12.8,FL,CROSS CITY 1 E,PRCP,2
11,USW00012833,29.633301,-83.105301,11.6,FL,CROSS CITY AP 72212,PRCP,2
22,US1NMDA0371,32.6161,-106.740303,1315.800049,NM,LAS CRUCES 19.4 N,PRCP,2
23,USC00294426,32.6161,-106.740303,1316.099976,NM,JORNADA EXP RANGE HCN,PRCP,2
29,USW00093806,33.211899,-87.616096,45.700001,AL,TUSCALOOSA MUNI AP HCN,PRCP,2
2,USC00018380,33.211899,-87.616096,51.5,AL,TUSCALOOSA ACFD,TMIN,2
3,USW00093806,33.211899,-87.616096,45.700001,AL,TUSCALOOSA MUNI AP HCN,TMIN,2
28,USC00018380,33.211899,-87.616096,51.5,AL,TUSCALOOSA ACFD,PRCP,2


In [28]:
mult2 = df_stations.groupBy("measured", "latitude", "longitude").count().where(col("count")>1).cache()

In [29]:
mult2.count()

17

In [32]:
mult2.show()

+--------+--------+---------+-----+
|measured|latitude|longitude|count|
+--------+--------+---------+-----+
|    PRCP| 40.5147|-102.9906|    2|
|    TMIN| 33.2119| -87.6161|    2|
|    TMIN| 29.6333| -83.1053|    2|
|    PRCP| 35.4033| -92.3828|    2|
|    TMIN|   44.42| -72.0194|    2|
|    PRCP| 29.6333| -83.1053|    2|
|    TMIN| 42.1244| -86.4267|    2|
|    TMIN| 42.2122| -71.1136|    2|
|    PRCP| 40.5949|  -98.427|    2|
|    PRCP|   44.42| -72.0194|    2|
|    PRCP|   38.49|-101.3669|    2|
|    PRCP| 32.6161|-106.7403|    2|
|    PRCP| 40.0225| -95.7883|    2|
|    PRCP| 42.2122| -71.1136|    2|
|    PRCP| 33.2119| -87.6161|    2|
|    SNOW| 42.2122| -71.1136|    2|
|    SNOW| 40.5949|  -98.427|    2|
+--------+--------+---------+-----+



In [108]:
df_stations_filtered.count()

23539

In [26]:
df_stations.groupBy("measured", "station_id").count().orderBy("count", ascending = False).show()

+--------+-----------+-----+
|measured| station_id|count|
+--------+-----------+-----+
|    PRCP|US10chey019|    1|
|    PRCP|US10nema010|    1|
|    PRCP|US1AZPM0266|    1|
|    PRCP|US1AZYV0153|    1|
|    PRCP|US1CASD0104|    1|
|    SNOW|US1COLR1020|    1|
|    SNOW|US1CTMD0021|    1|
|    PRCP|US1ILKN0078|    1|
|    PRCP|US1KSEL0006|    1|
|    PRCP|US1KSJF0009|    1|
|    PRCP|US1KYOL0008|    1|
|    PRCP|US1MNBK0010|    1|
|    PRCP|US1MNSR0048|    1|
|    SNOW|US1MOSN0002|    1|
|    SNOW|US1NCBC0051|    1|
|    PRCP|US1NCBC0091|    1|
|    PRCP|US1NCRW0002|    1|
|    PRCP|US1NYRN0001|    1|
|    PRCP|US1OHAT0006|    1|
|    PRCP|US1OHGR0026|    1|
+--------+-----------+-----+
only showing top 20 rows



In [27]:
# Load NYT locations (FIPS + GPS)
path = os.path.join(project_path, "OUT_DATA", "nyt_locations_geography")
df_locations = spark.read.parquet(path)

In [28]:
df_locations = df_locations.where( ( ~ F.isnan("latitude") ) | (~ F.isnan("longitude")) )

In [29]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)



In [30]:
def precompute_distance(l_ref : DataFrame) -> DataFrame:
    l_ref = l_ref.withColumnRenamed("latitude", "latitude_degrees")
    l_ref = l_ref.withColumnRenamed( "longitude", "longitude_degrees") 
    @udf( FloatType())
    def degree_to_radian(x):
        return  x* np.pi / 180.
    l_ref = l_ref.withColumn("latitude", degree_to_radian("latitude_degrees") )
    l_ref = l_ref.withColumn("longitude", degree_to_radian("longitude_degrees") )
    l_ref = l_ref.withColumn("cos_latitude", F.cos("latitude") )  
    print(type(l_ref))
    return l_ref

In [109]:
df_stations_precompute =precompute_distance(df_stations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [32]:
df_stations_precompute.count()

23556

In [22]:
df_stations_precompute.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude_degrees: float (nullable = true)
 |-- longitude_degrees: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- cos_latitude: double (nullable = true)



In [23]:
df_stations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-170.76669311523438, min(latitude_degrees)=13.389399528503418, min(latitude)=0.2336890995502472, min(longitude)=-2.980441093444824)]

In [24]:
df_stations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=145.74969482421875, max(latitude_degrees)=71.32140350341797, max(latitude)=1.2447932958602905, max(longitude)=2.5438120365142822)]

In [33]:
df_locations_precompute = precompute_distance(df_locations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [26]:
df_locations_precompute.agg({"latitude" : "min", "longitude" : "min", "latitude_degrees" : "min", "longitude_degrees" : "min" } ).collect()

[Row(min(longitude_degrees)=-164.1889190673828, min(latitude_degrees)=13.444, min(latitude)=0.23464205861091614, min(longitude)=-2.8656373023986816)]

In [27]:
df_locations_precompute.agg({"latitude" : "max", "longitude" : "max", "latitude_degrees" : "max", "longitude_degrees" : "max" } ).collect()

[Row(max(longitude_degrees)=178.33880615234375, max(latitude_degrees)=69.4493408203125, max(latitude)=1.212119698524475, max(longitude)=3.1125993728637695)]

In [34]:
df_locations_precompute.count()

3272

# closest station for all fips

In [110]:
sub_fips = df_locations_precompute\
    .select("fips", "county", "state", "latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_fips")\
    .withColumnRenamed("longitude", "longitude_fips")\
    .withColumnRenamed("cos_latitude", "cos_latitude_fips")

In [111]:
sub_stations = df_stations_precompute\
    .select("station_id", "measured","latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_station")\
    .withColumnRenamed("longitude", "longitude_station")\
    .withColumnRenamed("cos_latitude", "cos_latitude_station")

In [112]:
fips_cross_stations = sub_fips.crossJoin(sub_stations)

In [56]:
%%time
fips_cross_stations.count()

CPU times: user 2.79 ms, sys: 467 µs, total: 3.26 ms
Wall time: 8.61 s


77075232

In [57]:
fips_cross_stations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)



In [58]:
def delta_coord(col1, col2):
    return F.pow( F.sin(0.5* (col1 - col2 ) ), 2 )

In [59]:
def haversine( lat1, long1, cos_lat1, lat2, long2, cos_lat2):
    ''' computation of angular distance between 2 locations given by GPS coordinates,
    using haversine formula.
    exact formulas (maybe overkill), taken from :
    https://www.movable-type.co.uk/scripts/latlong.html
    '''    
    delta_lat = delta_coord(lat1, lat2)
    delta_long = delta_coord(long1, long2)
    a = delta_lat + delta_long * cos_lat1 * cos_lat2
    return  F.atan2( F.sqrt(a), F.sqrt( 1.-a ) )

In [113]:
# compute all pair-wise distances btw fips and weather stations
dist_col = "angle"
df_cross_distance = fips_cross_stations.withColumn(dist_col,                                  
        haversine(col("latitude_fips"), col("longitude_fips"),col("cos_latitude_fips"), 
                    col("latitude_station"), col("longitude_station"), col("cos_latitude_station")) )

In [188]:
%%time
df_cross_distance.show(10)

+-----+-------------+--------------+-------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+
| fips|latitude_fips|longitude_fips|  cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|              angle|
+-----+-------------+--------------+-------------------+-----------+--------+----------------+-----------------+--------------------+-------------------+
|02198|    0.9718477|     -2.324122| 0.5637744149985534|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108| 0.4400899995854632|
|02240|    1.1146545|    -2.4996367|0.44048764879312186|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108|  0.463387150877088|
|02261|    1.0669032|    -2.5544662| 0.4828384076125879|AQC00914000|    PRCP|      0.24987355|        -2.980441|  0.9689436985050108|0.43646667760730384|
|02090|    1.1288098|    -2.5577478| 0.4277359281117351|AQC00914000|    PRCP

In [189]:
%%time
df_cross_distance.count()

CPU times: user 3.16 ms, sys: 0 ns, total: 3.16 ms
Wall time: 7.62 s


77075232

In [61]:
df_cross_distance.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)



In [63]:
df_cross_distance.groupby("fips", "county", "state", "measured").count().count()

13088

In [64]:
df_cross_distance.select("fips").distinct().count()

3217

In [88]:
df_min_distance.unpersist()

DataFrame[fips: string, county: string, state: string, latitude_fips: float, longitude_fips: float, cos_latitude_fips: double, station_id: string, measured: string, latitude_station: float, longitude_station: float, cos_latitude_station: double, angle: double, distance: double]

In [114]:
# for each measurement and fips, keep only the station with the smallest distance (e.g. smallest angle)
window = Window.partitionBy("measured", "fips", "county", "state")
df_min_distance = df_cross_distance.withColumn("min_angle", F.min(dist_col).over(window))\
            .filter( col(dist_col) == col("min_angle") ) \
            .drop("min_angle")

In [77]:
df_cross_distance.distinct().count()

77075232

In [69]:
R_earth = 6371

In [115]:
# compute distance from angle
df_min_distance = df_min_distance.withColumn("distance", R_earth * col("angle"))

In [117]:
%%time
# TODO : should be 13088 !!!
df_min_distance.count()

CPU times: user 47.6 ms, sys: 20 ms, total: 67.5 ms
Wall time: 2min 43s


13095

In [92]:
df_min_distance.groupby("fips", "county", "state").count().orderBy("count", ascending = False).show()

+-----+-------------+--------------+-----+
| fips|       county|         state|count|
+-----+-------------+--------------+-----+
|12029|        Dixie|       Florida|    6|
|12041|    Gilchrist|       Florida|    5|
|50005|    Caledonia|       Vermont|    5|
|31001|        Adams|      Nebraska|    5|
|26159|    Van Buren|      Michigan|    5|
|20203|      Wichita|        Kansas|    5|
|29219|       Warren|      Missouri|    4|
| null|      Unknown|       Vermont|    4|
|08057|      Jackson|      Colorado|    4|
|37065|    Edgecombe|North Carolina|    4|
|18145|       Shelby|       Indiana|    4|
|12065|    Jefferson|       Florida|    4|
|47179|   Washington|     Tennessee|    4|
|51590|Danville city|      Virginia|    4|
|20115|       Marion|        Kansas|    4|
|54105|         Wirt| West Virginia|    4|
|56031|       Platte|       Wyoming|    4|
|48111|       Dallam|         Texas|    4|
|48483|      Wheeler|         Texas|    4|
|19015|        Boone|          Iowa|    4|
+-----+----

In [75]:
df_min_distance.where( col("fips") == 12041).show()

+-----+------+-------+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+--------------------+------------------+
| fips|county|  state|latitude_fips|longitude_fips| cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|               angle|          distance|
+-----+------+-------+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+--------------------+------------------+
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026203480482|USC00082008|    PRCP|       0.5171987|       -1.4504611|  0.8692076921622199|0.001217431579044...| 7.756256590094734|
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026203480482|USW00012833|    PRCP|       0.5171987|       -1.4504611|  0.8692076921622199|0.001217431579044...| 7.756256590094734|
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026

In [93]:
df_min_distance.where( col("fips") == 12029).show()

+-----+------+-------+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+--------------------+
| fips|county|  state|latitude_fips|longitude_fips| cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|               angle|
+-----+------+-------+-------------+--------------+------------------+-----------+--------+----------------+-----------------+--------------------+--------------------+
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026203480482|USC00082008|    PRCP|       0.5171987|       -1.4504611|  0.8692076921622199|0.001217431579044...|
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026203480482|USW00012833|    PRCP|       0.5171987|       -1.4504611|  0.8692076921622199|0.001217431579044...|
|12029| Dixie|Florida|    0.5157915|    -1.4527463|0.8699026203480482|USW00012816|    AWND|      0.51822144|       -1.4359801|  0.8687015387992307|0.007388

In [206]:
# join with fips and stations DB to get all relevant data
res_detailed = df_min_distance.alias("closest").join( df_locations.alias("fips"), "fips") \
        .join(df_stations.alias("station"), "station_id")

In [269]:
res_detailed.count()

33920

In [207]:
res_detailed.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [251]:
selected = ["fips"]
selected.extend( ["fips." + name for name in ["county", "state"] ])
selected.append("station_id")
d_select = {
           "station" : [ "station_name", "state"],
            "closest" : ["angle", "distance", "measured"]           
           }
selected.extend( [ k + "." + vi  for k,v in d_select.items() for vi in v ] )
print(selected)

['fips', 'fips.county', 'fips.state', 'station_id', 'station.station_name', 'station.state', 'closest.angle', 'closest.distance', 'closest.measured']


In [252]:
# keep only relevant information
res_short = res_detailed.select(*selected).cache()

In [253]:
res_short.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- measured: string (nullable = true)



In [254]:
%%time
# find fips and measurement with 
res_short.sort("distance", ascending = False).show(30)

+-----+--------------------+------+-----------+--------------------+-----+--------------------+------------------+--------+
| fips|              county| state| station_id|        station_name|state|               angle|          distance|measured|
+-----+--------------------+------+-----------+--------------------+-----+--------------------+------------------+--------+
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|  0.2955648047390893| 1883.043370992738|    SNOW|
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|  0.2955648047390893| 1883.043370992738|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.29521110773626774|1880.7899673877619|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.29521110773626774|1880.7899673877619|    SNOW|
|15005|             Kalawao|Hawaii|USC00502587|DUTCH HARBOR     ...|   AK|   0.292156590936852| 1861.329640858684|    SNOW|
|15005| 

In [None]:
a_df_locations = df_locations.select( *( col(name).alias(name + "_FIPS") for name in ["county", "state"])

In [82]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [109]:
out_path = os.path.join(project_path, "OUT_DATA", "stations_per_fips")
res.write.partitionBy("measured").format("parquet").save(out_path)

## Outliers

In [263]:
outliers = res_short.where(col("distance")>100).cache()

In [111]:
%%time
outliers.count()

CPU times: user 57.5 ms, sys: 34.7 ms, total: 92.2 ms
Wall time: 2min


56

In [264]:
outliers.groupBy("measured").count().show()

+--------+-----+
|measured|count|
+--------+-----+
|    TMIN|    2|
|    SNOW|   46|
|    AWND|    6|
|    PRCP|    2|
+--------+-----+



In [265]:
outliers.sort("distance", ascending = False).show()

+-----+--------------------+------+-----------+--------------------+-----+-------------------+------------------+--------+
| fips|              county| state| station_id|        station_name|state|              angle|          distance|measured|
+-----+--------------------+------+-----------+--------------------+-----+-------------------+------------------+--------+
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.2955648047390893| 1883.043370992738|    SNOW|
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.2955648047390893| 1883.043370992738|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|0.29521110773626774|1880.7899673877619|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|0.29521110773626774|1880.7899673877619|    SNOW|
|15005|             Kalawao|Hawaii|USC00502587|DUTCH HARBOR     ...|   AK|  0.292156590936852| 1861.329640858684|    SNOW|
|15005|         

In [260]:
%%time
pd_outliers = outliers.toPandas()

CPU times: user 32.4 ms, sys: 9.95 ms, total: 42.3 ms
Wall time: 911 ms


In [261]:
pd_outliers.sort_values("distance", ascending = False)

Unnamed: 0,fips,county,fips_state,latitude,longitude,elevation,station_state,station_name,measured,angle,distance
34,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295565,1883.043371
35,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295565,1883.043371
36,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295211,1880.789967
37,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295211,1880.789967
44,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.292157,1861.329641
45,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,TMIN,0.292157,1861.329641
46,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.292157,1861.329641
52,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.2884,1837.397841
50,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.2884,1837.397841
51,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,TMIN,0.2884,1837.397841


In [123]:
df_locations.where(col("state") == "Hawaii").toPandas()

Unnamed: 0,fips,county,latitude,longitude,state
0,15003.0,Honolulu,21.461365,-158.201981,Hawaii
1,15005.0,Kalawao,21.218763,-156.974014,Hawaii
2,15001.0,Hawaii,19.597763,-155.502441,Hawaii
3,15007.0,Kauai,22.012037,-159.705963,Hawaii
4,15009.0,Maui,20.85593,-156.601547,Hawaii
5,,Unknown,19.741755,-155.844437,Hawaii


In [128]:
pd_outliers[ pd_outliers["fips_state"] == "Hawaii"].sort_values( ["measured", "county"] )

Unnamed: 0,fips,county,fips_state,latitude,longitude,elevation,station_state,station_name,measured,angle,distance
37,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295211,1880.789967
52,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.2884,1837.397841
46,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.292157,1861.329641
49,15007,Kauai,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.281888,1795.907502
35,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295565,1883.043371
36,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295211,1880.789967
50,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.2884,1837.397841
44,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.292157,1861.329641
47,15007,Kauai,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.281888,1795.907502
34,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295565,1883.043371


In [133]:
df_stations.select("state").distinct().sort("state").toPandas()

Unnamed: 0,state
0,AK
1,AL
2,AR
3,AS
4,AZ
5,CA
6,CO
7,CT
8,DC
9,DE


In [134]:
df_stations.where(col("state") == "HI").toPandas()

Unnamed: 0,station_id,latitude,longitude,elevation,state,station_name,measured
0,US1HIHI0003,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP
1,US1HIHI0006,19.723801,-155.992599,304.500000,HI,KALAOA 1.7 E,PRCP
2,US1HIHI0012,19.073200,-155.663193,678.799988,HI,NAALEHU 5.7 W,PRCP
3,US1HIHI0033,20.041700,-155.477707,757.400024,HI,HONOKAA 2.5 SSW,PRCP
4,US1HIHI0034,20.060600,-155.429993,332.799988,HI,HONOKAA 2.7 ESE,PRCP
...,...,...,...,...,...,...,...
103,USW00022534,21.154400,-157.096100,135.000000,HI,MOLOKAI AP 91186,TMIN
104,USW00022534,21.154400,-157.096100,135.000000,HI,MOLOKAI AP 91186,PRCP
105,USW00022536,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,AWND
106,USW00022536,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,TMIN


In [135]:
df_locations.where(col("state") == "Hawaii").toPandas()

Unnamed: 0,fips,county,latitude,longitude,state
0,15003.0,Honolulu,21.461365,-158.201981,Hawaii
1,15005.0,Kalawao,21.218763,-156.974014,Hawaii
2,15001.0,Hawaii,19.597763,-155.502441,Hawaii
3,15007.0,Kauai,22.012037,-159.705963,Hawaii
4,15009.0,Maui,20.85593,-156.601547,Hawaii
5,,Unknown,19.741755,-155.844437,Hawaii


In [144]:
fips_hawaii = df_locations_precompute.where(col("state") == "Hawaii")\
    .select("fips", "latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_fips")\
    .withColumnRenamed("longitude", "longitude_fips")\
    .withColumnRenamed("cos_latitude", "cos_latitude_fips")


In [146]:
cross_hawaii = fips_hawaii.crossJoin(sub_stations)

In [147]:
cross_hawaii.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)



In [148]:
hawaii_cross_distance = cross_hawaii.withColumn(dist_col,                                  
        haversine(col("latitude_fips"), col("longitude_fips"),col("cos_latitude_fips"), 
                    col("latitude_station"), col("longitude_station"), col("cos_latitude_station")) )

In [149]:
hawaii_cross_distance.cache()

DataFrame[fips: string, latitude_fips: float, longitude_fips: float, cos_latitude_fips: double, station_id: string, measured: string, latitude_station: float, longitude_station: float, cos_latitude_station: double, angle: double]

In [171]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [173]:
a_df_stations = df_stations.select( *( col(colname).alias(colname + "_STATION") for colname in df_stations.columns) )
a_df_stations.printSchema()

root
 |-- station_id_STATION: string (nullable = true)
 |-- latitude_STATION: float (nullable = true)
 |-- longitude_STATION: float (nullable = true)
 |-- elevation_STATION: float (nullable = true)
 |-- state_STATION: string (nullable = true)
 |-- station_name_STATION: string (nullable = true)
 |-- measured_STATION: string (nullable = true)



In [175]:
hawaii_cross_distance_detail = hawaii_cross_distance.join(a_df_stations, col("station_id") == col("station_id_STATION") )
    

In [176]:
hawaii_cross_distance.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)



In [177]:
hawaii_cross_distance_detail.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)
 |-- station_id_STATION: string (nullable = true)
 |-- latitude_STATION: float (nullable = true)
 |-- longitude_STATION: float (nullable = true)
 |-- elevation_STATION: float (nullable = true)
 |-- state_STATION: string (nullable = true)
 |-- station_name_STATION: string (nullable = true)
 |-- measured_STATION: string (nullable = true)



In [178]:
hawaii_cross_distance_detail.cache()

DataFrame[fips: string, latitude_fips: float, longitude_fips: float, cos_latitude_fips: double, station_id: string, measured: string, latitude_station: float, longitude_station: float, cos_latitude_station: double, angle: double, station_id_STATION: string, latitude_STATION: float, longitude_STATION: float, elevation_STATION: float, state_STATION: string, station_name_STATION: string, measured_STATION: string]

In [179]:
hawaii_cross_distance_detail = hawaii_cross_distance_detail.withColumn("distance", R_earth * col("angle"))

In [180]:
w = Window.partitionBy("measured", "fips")
min_hawaii =hawaii_cross_distance_detail.withColumn("min_angle", F.min("angle").over(w))\
    .where(col("min_angle") == col("angle"))

In [182]:
min_hawaii.toPandas()

Unnamed: 0,fips,latitude_fips,longitude_fips,cos_latitude_fips,station_id,measured,latitude_station,longitude_station,cos_latitude_station,angle,station_id_STATION,latitude_STATION,longitude_STATION,elevation_STATION,state_STATION,station_name_STATION,measured_STATION,distance,min_angle
0,,0.344559,-2.719999,0.941225,USR0000HPUW,TMIN,0.345488,-2.720014,0.94091,0.000465,USR0000HPUW,19.795,-155.845306,709.0,HI,PUU WAAWAA HAWAII,TMIN,2.960615,0.000465
1,15003.0,0.374571,-2.761145,0.930664,US1HIHN0014,PRCP,0.373378,-2.756199,0.9311,0.002378,US1HIHN0014,21.393,-157.918594,175.0,HI,AIEA 0.7 ENE,PRCP,15.150998,0.002378
2,15009.0,0.364005,-2.733213,0.934479,USC00518407,TMIN,0.365273,-2.73447,0.934026,0.000864,USC00518407,20.9286,-156.673599,128.300003,HI,PUUKOLII 457.1,TMIN,5.507033,0.000864
3,15009.0,0.364005,-2.733213,0.934479,USC00518407,TMIN,0.365273,-2.73447,0.934026,0.000864,USC00518407,20.9286,-156.673599,128.300003,HI,PUUKOLII 457.1,PRCP,5.507033,0.000864
4,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,AWND,24.475917,0.003842
5,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,TMIN,24.475917,0.003842
6,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,PRCP,24.475917,0.003842
7,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,1861.329641,0.292157
8,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,TMIN,1861.329641,0.292157
9,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,1861.329641,0.292157


In [164]:
df_hawaii = hawaii_cross_distance_detail.where(col("state_station") == "HI").toPandas()

ValueError: Grouper for 'measured' not 1-dimensional

In [168]:
df_hawaii

Unnamed: 0,station_id,fips,latitude_fips,longitude_fips,cos_latitude_fips,measured,latitude_station,longitude_station,cos_latitude_station,angle,latitude,longitude,elevation,state_station,station_name,measured.1,distance
0,US1HIHI0003,,0.344559,-2.719999,0.941225,PRCP,0.340414,-2.703803,0.942616,0.007904,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,50.355917
1,US1HIHI0003,15009,0.364005,-2.733213,0.934479,PRCP,0.340414,-2.703803,0.942616,0.018155,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,115.666874
2,US1HIHI0003,15007,0.384183,-2.787395,0.927105,PRCP,0.340414,-2.703803,0.942616,0.044787,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,285.341003
3,US1HIHI0003,15001,0.342045,-2.714030,0.942071,PRCP,0.340414,-2.703803,0.942616,0.004887,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,31.134920
4,US1HIHI0003,15005,0.370337,-2.739713,0.932205,PRCP,0.340414,-2.703803,0.942616,0.022520,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,143.478064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,USW00022536,15009,0.364005,-2.733213,0.934479,AWND,0.383691,-2.781016,0.927289,0.024330,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,155.007730
944,USW00022536,15007,0.384183,-2.787395,0.927105,AWND,0.383691,-2.781016,0.927289,0.002967,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,18.904727
945,USW00022536,15001,0.342045,-2.714030,0.942071,AWND,0.383691,-2.781016,0.927289,0.037601,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,239.554080
946,USW00022536,15005,0.370337,-2.739713,0.932205,AWND,0.383691,-2.781016,0.927289,0.020329,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,129.513057
