In [1]:
# mapping of covid locations (fips + GPS) to the closest weather station

In [2]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType, FloatType
from pyspark.sql import DataFrame, Window
from collections import OrderedDict
import pandas as pd
import numpy as np

In [3]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [4]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [5]:
spark = create_spark_session()

In [6]:
# Load relevant stations with weather element
path = os.path.join(project_path, "OUT_DATA", "filtered_stations")
selected_stations = spark.read.parquet(path)

In [7]:
selected_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)



In [8]:
selected_stations.select("measured").distinct().show()

+--------+
|measured|
+--------+
|    TMIN|
|    SNOW|
|    AWND|
|    PRCP|
+--------+



In [9]:
# load all stations, with GPS location
raw_stations = spark.read.csv( os.path.join(project_path, "DATA", "WEATHER", "ghcnd-stations.txt"))

In [10]:
# parse raw stations into columns
@udf(MapType( StringType(), StringType()))
def ParseStationsUDF(line):
    return{
        "station_id": line[0:11],
        "latitude" : line[13:20], 
        "longitude" : line[21:30], 
        "elevation" : line[31:38], 
        "state" : line[38:40], 
        "station_name" : line[41:]
        
    }

fields = OrderedDict( [
        ( "station_id" , "string"),
        ( "latitude" , "float"), 
        ("longitude" , "float"), 
        ("elevation" , "float"),
        ("state" , "string"), 
        ("station_name" , "string")
] )

#exprs = [ f"parsed['{field}'].cast({fld_type}) as {field}" for field, fld_type in fields.items() ]
exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

df_stations = raw_stations.withColumn("parsed", ParseStationsUDF("_c0")).selectExpr( *exprs)

In [11]:
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



In [12]:
df_stations = df_stations.join(selected_stations, ["station_id"])

In [65]:
df_stations.count()

23556

In [13]:
#selected_stations.count()

23556

In [14]:
#raw_stations.count()

118492

In [13]:
# some stations have identical latitude and longitude => keep only one within a GPS group
w = Window.partitionBy("latitude", "longitude").orderBy("elevation", "station_id")
df_stations_dedup = df_stations.withColumn("rank_elevation", F.rank().over(w))\
    .where( col("rank_elevation") == 1)\
    .drop("rank_elevation")\
    .cache()

In [14]:
df_stations_dedup.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [15]:
df_stations_dedup.count()

23531

In [16]:
w = Window.partitionBy("station_id").orderBy("measured")
unique_stations = df_stations_dedup.withColumn("dummy", F.rank().over(w) )\
    .where(col("dummy") == 1) \
    .drop("dummy")


In [105]:
unique_stations.count()

14006

In [123]:
# Load NYT locations (FIPS + GPS)
path = os.path.join(project_path, "OUT_DATA", "nyt_locations_geography")
df_locations = spark.read.parquet(path)
df_locations = df_locations.where( ( ~ F.isnan("latitude") ) | (~ F.isnan("longitude")) )
df_locations = df_locations.withColumn("location_id", F.monotonically_increasing_id())

In [124]:
df_locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- state: string (nullable = true)
 |-- location_id: long (nullable = false)



In [125]:
df_locations.count()

3272

In [20]:
def precompute_distance(l_ref : DataFrame) -> DataFrame:
    l_ref = l_ref.withColumnRenamed("latitude", "latitude_degrees")
    l_ref = l_ref.withColumnRenamed( "longitude", "longitude_degrees") 
    @udf( FloatType())
    def degree_to_radian(x):
        return  x* np.pi / 180.
    l_ref = l_ref.withColumn("latitude", degree_to_radian("latitude_degrees") )
    l_ref = l_ref.withColumn("longitude", degree_to_radian("longitude_degrees") )
    l_ref = l_ref.withColumn("cos_latitude", F.cos("latitude") )  
    print(type(l_ref))
    return l_ref

In [21]:
df_stations_precompute =precompute_distance(df_stations_dedup)

<class 'pyspark.sql.dataframe.DataFrame'>


In [126]:
df_locations_precompute = precompute_distance(df_locations)

<class 'pyspark.sql.dataframe.DataFrame'>


In [75]:
df_stations_precompute.where(F.isnan("cos_latitude") ).show()

+----------+----------------+-----------------+---------+-----+------------+--------+--------+---------+------------+
|station_id|latitude_degrees|longitude_degrees|elevation|state|station_name|measured|latitude|longitude|cos_latitude|
+----------+----------------+-----------------+---------+-----+------------+--------+--------+---------+------------+
+----------+----------------+-----------------+---------+-----+------------+--------+--------+---------+------------+



# closest station for all fips

In [128]:
sub_fips = df_locations_precompute\
    .select("location_id", "fips", "county", "state", "latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_fips")\
    .withColumnRenamed("longitude", "longitude_fips")\
    .withColumnRenamed("cos_latitude", "cos_latitude_fips")

In [24]:
sub_stations = df_stations_precompute\
    .select("station_id", "measured","latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_station")\
    .withColumnRenamed("longitude", "longitude_station")\
    .withColumnRenamed("cos_latitude", "cos_latitude_station")

In [129]:
fips_cross_stations = sub_fips.crossJoin(sub_stations)

In [130]:
%%time
fips_cross_stations.count()
#76993432

CPU times: user 34.6 ms, sys: 4.22 ms, total: 38.8 ms
Wall time: 5.56 s


76993432

In [43]:
fips_cross_stations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)



In [26]:
def delta_coord(col1, col2):
    return F.pow( F.sin(0.5* (col1 - col2 ) ), 2 )

In [27]:
def haversine( lat1, long1, cos_lat1, lat2, long2, cos_lat2):
    ''' computation of angular distance between 2 locations given by GPS coordinates,
    using haversine formula.
    exact formulas (maybe overkill), taken from :
    https://www.movable-type.co.uk/scripts/latlong.html
    '''    
    delta_lat = delta_coord(lat1, lat2)
    delta_long = delta_coord(long1, long2)
    a = delta_lat + delta_long * cos_lat1 * cos_lat2
    return  F.atan2( F.sqrt(a), F.sqrt( 1.-a ) )

In [131]:
# compute all pair-wise distances btw fips and weather stations
dist_col = "angle"
df_cross_distance = fips_cross_stations.withColumn(dist_col,                                  
        haversine(col("latitude_fips"), col("longitude_fips"),col("cos_latitude_fips"), 
                    col("latitude_station"), col("longitude_station"), col("cos_latitude_station")) )

In [132]:
# for each measurement and fips, keep only the station with the smallest distance (e.g. smallest angle)
window = Window.partitionBy("measured", "fips", "county", "state")
df_min_distance = df_cross_distance.withColumn("min_angle", F.min(dist_col).over(window))\
            .filter( col(dist_col) == col("min_angle") ) \
            .drop("min_angle")

In [133]:
R_earth = 6371

In [53]:
df_cross_distance.count()

77040494

In [55]:
df_locations_precompute.count()

3274

In [56]:
3274*4

13096

In [134]:
# compute distance from angle
df_min_distance = df_min_distance.withColumn("distance", R_earth * col("angle"))

In [135]:
# number of stations having the same min distance for each fips
w = Window.partitionBy("measured", "fips", "county", "state")
debug_1 = df_min_distance.withColumn("count", F.count("station_id").over(w))\
    .orderBy("count", ascending = False)

In [60]:
debug_1.show()

+-----+--------------------+------+-------------+--------------+-----------------+-----------+--------+----------------+-----------------+--------------------+-----+--------+-----+
| fips|              county| state|latitude_fips|longitude_fips|cos_latitude_fips| station_id|measured|latitude_station|longitude_station|cos_latitude_station|angle|distance|count|
+-----+--------------------+------+-------------+--------------+-----------------+-----------+--------+----------------+-----------------+--------------------+-----+--------+-----+
|02998|Yakutat plus Hoon...|Alaska|          NaN|           NaN|              NaN|US1AZPM0185|    PRCP|        0.565354|       -1.9371619|  0.8443990158790373|  NaN|     NaN|12371|
|02998|Yakutat plus Hoon...|Alaska|          NaN|           NaN|              NaN|US1AZPM0112|    PRCP|       0.5658759|       -1.9370414|  0.8441193122887083|  NaN|     NaN|12371|
|02998|Yakutat plus Hoon...|Alaska|          NaN|           NaN|              NaN|USC00142432| 

In [136]:
%%time
# must be 13088 
df_min_distance.count()

CPU times: user 71.6 ms, sys: 19.9 ms, total: 91.6 ms
Wall time: 1min 23s


13088

In [137]:
# join with fips and stations DB to get all relevant data
res_detailed = df_min_distance.alias("closest").join( df_locations.alias("fips"), ["location_id"]) \
        .join(unique_stations.alias("station"), "station_id")

In [138]:
%%time
res_detailed.count()

CPU times: user 122 ms, sys: 37.4 ms, total: 160 ms
Wall time: 2min 30s


13088

In [140]:
res_short = res_detailed.select("station.measured", "location_id", "fips.fips", "fips.county", col("fips.state").alias("fips_state"),
                    "station_id", "station_name", col("station.state").alias("station_state"),
                      "angle", "distance"        )


In [141]:
res_short.printSchema()

root
 |-- measured: string (nullable = true)
 |-- location_id: long (nullable = false)
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- fips_state: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- station_state: string (nullable = true)
 |-- angle: double (nullable = true)
 |-- distance: double (nullable = true)



In [254]:
%%time
# find fips and measurement with 
res_short.sort("distance", ascending = False).show(30)

+-----+--------------------+------+-----------+--------------------+-----+--------------------+------------------+--------+
| fips|              county| state| station_id|        station_name|state|               angle|          distance|measured|
+-----+--------------------+------+-----------+--------------------+-----+--------------------+------------------+--------+
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|  0.2955648047390893| 1883.043370992738|    SNOW|
|15009|                Maui|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA|  0.2955648047390893| 1883.043370992738|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.29521110773626774|1880.7899673877619|    SNOW|
|15001|              Hawaii|Hawaii|US1CASM0007|HALF MOON BAY 0.5...|   CA| 0.29521110773626774|1880.7899673877619|    SNOW|
|15005|             Kalawao|Hawaii|USC00502587|DUTCH HARBOR     ...|   AK|   0.292156590936852| 1861.329640858684|    SNOW|
|15005| 

In [85]:
out_path = os.path.join(project_path, "OUT_DATA", "stations_per_fips")
res_short.write.partitionBy("measured").format("parquet").mode("overwrite").save(out_path)

## Outliers

In [142]:
outliers = res_short.where(col("distance")>100).cache()

In [143]:
%%time
outliers.count()

CPU times: user 121 ms, sys: 42.6 ms, total: 163 ms
Wall time: 2min 26s


23

In [144]:
outliers.groupBy("measured").count().show()

+--------+-----+
|measured|count|
+--------+-----+
|    SNOW|    4|
|    AWND|    7|
|    PRCP|   12|
+--------+-----+



In [145]:
outliers.sort("distance", ascending = False).show()

+--------+------------+-----+--------------------+--------------------+-----------+--------------------+-------------+--------------------+------------------+
|measured| location_id| fips|              county|          fips_state| station_id|        station_name|station_state|               angle|          distance|
+--------+------------+-----+--------------------+--------------------+-----------+--------------------+-------------+--------------------+------------------+
|    PRCP|695784701972| null|             Unknown|              Hawaii|US1CASM0007|HALF MOON BAY 0.5...|           CA| 0.29661576550557106|1889.7390420359932|
|    PRCP|584115552272|15009|                Maui|              Hawaii|US1CASM0007|HALF MOON BAY 0.5...|           CA|  0.2955648047390893| 1883.043370992738|
|    PRCP|369367187475|15001|              Hawaii|              Hawaii|US1CASM0007|HALF MOON BAY 0.5...|           CA| 0.29521110773626774|1880.7899673877619|
|    PRCP|257698037778|15005|             Kala

In [146]:
%%time
pd_outliers = outliers.toPandas()

CPU times: user 16.9 ms, sys: 23.9 ms, total: 40.8 ms
Wall time: 897 ms


In [147]:
pd_outliers.sort_values("distance", ascending = False)

Unnamed: 0,measured,location_id,fips,county,fips_state,station_id,station_name,station_state,angle,distance
22,PRCP,695784701972,,Unknown,Hawaii,US1CASM0007,HALF MOON BAY 0.5 SSW,CA,0.296616,1889.739042
14,PRCP,584115552272,15009.0,Maui,Hawaii,US1CASM0007,HALF MOON BAY 0.5 SSW,CA,0.295565,1883.043371
2,PRCP,369367187475,15001.0,Hawaii,Hawaii,US1CASM0007,HALF MOON BAY 0.5 SSW,CA,0.295211,1880.789967
16,PRCP,257698037778,15005.0,Kalawao,Hawaii,USC00502587,DUTCH HARBOR,AK,0.292157,1861.329641
15,PRCP,154618822663,15003.0,Honolulu,Hawaii,USC00502587,DUTCH HARBOR,AK,0.2884,1837.397841
4,PRCP,481036337165,15007.0,Kauai,Hawaii,USC00502587,DUTCH HARBOR,AK,0.281888,1795.907502
6,PRCP,4,2016.0,Aleutians West Census Area,Alaska,USC00502587,DUTCH HARBOR,AK,0.081137,516.925004
12,AWND,4,2016.0,Aleutians West Census Area,Alaska,USW00025628,ST GEORGE ISLAND AP,AK,0.073584,468.803797
0,PRCP,4,2016.0,Aleutians West Census Area,Alaska,USW00025711,ST. PAUL 4 NE 70309,AK,0.073424,467.78338
5,PRCP,4,2016.0,Aleutians West Census Area,Alaska,USW00025711,ST. PAUL 4 NE 70309,AK,0.073424,467.78338


In [148]:
hawaii_stations = df_stations_dedup.where(col("state") == "HI").toPandas()

In [149]:
hawaii_stations["measured"].value_counts()

PRCP    68
TMIN    35
AWND     5
Name: measured, dtype: int64

In [150]:
df_locations.where(col("state") == "Hawaii").toPandas()

Unnamed: 0,fips,county,latitude,longitude,state,location_id
0,15003.0,Honolulu,21.461365,-158.201981,Hawaii,154618822663
1,15005.0,Kalawao,21.218763,-156.974014,Hawaii,257698037778
2,15001.0,Hawaii,19.597763,-155.502441,Hawaii,369367187475
3,15007.0,Kauai,22.012037,-159.705963,Hawaii,481036337165
4,15009.0,Maui,20.85593,-156.601547,Hawaii,584115552272
5,,Unknown,19.741755,-155.844437,Hawaii,695784701972


In [128]:
pd_outliers[ pd_outliers["fips_state"] == "Hawaii"].sort_values( ["measured", "county"] )

Unnamed: 0,fips,county,fips_state,latitude,longitude,elevation,station_state,station_name,measured,angle,distance
37,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295211,1880.789967
52,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.2884,1837.397841
46,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.292157,1861.329641
49,15007,Kauai,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,0.281888,1795.907502
35,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,PRCP,0.295565,1883.043371
36,15001,Hawaii,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295211,1880.789967
50,15003,Honolulu,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.2884,1837.397841
44,15005,Kalawao,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.292157,1861.329641
47,15007,Kauai,Hawaii,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,0.281888,1795.907502
34,15009,Maui,Hawaii,37.463001,-122.440804,16.5,CA,HALF MOON BAY 0.5 SSW,SNOW,0.295565,1883.043371


In [133]:
df_stations_dedup.select("state").distinct().sort("state").toPandas()

Unnamed: 0,state
0,AK
1,AL
2,AR
3,AS
4,AZ
5,CA
6,CO
7,CT
8,DC
9,DE


In [134]:
df_stations_dedup.where(col("state") == "HI").toPandas()

Unnamed: 0,station_id,latitude,longitude,elevation,state,station_name,measured
0,US1HIHI0003,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP
1,US1HIHI0006,19.723801,-155.992599,304.500000,HI,KALAOA 1.7 E,PRCP
2,US1HIHI0012,19.073200,-155.663193,678.799988,HI,NAALEHU 5.7 W,PRCP
3,US1HIHI0033,20.041700,-155.477707,757.400024,HI,HONOKAA 2.5 SSW,PRCP
4,US1HIHI0034,20.060600,-155.429993,332.799988,HI,HONOKAA 2.7 ESE,PRCP
...,...,...,...,...,...,...,...
103,USW00022534,21.154400,-157.096100,135.000000,HI,MOLOKAI AP 91186,TMIN
104,USW00022534,21.154400,-157.096100,135.000000,HI,MOLOKAI AP 91186,PRCP
105,USW00022536,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,AWND
106,USW00022536,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,TMIN


In [135]:
df_locations.where(col("state") == "Hawaii").toPandas()

Unnamed: 0,fips,county,latitude,longitude,state
0,15003.0,Honolulu,21.461365,-158.201981,Hawaii
1,15005.0,Kalawao,21.218763,-156.974014,Hawaii
2,15001.0,Hawaii,19.597763,-155.502441,Hawaii
3,15007.0,Kauai,22.012037,-159.705963,Hawaii
4,15009.0,Maui,20.85593,-156.601547,Hawaii
5,,Unknown,19.741755,-155.844437,Hawaii


In [144]:
fips_hawaii = df_locations_precompute.where(col("state") == "Hawaii")\
    .select("fips", "latitude", "longitude", "cos_latitude")\
    .withColumnRenamed("latitude", "latitude_fips")\
    .withColumnRenamed("longitude", "longitude_fips")\
    .withColumnRenamed("cos_latitude", "cos_latitude_fips")


In [146]:
cross_hawaii = fips_hawaii.crossJoin(sub_stations)

In [147]:
cross_hawaii.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)



In [148]:
hawaii_cross_distance = cross_hawaii.withColumn(dist_col,                                  
        haversine(col("latitude_fips"), col("longitude_fips"),col("cos_latitude_fips"), 
                    col("latitude_station"), col("longitude_station"), col("cos_latitude_station")) )

In [149]:
hawaii_cross_distance.cache()

DataFrame[fips: string, latitude_fips: float, longitude_fips: float, cos_latitude_fips: double, station_id: string, measured: string, latitude_station: float, longitude_station: float, cos_latitude_station: double, angle: double]

In [171]:
df_stations_dedup.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- measured: string (nullable = true)



In [173]:
a_df_stations = df_stations_dedup.select( *( col(colname).alias(colname + "_STATION") for colname in df_stations.columns) )
a_df_stations.printSchema()

root
 |-- station_id_STATION: string (nullable = true)
 |-- latitude_STATION: float (nullable = true)
 |-- longitude_STATION: float (nullable = true)
 |-- elevation_STATION: float (nullable = true)
 |-- state_STATION: string (nullable = true)
 |-- station_name_STATION: string (nullable = true)
 |-- measured_STATION: string (nullable = true)



In [175]:
hawaii_cross_distance_detail = hawaii_cross_distance.join(a_df_stations, col("station_id") == col("station_id_STATION") )
    

In [176]:
hawaii_cross_distance.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)



In [177]:
hawaii_cross_distance_detail.printSchema()

root
 |-- fips: string (nullable = true)
 |-- latitude_fips: float (nullable = true)
 |-- longitude_fips: float (nullable = true)
 |-- cos_latitude_fips: double (nullable = true)
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- latitude_station: float (nullable = true)
 |-- longitude_station: float (nullable = true)
 |-- cos_latitude_station: double (nullable = true)
 |-- angle: double (nullable = true)
 |-- station_id_STATION: string (nullable = true)
 |-- latitude_STATION: float (nullable = true)
 |-- longitude_STATION: float (nullable = true)
 |-- elevation_STATION: float (nullable = true)
 |-- state_STATION: string (nullable = true)
 |-- station_name_STATION: string (nullable = true)
 |-- measured_STATION: string (nullable = true)



In [178]:
hawaii_cross_distance_detail.cache()

DataFrame[fips: string, latitude_fips: float, longitude_fips: float, cos_latitude_fips: double, station_id: string, measured: string, latitude_station: float, longitude_station: float, cos_latitude_station: double, angle: double, station_id_STATION: string, latitude_STATION: float, longitude_STATION: float, elevation_STATION: float, state_STATION: string, station_name_STATION: string, measured_STATION: string]

In [179]:
hawaii_cross_distance_detail = hawaii_cross_distance_detail.withColumn("distance", R_earth * col("angle"))

In [180]:
w = Window.partitionBy("measured", "fips")
min_hawaii =hawaii_cross_distance_detail.withColumn("min_angle", F.min("angle").over(w))\
    .where(col("min_angle") == col("angle"))

In [182]:
min_hawaii.toPandas()

Unnamed: 0,fips,latitude_fips,longitude_fips,cos_latitude_fips,station_id,measured,latitude_station,longitude_station,cos_latitude_station,angle,station_id_STATION,latitude_STATION,longitude_STATION,elevation_STATION,state_STATION,station_name_STATION,measured_STATION,distance,min_angle
0,,0.344559,-2.719999,0.941225,USR0000HPUW,TMIN,0.345488,-2.720014,0.94091,0.000465,USR0000HPUW,19.795,-155.845306,709.0,HI,PUU WAAWAA HAWAII,TMIN,2.960615,0.000465
1,15003.0,0.374571,-2.761145,0.930664,US1HIHN0014,PRCP,0.373378,-2.756199,0.9311,0.002378,US1HIHN0014,21.393,-157.918594,175.0,HI,AIEA 0.7 ENE,PRCP,15.150998,0.002378
2,15009.0,0.364005,-2.733213,0.934479,USC00518407,TMIN,0.365273,-2.73447,0.934026,0.000864,USC00518407,20.9286,-156.673599,128.300003,HI,PUUKOLII 457.1,TMIN,5.507033,0.000864
3,15009.0,0.364005,-2.733213,0.934479,USC00518407,TMIN,0.365273,-2.73447,0.934026,0.000864,USC00518407,20.9286,-156.673599,128.300003,HI,PUUKOLII 457.1,PRCP,5.507033,0.000864
4,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,AWND,24.475917,0.003842
5,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,TMIN,24.475917,0.003842
6,15001.0,0.342045,-2.71403,0.942071,USW00021504,AWND,0.344165,-2.706187,0.941358,0.003842,USW00021504,19.7192,-155.053101,11.6,HI,HILO INTL AP GSN 91285,PRCP,24.475917,0.003842
7,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,SNOW,1861.329641,0.292157
8,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,TMIN,1861.329641,0.292157
9,15005.0,0.370337,-2.739713,0.932205,USC00502587,SNOW,0.940645,-2.906729,0.589267,0.292157,USC00502587,53.895,-166.543304,3.0,AK,DUTCH HARBOR,PRCP,1861.329641,0.292157


In [164]:
df_hawaii = hawaii_cross_distance_detail.where(col("state_station") == "HI").toPandas()

ValueError: Grouper for 'measured' not 1-dimensional

In [168]:
df_hawaii

Unnamed: 0,station_id,fips,latitude_fips,longitude_fips,cos_latitude_fips,measured,latitude_station,longitude_station,cos_latitude_station,angle,latitude,longitude,elevation,state_station,station_name,measured.1,distance
0,US1HIHI0003,,0.344559,-2.719999,0.941225,PRCP,0.340414,-2.703803,0.942616,0.007904,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,50.355917
1,US1HIHI0003,15009,0.364005,-2.733213,0.934479,PRCP,0.340414,-2.703803,0.942616,0.018155,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,115.666874
2,US1HIHI0003,15007,0.384183,-2.787395,0.927105,PRCP,0.340414,-2.703803,0.942616,0.044787,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,285.341003
3,US1HIHI0003,15001,0.342045,-2.714030,0.942071,PRCP,0.340414,-2.703803,0.942616,0.004887,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,31.134920
4,US1HIHI0003,15005,0.370337,-2.739713,0.932205,PRCP,0.340414,-2.703803,0.942616,0.022520,19.504299,-154.916504,137.199997,HI,PAHOA 2.1 E,PRCP,143.478064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,USW00022536,15009,0.364005,-2.733213,0.934479,AWND,0.383691,-2.781016,0.927289,0.024330,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,155.007730
944,USW00022536,15007,0.384183,-2.787395,0.927105,AWND,0.383691,-2.781016,0.927289,0.002967,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,18.904727
945,USW00022536,15001,0.342045,-2.714030,0.942071,AWND,0.383691,-2.781016,0.927289,0.037601,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,239.554080
946,USW00022536,15005,0.370337,-2.739713,0.932205,AWND,0.383691,-2.781016,0.927289,0.020329,21.983900,-159.340500,30.500000,HI,LIHUE WSO AP 1020.1 GSN 91165,PRCP,129.513057
