In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [56]:
project_path = config["PATH"]["project"]
os.chdir(project_path)

Create spark session. Add driver postgress to enable to load from existing postgres DB


In [4]:
# add driver postgress to enable to load from existing postgres DB
spark = SparkSession \
    .builder \
    .appName("US_weather") \
    .config("spark.jars", "postgresql-42.2.18.jar")\
    .config( "spark.driver.extraClassPath", "postgresql-42.2.18.jar")\
    .getOrCreate()

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Load weather stations location data from postgres DB

In [5]:
stations = spark.read.format("jdbc")\
    .option("url" , "jdbc:postgresql://localhost:5432/covid")\
    .option("dbtable", "stations")\
    .option("user","sb")\
    .option("password", "sb")\
    .load()

In [25]:
stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- station_name: string (nullable = true)



## Load world US weather (prefiltered)

from raw 2020.csv, filter to keep only US stations, and remove failed measurements.
See US_weather_exporation.ipynb

In [7]:
%%time
weather_path = os.path.join(config["PATH"]["project"], "weather_2020_with_stations.parquet/" )
weather_2020 = spark.read.load(weather_path)

CPU times: user 6.1 ms, sys: 0 ns, total: 6.1 ms
Wall time: 11.3 s


In [15]:
weather_2020.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- measured: string (nullable = true)
 |-- v1: string (nullable = true)
 |-- date: integer (nullable = true)



In [8]:
%%time
nb_weather_records = weather_2020.count()
nb_weather_records

CPU times: user 3.25 ms, sys: 16 ms, total: 19.3 ms
Wall time: 1min


25074507

In [None]:
all_elements = weather_2020.select("measured").distinct().collect()

In [None]:
all_elements

In [19]:
l_measurements= ["SNOW", "SNWD", "PRCP", "TMAX", "TMIN", "TAVG"]

In [10]:
ndays_station_msr = weather_2020.groupBy("station_id", "measured").agg( F.countDistinct("date").alias("nb_days") )

keep only measurements for main elements

In [12]:
ndays_station_msr_filterered = ndays_station_msr.filter( ndays_station_msr["measured"].isin(l_measurements))

In [None]:
%%time
ndays_station_msr_filterered.count()

Keep only (station, element) were nb of measures > 350 over the year (i.e. almost 1 measure per day)

In [13]:
%%time
stations_msr_filtered2 = ndays_station_msr_filterered.filter( ndays_station_msr["nb_days"] > 350)

CPU times: user 5.17 ms, sys: 0 ns, total: 5.17 ms
Wall time: 61.5 ms


In [14]:
%%time
df_filtered_stations = stations_msr_filtered2.toPandas()

CPU times: user 519 ms, sys: 50.3 ms, total: 569 ms
Wall time: 4min 17s


Nb of stations with at least 1 main element measured most of the year

In [16]:
%%time
len(df_filtered_stations["station_id"].unique())

CPU times: user 9.88 ms, sys: 295 µs, total: 10.2 ms
Wall time: 13.1 ms


13318

In [71]:
df_stations = stations.toPandas()

In [50]:
def precompute_distance(l_ref):
    ''' 
    convert latitude and longitude to radian, and compute cos(latitude),
    which is necessary to compute the distance between any two GPS coordinates
    '''
    l_ref["latitude_degrees"] = l_ref["latitude"]
    l_ref["longitude_degrees"] = l_ref["longitude"]
    def degree_to_radian(x):
        return  float(x)* np.pi / 180.
    l_ref["latitude"] = l_ref["latitude"].apply( degree_to_radian) 
    l_ref["longitude"] = l_ref["longitude"].apply( degree_to_radian)
    l_ref["cos_latitude"] = np.cos(l_ref["latitude"])            

In [72]:
precompute_distance(df_stations)

In [73]:
df_stations

Unnamed: 0,station_id,latitude,longitude,elevation,state,station_name,latitude_degrees,longitude_degrees,cos_latitude
0,US009052008,0.763290,-1.686569,482.0,SD,SIOUX FALLS (ENVIRON. CANADA),43.7333,-96.6333,0.722565
1,US10RMHS145,0.707326,-1.834538,1569.1,CO,RMHS 1.6 SSW,40.5268,-105.1113,0.760102
2,US10adam001,0.708045,-1.719270,598.0,NE,JUNIATA 1.5 S,40.5680,-98.5069,0.759635
3,US10adam002,0.707021,-1.720010,601.1,NE,JUNIATA 6.0 SSW,40.5093,-98.5493,0.760301
4,US10adam003,0.706270,-1.721832,615.1,NE,HOLSTEIN 0.1 NW,40.4663,-98.6537,0.760788
...,...,...,...,...,...,...,...,...,...
65165,USC00516231,0.382808,-2.784092,,HI,MCBRYDES HOME,21.9333,-159.5167,0.927619
65166,USC00516271,0.492183,-3.095341,,HI,MIDWAY ISLAND,28.2000,-177.3500,0.881303
65167,USC00516594,0.362737,-2.724167,,HI,NAHIKU 1600 EL,20.7833,-156.0833,0.934929
65168,USC00517315,0.351539,-2.711879,,HI,PAAUILO ROCK CRUSHER,20.1417,-155.3792,0.938844


For each element, list of the stations which give the element

In [74]:
d_stations_per_element = {}
for element in l_measurements:
    d_stations_per_element[element] = pd.merge(
        df_filtered_stations[ df_filtered_stations["measured"] == element],
        df_stations, on = "station_id")\
        [ ["station_id", "latitude", "longitude", "cos_latitude"]].drop_duplicates()

In [46]:
for element, lst in d_stations_per_element.items():
    print(element, len(lst))

SNOW 3505
SNWD 3931
PRCP 11699
TMAX 5713
TMIN 5668
TAVG 2222


In [75]:
d_stations_per_element["TMAX"].head()

Unnamed: 0,station_id,latitude,longitude,cos_latitude
0,USW00024048,0.776966,-1.88639,0.713044
1,USW00094274,0.824974,-2.139357,0.678576
2,USS0005K27S,0.692198,-1.842893,0.769845
3,USC00339220,0.688038,-1.462973,0.772493
4,USS0045L01S,1.056797,-2.543817,0.491664


## US counties geographic info from Gazeeter

In [51]:
def counties_lat_long_(name):
    with open(name) as f :
        headers = f.readline().split()
        l_headers = len(headers)
        print("len of headers : ", l_headers )
        l_res = []
        for line in f :
            l_str = line.split()
            state = l_str[0]
            fips = l_str[1]
            latitude = l_str[-2]
            longitude = l_str[-1]
            l = len(l_str)
            n_words = l - l_headers + 1
            county = " ".join( l_str[3:3+n_words] )
            #d_res.setdefault(state, []).append( [ state, county, fips, latitude, longitude ])
            l_res.append( [state, county, fips, latitude, longitude ])
        return l_res


In [53]:
def counties_lat_long(name):
    l_gazeeter = counties_lat_long_(name)
    df_gazeeter = pd.DataFrame(l_gazeeter, columns = ["state", "county", "fips", "latitude", "longitude"])
    return df_gazeeter

In [57]:
data_root = os.path.join(project_path, 'DATA/')

In [58]:
gazeeter_counties= counties_lat_long( os.path.join(data_root,"2020_Gaz_counties_national.txt"))
print("number of counties from Gazeeter : ", len(gazeeter_counties))

len of headers :  10
number of counties from Gazeeter :  3221


Unnamed: 0,state,county,fips,latitude,longitude
0,AL,Autauga County,1001,32.532237,-86.64644
1,AL,Baldwin County,1003,30.659218,-87.746067
2,AL,Barbour County,1005,31.870253,-85.405104
3,AL,Bibb County,1007,33.015893,-87.127148
4,AL,Blount County,1009,33.977358,-86.56644


In [59]:
precompute_distance(gazeeter_counties)

In [61]:
gazeeter_counties.head()

Unnamed: 0,state,county,fips,latitude,longitude,latitude_degrees,longitude_degrees,cos_latitude
0,AL,Autauga County,1001,0.567795,-1.512266,32.532237,-86.64644,0.843089
1,AL,Baldwin County,1003,0.535104,-1.531458,30.659218,-87.746067,0.860215
2,AL,Barbour County,1005,0.556241,-1.4906,31.870253,-85.405104,0.849246
3,AL,Bibb County,1007,0.576236,-1.520656,33.015893,-87.127148,0.838519
4,AL,Blount County,1009,0.593017,-1.510869,33.977358,-86.56644,0.829258


In [92]:
def closest_ref(l_ref, latitude, longitude, cos_latitude):
        ''' computation of angular distance between 2 locations given by GPS coordinates
        exact formulas (maybe overkill), taken from :
        https://www.movable-type.co.uk/scripts/latlong.html
        '''    
        #print( f'latitude : {latitude}, longitude : {longitude}, cos(latitude) : {cos_lat}')
        # Haversine formula
        #print(l_ref.head())
        l_ref["delta_lat_term"] = ( np.sin( (l_ref["latitude"] - latitude) * 0.5 ) )**2
        l_ref["delta_long_term"] = ( np.sin( (l_ref["longitude"] - longitude) * 0.5) )**2
        l_ref["a"] = l_ref["delta_lat_term"] + l_ref["delta_long_term"] \
                        * cos_latitude * l_ref["cos_latitude"]
        l_ref["sqrt_a"] = l_ref["a"].apply( lambda x : np.sqrt(x) )
        l_ref["sqrt_1_a"] = l_ref["a"].apply(lambda x : np.sqrt(1.- x) )
        l_ref["angle"] = np.arctan2( np.sqrt(l_ref["a"]), np.sqrt( 1. - l_ref["a"] ) )
        closest = l_ref["angle"].idxmin()
        #print(closest)
        return closest, l_ref.iloc[closest]

In [93]:
import functools

In [96]:
def func_closest_station( df_row, df_stations):
    i, res = closest_ref( df_stations, df_row["latitude"], df_row["longitude"], df_row["cos_latitude"] )
    return res.loc[["station_id", "latitude", "longitude", "angle"]]

In [97]:
R_earth = 6371

In [103]:
closeststation_per_element ={}
for element in l_measurements:
    element_function = functools.partial(func_closest_station, df_stations=d_stations_per_element[element])
    %time res = gazeeter_counties.apply(element_function, axis = 1)
    res["distance"] = R_earth * res["angle"]
    closeststation_per_element[element] = pd.concat( [ res, gazeeter_counties[ ["fips", "state", "county"] ]], axis = 1)


CPU times: user 1min 27s, sys: 173 ms, total: 1min 27s
Wall time: 1min 27s
CPU times: user 1min 44s, sys: 151 ms, total: 1min 44s
Wall time: 1min 44s
CPU times: user 3min 13s, sys: 123 ms, total: 3min 13s
Wall time: 3min 13s
CPU times: user 1min 38s, sys: 99.7 ms, total: 1min 38s
Wall time: 1min 38s
CPU times: user 1min 39s, sys: 91.8 ms, total: 1min 40s
Wall time: 1min 40s
CPU times: user 51.8 s, sys: 31.9 ms, total: 51.8 s
Wall time: 51.8 s


In [108]:
res.head()

Unnamed: 0,station_id,latitude,longitude,angle,distance
0,USW00013895,0.563736,-1.508095,0.002686,17.114579
1,USR0000ABNS,0.528011,-1.532617,0.003582,22.819018
2,USW00013829,0.564614,-1.48353,0.005147,32.792927
3,USR0000AOKM,0.575212,-1.521414,0.000603,3.841172
4,USW00013876,0.58583,-1.513986,0.00382,24.334667


In [100]:
len(res)

3221

In [102]:
len(gazeeter_counties)

3221

In [104]:
for e, l  in closeststation_per_element.items():
    print(e, len(l))

SNOW 3221
SNWD 3221
PRCP 3221
TMAX 3221
TMIN 3221
TAVG 3221


In [105]:
closeststation_per_element["PRCP"].head()

Unnamed: 0,station_id,latitude,longitude,angle,distance,fips,state,county
0,USC00010823,0.570068,-1.513626,0.001273,8.1108,1001,AL,Autauga County
1,US1ALBW0079,0.533589,-1.533174,0.001058,6.740569,1003,AL,Baldwin County
2,US1ALBR0006,0.556456,-1.493721,0.001329,8.469317,1005,AL,Barbour County
3,USC00011100,0.576618,-1.518475,0.000934,5.950284,1007,AL,Bibb County
4,USC00016121,0.592127,-1.508333,0.001142,7.277017,1009,AL,Blount County


In [120]:
pd.merge( closeststation_per_element["PRCP"], df_stations, on = "station_id")\
    .rename(columns = { "latitude_degrees" : "latitude_station", "longitude_degrees" : "longitude_station", 
                      "state_x" : "state_gazeeter", "state_y" : "state_station"})\
    [ ["fips",  "county", "state_gazeeter", 
       "station_id", "station_name", "state_station", "latitude_station", "longitude_station", "distance" ] ]

Unnamed: 0,fips,county,state_gazeeter,station_id,station_name,state_station,latitude_station,longitude_station,distance
0,01001,Autauga County,AL,USC00010823,BILLINGSLEY,AL,32.6625,-86.7244,8.110800
1,01003,Baldwin County,AL,US1ALBW0079,DAPHNE 4.9 SE,AL,30.5724,-87.8444,6.740569
2,01005,Barbour County,AL,US1ALBR0006,CLAYTON 7.8 W,AL,31.8826,-85.5839,8.469317
3,01011,Bullock County,AL,US1ALBR0006,CLAYTON 7.8 W,AL,31.8826,-85.5839,13.711696
4,01109,Pike County,AL,US1ALBR0006,CLAYTON 7.8 W,AL,31.8826,-85.5839,17.527742
...,...,...,...,...,...,...,...,...,...
3216,72139,Trujillo Alto Municipio,PR,USC00081306,CAPE FLORIDA,FL,25.6719,-80.1567,834.956545
3217,72143,Vega Alta Municipio,PR,USC00081306,CAPE FLORIDA,FL,25.6719,-80.1567,817.090369
3218,72145,Vega Baja Municipio,PR,USC00081306,CAPE FLORIDA,FL,25.6719,-80.1567,813.783242
3219,72147,Vieques Municipio,PR,USC00081306,CAPE FLORIDA,FL,25.6719,-80.1567,866.766122
