In [10]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta
from sqlalchemy import create_engine


In [11]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [12]:
project_path = config["PATH"]["project"]
os.chdir(project_path)

Create spark session. Add driver postgress to enable to load from existing postgres DB


In [13]:
# add driver postgress to enable to load from existing postgres DB
spark = SparkSession \
    .builder \
    .appName("US_weather") \
    .config("spark.jars", "postgresql-42.2.18.jar")\
    .config( "spark.driver.extraClassPath", "postgresql-42.2.18.jar")\
    .getOrCreate()

Connect to postgres; read stations table from postgres DB "covid"

# Load data

## Read all stations from sql DB

In [14]:
%%time
#engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
engine = create_engine('postgresql://sb:sb@localhost:5432/covid')
df_stations = pd.read_sql_table("stations", engine)

CPU times: user 415 ms, sys: 66.4 ms, total: 481 ms
Wall time: 1.07 s


## Load filtered US weather stations

Only the stations yielding a valid measurement almost each day of 2020 are loaded.

In [15]:
df_filtered_stations = pd.read_csv("significant_stations.csv")

In [17]:
df_filtered_stations.head()

Unnamed: 0.1,Unnamed: 0,station_id,measured,nb_days
0,0,US1MDHW0007,PRCP,366
1,1,US1MNSE0002,PRCP,366
2,2,USW00014820,TAVG,366
3,3,USC00132638,SNOW,355
4,4,USR0000CROC,TMIN,366


In [24]:
df_filtered_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32738 entries, 0 to 32737
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  32738 non-null  int64 
 1   station_id  32738 non-null  object
 2   measured    32738 non-null  object
 3   nb_days     32738 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1023.2+ KB


# Find closest station to fips

In [22]:
def precompute_distance(l_ref):
    ''' 
    convert latitude and longitude to radian, and compute cos(latitude),
    which is necessary to compute the distance between any two GPS coordinates
    '''
    l_ref["latitude_degrees"] = l_ref["latitude"]
    l_ref["longitude_degrees"] = l_ref["longitude"]
    def degree_to_radian(x):
        return  float(x)* np.pi / 180.
    l_ref["latitude"] = l_ref["latitude"].apply( degree_to_radian) 
    l_ref["longitude"] = l_ref["longitude"].apply( degree_to_radian)
    l_ref["cos_latitude"] = np.cos(l_ref["latitude"])            

In [23]:
precompute_distance(df_stations)

For each element, list of the stations which give the element

In [26]:
l_measurements= ["SNOW", "SNWD", "PRCP", "TMAX", "TMIN", "TAVG"]

In [27]:
d_stations_per_element = {}
for element in l_measurements:
    d_stations_per_element[element] = pd.merge(
        df_filtered_stations[ df_filtered_stations["measured"] == element],
        df_stations, on = "station_id")\
        [ ["station_id", "latitude", "longitude", "cos_latitude"]].drop_duplicates()

In [28]:
for element, lst in d_stations_per_element.items():
    print(element, len(lst))

SNOW 3505
SNWD 3931
PRCP 11699
TMAX 5713
TMIN 5668
TAVG 2222


In [29]:
d_stations_per_element["TMAX"].head()

Unnamed: 0,station_id,latitude,longitude,cos_latitude
0,USW00024048,0.013561,-0.032924,0.999908
1,USW00094274,0.014399,-0.037339,0.999896
2,USS0005K27S,0.012081,-0.032165,0.999927
3,USC00339220,0.012009,-0.025534,0.999928
4,USS0045L01S,0.018445,-0.044398,0.99983


## US counties geographic info from Gazeeter

In [30]:
def counties_lat_long_(name):
    with open(name) as f :
        headers = f.readline().split()
        l_headers = len(headers)
        print("len of headers : ", l_headers )
        l_res = []
        for line in f :
            l_str = line.split()
            state = l_str[0]
            fips = l_str[1]
            latitude = l_str[-2]
            longitude = l_str[-1]
            l = len(l_str)
            n_words = l - l_headers + 1
            county = " ".join( l_str[3:3+n_words] )
            #d_res.setdefault(state, []).append( [ state, county, fips, latitude, longitude ])
            l_res.append( [state, county, fips, latitude, longitude ])
        return l_res


In [31]:
def counties_lat_long(name):
    l_gazeeter = counties_lat_long_(name)
    df_gazeeter = pd.DataFrame(l_gazeeter, columns = ["state", "county", "fips", "latitude", "longitude"])
    return df_gazeeter

In [32]:
data_root = os.path.join(project_path, 'DATA/')

In [33]:
gazeeter_counties= counties_lat_long( os.path.join(data_root,"2020_Gaz_counties_national.txt"))
print("number of counties from Gazeeter : ", len(gazeeter_counties))

len of headers :  10
number of counties from Gazeeter :  3221


In [34]:
precompute_distance(gazeeter_counties)

In [35]:
gazeeter_counties.head()

Unnamed: 0,state,county,fips,latitude,longitude,latitude_degrees,longitude_degrees,cos_latitude
0,AL,Autauga County,1001,0.567795,-1.512266,32.532237,-86.64644,0.843089
1,AL,Baldwin County,1003,0.535104,-1.531458,30.659218,-87.746067,0.860215
2,AL,Barbour County,1005,0.556241,-1.4906,31.870253,-85.405104,0.849246
3,AL,Bibb County,1007,0.576236,-1.520656,33.015893,-87.127148,0.838519
4,AL,Blount County,1009,0.593017,-1.510869,33.977358,-86.56644,0.829258


In [36]:
def closest_ref(l_ref, latitude, longitude, cos_latitude):
        ''' computation of angular distance between 2 locations given by GPS coordinates
        exact formulas (maybe overkill), taken from :
        https://www.movable-type.co.uk/scripts/latlong.html
        '''    
        #print( f'latitude : {latitude}, longitude : {longitude}, cos(latitude) : {cos_lat}')
        # Haversine formula
        #print(l_ref.head())
        l_ref["delta_lat_term"] = ( np.sin( (l_ref["latitude"] - latitude) * 0.5 ) )**2
        l_ref["delta_long_term"] = ( np.sin( (l_ref["longitude"] - longitude) * 0.5) )**2
        l_ref["a"] = l_ref["delta_lat_term"] + l_ref["delta_long_term"] \
                        * cos_latitude * l_ref["cos_latitude"]
        l_ref["sqrt_a"] = l_ref["a"].apply( lambda x : np.sqrt(x) )
        l_ref["sqrt_1_a"] = l_ref["a"].apply(lambda x : np.sqrt(1.- x) )
        l_ref["angle"] = np.arctan2( np.sqrt(l_ref["a"]), np.sqrt( 1. - l_ref["a"] ) )
        closest = l_ref["angle"].idxmin()
        #print(closest)
        return closest, l_ref.iloc[closest]

In [37]:
import functools

In [38]:
def func_closest_station( df_row, df_stations):
    i, res = closest_ref( df_stations, df_row["latitude"], df_row["longitude"], df_row["cos_latitude"] )
    return res.loc[["station_id", "latitude", "longitude", "angle"]]

In [39]:
R_earth = 6371

!! approximately 11 min to complete

In [40]:
closeststation_per_element ={}
for element in l_measurements:
    element_function = functools.partial(func_closest_station, df_stations=d_stations_per_element[element])
    %time res = gazeeter_counties.apply(element_function, axis = 1)
    res["distance"] = R_earth * res["angle"]
    closeststation_per_element[element] = pd.concat( [ res, gazeeter_counties[ ["fips", "state", "county"] ]], axis = 1)


CPU times: user 1min 11s, sys: 128 ms, total: 1min 11s
Wall time: 1min 11s
CPU times: user 1min 19s, sys: 190 ms, total: 1min 19s
Wall time: 1min 19s
CPU times: user 3min 40s, sys: 411 ms, total: 3min 40s
Wall time: 3min 40s
CPU times: user 1min 56s, sys: 328 ms, total: 1min 56s
Wall time: 1min 56s
CPU times: user 1min 44s, sys: 188 ms, total: 1min 44s
Wall time: 1min 44s
CPU times: user 52.7 s, sys: 92 ms, total: 52.7 s
Wall time: 52.7 s


In [41]:
res.head()

Unnamed: 0,station_id,latitude,longitude,angle,distance
0,USR0000ATAH,0.020577,-0.049825,0.734198,4677.577403
1,USW00026617,0.019651,-0.050396,0.741797,4725.986164
2,USR0000ATAH,0.020577,-0.049825,0.724775,4617.540221
3,USR0000ATAH,0.020577,-0.049825,0.737888,4701.084677
4,USR0000ATAH,0.020577,-0.049825,0.734153,4677.29134


In [42]:
len(res)

3221

In [43]:
len(gazeeter_counties)

3221

In [44]:
for e, l  in closeststation_per_element.items():
    print(e, len(l))

SNOW 3221
SNWD 3221
PRCP 3221
TMAX 3221
TMIN 3221
TAVG 3221


In [45]:
closeststation_per_element["PRCP"].head()

Unnamed: 0,station_id,latitude,longitude,angle,distance,fips,state,county
0,USW00025711,0.017411,-0.05185,0.734199,4677.579562,1001,AL,Autauga County
1,USW00025711,0.017411,-0.05185,0.741743,4725.64643,1003,AL,Baldwin County
2,USW00025711,0.017411,-0.05185,0.724755,4617.414006,1005,AL,Barbour County
3,USW00026617,0.019651,-0.050396,0.737902,4701.170778,1007,AL,Bibb County
4,USW00026617,0.019651,-0.050396,0.734176,4677.436942,1009,AL,Blount County


In [46]:
pd.merge( closeststation_per_element["PRCP"], df_stations, on = "station_id")\
    .rename(columns = { "latitude_degrees" : "latitude_station", "longitude_degrees" : "longitude_station", 
                      "state_x" : "state_gazeeter", "state_y" : "state_station"})\
    [ ["fips",  "county", "state_gazeeter", 
       "station_id", "station_name", "state_station", "latitude_station", "longitude_station", "distance" ] ]

Unnamed: 0,fips,county,state_gazeeter,station_id,station_name,state_station,latitude_station,longitude_station,distance
0,01001,Autauga County,AL,USW00025711,ST. PAUL 4 NE 70309,AK,0.997587,-2.970758,4677.579562
1,01003,Baldwin County,AL,USW00025711,ST. PAUL 4 NE 70309,AK,0.997587,-2.970758,4725.646430
2,01005,Barbour County,AL,USW00025711,ST. PAUL 4 NE 70309,AK,0.997587,-2.970758,4617.414006
3,01011,Bullock County,AL,USW00025711,ST. PAUL 4 NE 70309,AK,0.997587,-2.970758,4632.806754
4,01013,Butler County,AL,USW00025711,ST. PAUL 4 NE 70309,AK,0.997587,-2.970758,4677.344435
...,...,...,...,...,...,...,...,...,...
3216,56037,Sweetwater County,WY,USS0062S01S,Kelly Station,AK,1.185602,-2.832320,5621.396314
3217,56039,Teton County,WY,USS0062S01S,Kelly Station,AK,1.185602,-2.832320,5654.372442
3218,56041,Uinta County,WY,USS0062S01S,Kelly Station,AK,1.185602,-2.832320,5694.350854
3219,56043,Washakie County,WY,USS0062S01S,Kelly Station,AK,1.185602,-2.832320,5548.572137


In [49]:
for element, l_stations in closeststation_per_element.items():
    out_name = os.path.join("OUT_DATA", f"closest_for_{element}.csv")
    l_stations.to_csv(out_name)
                    