In [None]:
import numpy as np
import pandas as pd
import os
import configparser

In [None]:
# Load the project settings from the INI file "capstone.cfg" (path is relative
# to the current working directory).
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty list displayed here means the file was not found.
config.read("capstone.cfg")

In [None]:
# Project root directory, read from the PROJECT key of the [PATH] config section.
project_path = config["PATH"]["PROJECT"]
print(project_path)

# Functions to parse geographic info from various data sources

## Weather stations

In [None]:
def parse_stations_(name):
    """Parse a whitespace-delimited weather-station listing into rows.

    Each non-blank line is split on whitespace. The first five tokens are kept
    as separate fields and all remaining tokens are joined with single spaces
    into a sixth field, so a multi-word trailing field survives the split.

    Parameters
    ----------
    name : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One ``[tok0, tok1, tok2, tok3, tok4, rest]`` entry per non-blank line,
        in file order. All values are the raw strings found in the file.
    """
    rows = []
    with open(name) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # A blank line would otherwise produce the one-element row [""],
                # which cannot be loaded into a six-column table.
                continue
            rows.append(tokens[:5] + [" ".join(tokens[5:])])
    return rows

In [None]:
def parse_stations(name):
    """Load a weather-station listing into a DataFrame.

    Parsing is delegated to ``parse_stations_``; this wrapper only labels the
    six fields of each parsed row. Values are kept as the strings read from
    the file (no numeric conversion happens here).

    Parameters
    ----------
    name : str or path-like
        Path of the station text file.

    Returns
    -------
    pandas.DataFrame
        Columns: ID, latitude, longitude, elevation, state, name.
    """
    station_columns = ["ID", "latitude", "longitude", "elevation", "state", "name"]
    return pd.DataFrame(parse_stations_(name), columns=station_columns)

## geographic info from NYTimes covid data

In [None]:
def NYT_counties_(name):
    """Collect the distinct (state, county, fips) triples found in a CSV file.

    The first line is treated as a header and skipped. In every following
    record, field 2 is read as the state, field 1 as the county and field 3 as
    the fips code. The file is parsed with ``csv.reader`` so that quoted fields
    containing commas are kept intact.

    Parameters
    ----------
    name : str or path-like
        Path of the comma-separated file to read.

    Returns
    -------
    dict
        Maps each ``(state, county, fips)`` tuple to ``True``. The dict is used
        as an insertion-ordered set: keys appear in order of first occurrence.
    """
    import csv  # local import: keeps this cell runnable on its own

    d_res = {}
    # newline="" is what the csv module asks for when handed a file object.
    with open(name, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header line (tolerates an empty file)
        for fields in reader:
            if len(fields) < 4:
                # Blank or truncated record: no fips field to read.
                continue
            # state, county, fips
            d_res.setdefault((fields[2], fields[1], fields[3]), True)
    return d_res


In [None]:
def NYT_counties(name):
    """Build a DataFrame of the distinct counties listed in a NYT-style CSV.

    The unique ``(state, county, fips)`` triples are gathered by
    ``NYT_counties_``; each triple becomes one row.

    Parameters
    ----------
    name : str or path-like
        Path of the comma-separated file to read.

    Returns
    -------
    pandas.DataFrame
        Columns: state, county, fips (all strings as read from the file).
    """
    unique_triples = list(NYT_counties_(name))
    return pd.DataFrame(unique_triples, columns=["state", "county", "fips"])

## US counties geographic info from the Gazetteer

In [None]:
def counties_lat_long_(name):
    """Extract state, county name, fips and coordinates from a gazetteer file.

    The file is read as whitespace-separated text. The first line is the
    header; its token count is printed and used to work out how many tokens
    the county name spans on each data line (names may contain spaces, so a
    data line can have more tokens than the header).

    For each non-blank data line:
      * token 0 is the state, token 1 the fips code;
      * the last two tokens are latitude and longitude;
      * the county name starts at token 3 and absorbs the surplus tokens.

    Parameters
    ----------
    name : str or path-like
        Path of the gazetteer text file.

    Returns
    -------
    list of list of str
        ``[state, county, fips, latitude, longitude]`` per data line, in file
        order. All values are raw strings.
    """
    with open(name) as f:
        n_header_fields = len(f.readline().split())
        print("len of headers : ", n_header_fields)
        rows = []
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. at end of file): nothing to extract.
                continue
            # Every token beyond the header count belongs to the county name.
            n_name_tokens = len(tokens) - n_header_fields + 1
            county = " ".join(tokens[3:3 + n_name_tokens])
            rows.append([tokens[0], county, tokens[1], tokens[-2], tokens[-1]])
        return rows


In [None]:
def counties_lat_long(name):
    """Load county coordinates from a gazetteer file into a DataFrame.

    Row extraction is done by ``counties_lat_long_``; this wrapper only names
    the five fields of each row. Values stay as strings.

    Parameters
    ----------
    name : str or path-like
        Path of the gazetteer text file.

    Returns
    -------
    pandas.DataFrame
        Columns: state, county, fips, latitude, longitude.
    """
    county_columns = ["state", "county", "fips", "latitude", "longitude"]
    return pd.DataFrame(counties_lat_long_(name), columns=county_columns)

# Read data

In [None]:
# Directory that holds every input file read below; it lives under the project root.
data_root = os.path.join(project_path, 'DATA/')

## Weather stations location

In [None]:
# Station table used by the rest of the notebook (all columns are strings).
# Alternative input file, kept for reference:
#df_stations = parse_stations( os.path.join(data_root, 'WEATHER/US_ghcnd_stations.txt'))
# NOTE(review): presumably this file is the US list with the Puerto Rico stations added - confirm.
df_stations = parse_stations( os.path.join(data_root, 'WEATHER/US_PuertoRico_stations.txt'))
df_stations.head()

In [None]:
# Stations from the Puerto Rico file, loaded separately so the row count can be
# compared with df_stations.
df_puertorico =parse_stations( os.path.join(data_root, 'WEATHER/PuertoRico_ghcnd_stations.txt'))
df_puertorico.head()

In [None]:
len(df_stations), len(df_puertorico)


## COVID deaths and cases per county per day, from The New York Times

In [None]:
# Distinct (state, county, fips) combinations that occur in the NY Times file.
nytimes_counties = NYT_counties( os.path.join(data_root, "us-counties.txt"))
print( "number of counties from NY Times:", len(nytimes_counties))
nytimes_counties.head()

## Counties fips to geographic coordinates (latitude, longitude) 


In [None]:
# One row per gazetteer county: state, county name, fips, latitude, longitude (all strings).
gazeeter_counties= counties_lat_long( os.path.join(data_root,"2020_Gaz_counties_national.txt"))
print("number of counties from Gazeeter : ", len(gazeeter_counties))
gazeeter_counties.head()

In [None]:
class DistanceToStation:
    """Find the weather station nearest to a point, by great-circle angle.

    The instance keeps a private copy of the station table with the
    coordinates converted to radians, plus the per-station cosine of the
    latitude, so that each lookup only has to evaluate the haversine formula.

    Attributes
    ----------
    df_stations : pandas.DataFrame
        Copy of the input table. ``latitude`` / ``longitude`` hold radians
        (floats), ``latitude_degrees`` / ``longitude_degrees`` hold the
        original values untouched, ``elevation`` is converted to float and
        ``cos_latitude`` is precomputed. Each call to ``closest_station``
        (re)writes the working columns ``delta_lat_term``, ``delta_long_term``,
        ``a``, ``sqrt_a``, ``sqrt_1_a`` and ``angle``.
    """

    def __init__(self, df_stations):
        """Prepare the station table for repeated nearest-station lookups.

        Parameters
        ----------
        df_stations : pandas.DataFrame
            Must provide ``latitude``, ``longitude`` (degrees) and
            ``elevation`` columns whose values are convertible to float. The
            caller's frame is not modified.
        """
        self.df_stations = df_stations.copy(deep=True)
        # Keep the original degree values next to the radian versions.
        self.df_stations["latitude_degrees"] = self.df_stations["latitude"]
        self.df_stations["longitude_degrees"] = self.df_stations["longitude"]

        self.df_stations["elevation"] = self.df_stations["elevation"].astype(float)
        # Degrees -> radians, evaluated as (x * pi) / 180 on the whole column.
        self.df_stations["latitude"] = self.df_stations["latitude"].astype(float) * np.pi / 180.
        self.df_stations["longitude"] = self.df_stations["longitude"].astype(float) * np.pi / 180.
        self.df_stations["cos_latitude"] = np.cos(self.df_stations["latitude"])

    def closest_station(self, lati, longi):
        """Return the station with the smallest haversine angle to a point.

        Parameters
        ----------
        lati, longi : float or str
            Latitude and longitude of the query point, in degrees (anything
            accepted by ``float()``).

        Returns
        -------
        tuple
            ``(index_label, row)`` where ``index_label`` is the station's label
            in ``self.df_stations.index`` and ``row`` is the matching row as a
            Series. In that row ``latitude`` / ``longitude`` are in radians;
            the degree values are under ``latitude_degrees`` /
            ``longitude_degrees``.

        Raises
        ------
        ValueError
            If there is no station with a valid (non-NaN) angle.
        """
        latitude = float(lati) * np.pi / 180.
        longitude = float(longi) * np.pi / 180.
        cos_lat = np.cos(latitude)

        stations = self.df_stations  # same object: working columns are stored on the instance
        # Haversine formula: a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlong/2)
        stations["delta_lat_term"] = np.sin((stations["latitude"] - latitude) * 0.5) ** 2
        stations["delta_long_term"] = np.sin((stations["longitude"] - longitude) * 0.5) ** 2
        stations["a"] = stations["delta_lat_term"] + stations["delta_long_term"] \
                        * cos_lat * stations["cos_latitude"]
        stations["sqrt_a"] = np.sqrt(stations["a"])
        stations["sqrt_1_a"] = np.sqrt(1. - stations["a"])
        # Half of the central angle; monotonic in distance, so its minimum is the nearest station.
        stations["angle"] = np.arctan2(stations["sqrt_a"], stations["sqrt_1_a"])

        # Work with the row *position* and translate it to the index label:
        # mixing idxmin() (a label) with .iloc (a position) is only correct for
        # a default 0..n-1 index and fails on filtered or re-indexed frames.
        position = int(np.nanargmin(stations["angle"].values))
        return stations.index[position], stations.iloc[position]
        

# Map fips to the closest weather station

For each county, find the station closest to its 'center'.


In [None]:
# Shared lookup object built from the station table; later cells reuse it for every county.
station_distance = DistanceToStation(df_stations);

## Test with first county

In [None]:
# NOTE(review): OrderedDict is imported here but not referenced in this cell.
from collections import OrderedDict
# Smoke test: run the lookup for the first gazetteer row only (the loop exits
# through `break` after one iteration). `county`, `d_county`, `idx_closest` and
# `closest` remain defined afterwards and are inspected by the following cells.
for county in gazeeter_counties.itertuples(index = False):
    d_county = county._asdict()
    idx_closest, closest = station_distance.closest_station( d_county["latitude"], d_county["longitude"] )
    print(d_county)
    print(idx_closest)
    print(closest)
    break

In [None]:
type(closest)

In [None]:
type(closest["latitude"]), type(closest["ID"])

In [None]:
closest.loc[["ID", "latitude"]]

In [None]:
len(df_stations)

## Test with Washington DC (small state)

In [None]:
# All stations whose state field is "DC".
df_WashingtonDC_stations = df_stations[ df_stations["state"] == "DC" ]
df_WashingtonDC_stations

In [None]:
# Mean position of the DC stations, as a rough reference point for the lookup below.
# The station table stores coordinates as strings, hence the float conversion.
average_lat = np.average(df_WashingtonDC_stations["latitude"].apply(lambda x : float(x)))
average_long = np.average(df_WashingtonDC_stations["longitude"].apply(lambda x : float(x)))
average_lat, average_long

In [None]:
# Gazetteer rows whose state field is "DC".
df_WashingtonDC_gazeeter = gazeeter_counties[ gazeeter_counties["state"] == "DC"]
df_WashingtonDC_gazeeter

In [None]:
# Look up the nearest station for the first DC gazetteer row.
# This rebinds `county` and `closest` from the first-county test above.
county = df_WashingtonDC_gazeeter.iloc[0]
# closest_station() converts its arguments with float() itself, so the explicit
# float() calls here are redundant but harmless.
i_closest, closest = station_distance.closest_station( float(county["latitude"]), float(county["longitude"]) )


In [None]:
closest

## All county fips

In [None]:
def func_closest_station(df_row):
    """Return the descriptive fields of the station nearest to one county row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``. The lookup goes
    through the module-level ``station_distance`` object.

    Parameters
    ----------
    df_row : pandas.Series
        A row providing ``latitude`` and ``longitude`` in degrees.

    Returns
    -------
    pandas.Series
        The ID, latitude, longitude, elevation, state and name entries of the
        nearest station.
    """
    station_fields = ["ID", "latitude", "longitude", "elevation", "state", "name"]
    _, nearest = station_distance.closest_station(df_row["latitude"], df_row["longitude"])
    # NOTE(review): DistanceToStation overwrites "latitude"/"longitude" with
    # radians and keeps the degree values in "latitude_degrees" /
    # "longitude_degrees", so the coordinates returned here are in radians -
    # confirm that this is what consumers of the exported CSV expect.
    return nearest.loc[station_fields]

# Nearest station for every gazetteer county (one lookup per row), saved to CSV.
# The result holds station fields only; rows line up with gazeeter_counties
# through the shared row index.
res = gazeeter_counties.apply(func_closest_station, axis = 1)
res.to_csv("county_station.csv")
# Unused alternative that would store the station fields as extra columns of
# gazeeter_counties instead of in a separate frame:
#gazeeter_counties[ ["station_ID", "station_latitude", "station_longitude", "station_elevation", "station_state", "station_name"]] = \
#    gazeeter_counties.apply(func_closest_station)
    

In [None]:
gazeeter_counties.head()

In [None]:
# Export the county table (state, county, fips, latitude, longitude) with its row index.
gazeeter_counties.to_csv("counties.csv")

In [None]:
len(station_distance.df_stations)

In [None]:
# Gazetteer rows whose state field is 'PR'.
fips_puertorico = gazeeter_counties[gazeeter_counties["state"] == 'PR']
fips_puertorico

In [None]:
# Nearest station for the Puerto Rico rows only.
# This rebinds `res`, replacing the all-county result computed earlier.
res = fips_puertorico.apply(func_closest_station, axis = 1)

In [None]:
# NOTE(review): when the notebook is run top to bottom, `res` holds only the
# Puerto Rico rows at this point, so this file contains just those rows
# despite its "withPuertoRico" name - confirm this is intended.
res.to_csv("county_station_withPuertoRico.csv")

# Map NY Times locations to Gazetteer locations

Check that all fips are unique in the Gazetteer data

In [None]:
# Count how many fips values occur more than once in the gazetteer table.
gaz_counts = gazeeter_counties["fips"].value_counts()
len( gaz_counts[gaz_counts>1]) # 0 : all fips are unique !

Check whether all fips are unique in New York Times data

In [None]:
# List the fips values that occur more than once in the NY Times county table.
fips_counts = nytimes_counties["fips"].value_counts()
fips_counts[ fips_counts >1 ] # several empty fips

Remove empty fips so that we can join with a one-to-one match

In [None]:
# Split the NY Times rows on whether the fips field is the empty string.
nytimes_counties_withfips = nytimes_counties[ nytimes_counties["fips"] != '']
nytimes_counties_nofips = nytimes_counties[nytimes_counties["fips"] == '']
# Sizes: all rows, rows with a fips, rows without one.
len(nytimes_counties), len(nytimes_counties_withfips), len(nytimes_counties_nofips)

In [None]:
# Attach gazetteer coordinates to the NY Times counties by fips.
# how="inner" keeps only fips present on both sides; validate="one_to_one"
# makes pandas raise if fips is duplicated in either frame.
# Both frames have "state" and "county" columns, so the merged frame carries
# suffixed copies of them.
nytimes_geo = pd.merge(nytimes_counties_withfips, gazeeter_counties, how = "inner", on = "fips", validate = "one_to_one")
# If the two lengths differ, some NY Times fips have no gazetteer match.
len(nytimes_geo), len(nytimes_counties_withfips)

In [None]:
# NY Times fips values that were dropped by the inner join, i.e. absent from the gazetteer table.
unknown_fips = set(nytimes_counties_withfips["fips"]) - set(nytimes_geo["fips"])
print("Unknown fips in NYTimes data")
unknown_fips

In [None]:
# Full NY Times rows (state, county, fips) for the unmatched fips values.
nytimes_unknown_fips = nytimes_counties_withfips[ nytimes_counties_withfips["fips"].isin(unknown_fips)]

In [None]:
nytimes_unknown_fips

In [None]:
# NY Times rows with an empty fips, ordered by state for manual review.
nytimes_counties_nofips.sort_values("state")

Notes on the locations listed above, with their coordinates:

- New York: covers New York, Kings, Queens, Bronx and Richmond counties; 40.712740° N, 74.005974° W.
- Kansas City (Missouri): a city that can belong to Cass, Clay, Jackson and Platte counties; 39.099724° N, 94.578331° W.
- Joplin (Missouri): a city; 37.0842° N, 94.5133° W.
- Puerto Rico

In [None]:
gazeeter_counties["state"].unique()