In [1]:
import numpy as np
import pandas as pd
import os
import configparser

In [4]:
# Load project settings from the local INI file.
# ConfigParser.read() returns the list of files it managed to parse (shown as
# this cell's output) and does NOT raise when the file is missing; a missing
# file only surfaces later as a KeyError on config["PATH"].
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [6]:
project_path = config["PATH"]["PROJECT"]
print(project_path)

/home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT


# Functions to parse geographic info from various data sources

## Weather stations

In [7]:
def parse_stations_(name):
    """Parse a whitespace-delimited weather-station listing into rows.

    Every non-blank line is split on whitespace. The first five tokens are
    kept as separate fields (the caller labels them ID, latitude, longitude,
    elevation and state); all remaining tokens are joined with single spaces
    into one final field (the station name).

    Blank or whitespace-only lines are skipped: they would otherwise produce
    a one-element row that breaks the 6-column DataFrame built by the caller.

    Parameters
    ----------
    name : str or path-like
        Path of the station file to read.

    Returns
    -------
    list of list of str
        One row per non-blank line, in file order. A line with at least five
        tokens yields six fields; the last one is "" when there is no name.
    """
    l_res = []
    with open(name) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Nothing to parse on this line (e.g. trailing empty line).
                continue
            # Fixed leading fields + free-text name rebuilt from the rest.
            l_res.append(tokens[:5] + [" ".join(tokens[5:])])
    return l_res

In [8]:
def parse_stations(name):
    """Read a station listing file and return it as a DataFrame.

    The rows come from parse_stations_(name) and are labelled, in order,
    ID, latitude, longitude, elevation, state and name. Values are left
    exactly as parse_stations_ produced them (no type conversion here).
    """
    station_columns = ["ID", "latitude", "longitude", "elevation", "state", "name"]
    return pd.DataFrame(parse_stations_(name), columns = station_columns)

## geographic info from NYTimes covid data

In [9]:
def NYT_counties_(name):
    """Collect the distinct (state, county, fips) triples of a comma-separated file.

    The first line is treated as a header and ignored. On every other
    non-blank line the fields at positions 2, 1 and 3 (0-based) are taken as
    state, county and fips respectively.

    Compared with a bare ``line.split(",")`` this version
    * skips blank lines instead of raising IndexError on them, and
    * removes the line terminator first, so the fips field stays clean even
      when it is the last column of the line.

    Parameters
    ----------
    name : str or path-like
        Path of the file to read.

    Returns
    -------
    dict
        Maps each (state, county, fips) tuple to True. Keys appear in order
        of first occurrence in the file; the dict is used as an ordered set.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than four comma-separated fields.
    """
    d_res = {}
    with open(name) as f :
        f.readline()  # header line, not needed
        for line in f :
            if not line.strip():
                continue
            l_str = line.rstrip("\r\n").split(",")
            # state, county, fips
            d_res.setdefault( (l_str[2], l_str[1], l_str[3]), True )
    return d_res


In [10]:
def NYT_counties(name):
    """Return the distinct counties found in a NY Times file as a DataFrame.

    The keys of the dict built by NYT_counties_(name) become the rows; the
    three tuple positions are labelled state, county and fips.
    """
    county_keys = NYT_counties_(name)
    county_columns = ["state", "county", "fips"]
    return pd.DataFrame(county_keys.keys(), columns = county_columns)

## US counties geographic info from the Gazetteer

In [11]:
def counties_lat_long_(name):
    """Extract state, county name, fips, latitude and longitude from a Gazetteer-style file.

    The file is read as whitespace-separated tokens. The header line only
    serves to count the number of columns. On each data line:

    * token 0 is the state and token 1 the fips code,
    * the last two tokens are latitude and longitude,
    * the county name starts at token 3 and may span several words; its
      word count is the number of tokens exceeding the header column count,
      plus one.

    Blank lines are skipped (they used to raise IndexError).

    Side effect: prints the number of header columns.

    Parameters
    ----------
    name : str or path-like
        Path of the file to read.

    Returns
    -------
    list of list of str
        One [state, county, fips, latitude, longitude] row per data line.
    """
    with open(name) as f :
        l_headers = len(f.readline().split())
        print("len of headers : ", l_headers )
        l_res = []
        for line in f :
            l_str = line.split()
            if not l_str:
                # Empty or whitespace-only line: nothing to extract.
                continue
            # Extra tokens beyond the header width belong to a multi-word name.
            n_words = len(l_str) - l_headers + 1
            county = " ".join( l_str[3:3+n_words] )
            l_res.append( [l_str[0], county, l_str[1], l_str[-2], l_str[-1]] )
        return l_res


In [12]:
def counties_lat_long(name):
    """Return the county rows of a Gazetteer-style file as a DataFrame.

    Rows are produced by counties_lat_long_(name) and labelled, in order,
    state, county, fips, latitude and longitude.
    """
    gazetteer_columns = ["state", "county", "fips", "latitude", "longitude"]
    return pd.DataFrame(counties_lat_long_(name), columns = gazetteer_columns)

# Read data

In [13]:
data_root = os.path.join(project_path, 'DATA/')

## Weather stations location

In [14]:
#df_stations = parse_stations( os.path.join(data_root, 'WEATHER/US_ghcnd_stations.txt'))
df_stations = parse_stations( os.path.join(data_root, 'WEATHER/US_PuertoRico_stations.txt'))
df_stations.head()

Unnamed: 0,ID,latitude,longitude,elevation,state,name
0,US009052008,43.7333,-96.6333,482.0,SD,SIOUX FALLS (ENVIRON. CANADA)
1,US10RMHS145,40.5268,-105.1113,1569.1,CO,RMHS 1.6 SSW
2,US10adam001,40.568,-98.5069,598.0,NE,JUNIATA 1.5 S
3,US10adam002,40.5093,-98.5493,601.1,NE,JUNIATA 6.0 SSW
4,US10adam003,40.4663,-98.6537,615.1,NE,HOLSTEIN 0.1 NW


In [15]:
df_puertorico =parse_stations( os.path.join(data_root, 'WEATHER/PuertoRico_ghcnd_stations.txt'))
df_puertorico.head()

Unnamed: 0,ID,latitude,longitude,elevation,state,name
0,RQ1PRAB0001,18.2608,-66.1416,268.2,PR,AGUAS BUENAS 2.4 W
1,RQ1PRAC0001,18.4414,-66.6589,29.6,PR,ARECIBO 5.2 ESE
2,RQ1PRAC0002,18.4071,-66.7603,152.1,PR,CUCHI 0.3 ENE
3,RQ1PRAL0002,18.4774,-67.1437,71.3,PR,AGUADILLA 2.7 NNE
4,RQ1PRAL0003,18.5083,-67.1124,39.9,PR,AGUADILLA 5.5 NNE


In [16]:
len(df_stations), len(df_puertorico)


(65392, 222)

## COVID deaths and cases per county per day, from the New York Times

In [17]:
nytimes_counties = NYT_counties( os.path.join(data_root, "us-counties.txt"))
print( "number of counties from NY Times:", len(nytimes_counties))
nytimes_counties.head()

number of counties from NY Times: 3274


Unnamed: 0,state,county,fips
0,Washington,Snohomish,53061
1,Illinois,Cook,17031
2,California,Orange,6059
3,Arizona,Maricopa,4013
4,California,Los Angeles,6037


## Counties fips to geographic coordinates (latitude, longitude) 


In [18]:
gazeeter_counties= counties_lat_long( os.path.join(data_root,"2020_Gaz_counties_national.txt"))
print("number of counties from Gazeeter : ", len(gazeeter_counties))
gazeeter_counties.head()

len of headers :  10
number of counties from Gazeeter :  3221


Unnamed: 0,state,county,fips,latitude,longitude
0,AL,Autauga County,1001,32.532237,-86.64644
1,AL,Baldwin County,1003,30.659218,-87.746067
2,AL,Barbour County,1005,31.870253,-85.405104
3,AL,Bibb County,1007,33.015893,-87.127148
4,AL,Blount County,1009,33.977358,-86.56644


In [19]:
class DistanceToStation :
    """Nearest-station lookup based on the haversine formula.

    The station table is copied at construction time, so the caller's frame
    is never modified. In the internal copy (``self.df_stations``):

    * ``latitude_degrees`` / ``longitude_degrees`` keep the original values,
    * ``latitude`` / ``longitude`` are overwritten with the values in radians,
    * ``elevation`` is converted to float,
    * ``cos_latitude`` caches cos(latitude) for reuse in every query.
    """

    def __init__(self, df_stations):
        ''' init computations for latitude and longitude

        Parameters
        ----------
        df_stations : pandas.DataFrame
            Must provide "latitude", "longitude" (degrees) and "elevation"
            columns whose values are convertible to float. Any index is
            accepted.
        '''
        self.df_stations = df_stations.copy(deep = True)
        self.df_stations["latitude_degrees"] = self.df_stations["latitude"]
        self.df_stations["longitude_degrees"] = self.df_stations["longitude"]

        self.df_stations["elevation"] = self.df_stations["elevation"].astype(float)
        # Degrees -> radians, same arithmetic order as in closest_station().
        self.df_stations["latitude"] = self.df_stations["latitude"].astype(float) * np.pi / 180.
        self.df_stations["longitude"] = self.df_stations["longitude"].astype(float) * np.pi / 180.
        self.df_stations["cos_latitude"] = np.cos(self.df_stations["latitude"])

    def closest_station( self, lati, longi):
        """Find the station closest to the point (lati, longi).

        Parameters
        ----------
        lati, longi : float or str
            Latitude and longitude of the query point, in degrees.

        Returns
        -------
        (label, row)
            ``label`` is the index label of the closest station in
            ``self.df_stations`` and ``row`` is that station's row. The row
            also carries the intermediate columns of this query:
            delta_lat_term, delta_long_term, a, sqrt_a, sqrt_1_a and angle.

        Notes
        -----
        The intermediate columns are stored on ``self.df_stations`` and are
        overwritten by every call. ``angle`` is half of the central angle
        between the two points, so ordering by it is the same as ordering by
        great-circle distance.
        """
        latitude = float(lati) * np.pi / 180.
        longitude = float(longi) * np.pi / 180.
        cos_lat = np.cos(latitude)
        stations = self.df_stations
        # Haversine formula
        stations["delta_lat_term"] = ( np.sin( (stations["latitude"] - latitude) * 0.5 ) )**2
        stations["delta_long_term"] = ( np.sin( (stations["longitude"] - longitude) * 0.5) )**2
        stations["a"] = stations["delta_lat_term"] + stations["delta_long_term"] \
                        * cos_lat * stations["cos_latitude"]
        # Vectorised square roots, computed once and reused for the angle.
        stations["sqrt_a"] = np.sqrt(stations["a"])
        stations["sqrt_1_a"] = np.sqrt(1. - stations["a"])
        stations["angle"] = np.arctan2( stations["sqrt_a"], stations["sqrt_1_a"] )
        # idxmin() yields an index LABEL, so the row must be fetched with
        # .loc; positional .iloc is only right for a default RangeIndex.
        closest = stations["angle"].idxmin()
        return closest, stations.loc[closest]
        

# Map fips to the closest weather station

for each county, find the station closest to its 'center'


In [20]:
station_distance = DistanceToStation(df_stations);

## Test with first county

In [21]:
from collections import OrderedDict
# Smoke test: run the nearest-station lookup for the first Gazetteer row only
# (the loop is left after one iteration by the break below).
for county in gazeeter_counties.itertuples(index = False):
    d_county = county._asdict()
    # latitude/longitude are passed exactly as stored in the frame;
    # closest_station() converts them with float().
    idx_closest, closest = station_distance.closest_station( d_county["latitude"], d_county["longitude"] )
    print(d_county)
    print(idx_closest)
    print(closest)
    break

OrderedDict([('state', 'AL'), ('county', 'Autauga County'), ('fips', '01001'), ('latitude', '32.532237'), ('longitude', '-86.64644')])
38521
ID                        USC00010440
latitude                     0.566719
longitude                    -1.51285
elevation                          61
state                              AL
name                 AUTAUGAVILLE 3 N
latitude_degrees              32.4706
longitude_degrees            -86.6800
cos_latitude                 0.843667
delta_lat_term            2.89319e-07
delta_long_term           8.57706e-08
a                         3.50327e-07
sqrt_a                    0.000591884
sqrt_1_a                            1
angle                     0.000591884
Name: 38521, dtype: object


In [22]:
type(closest)

pandas.core.series.Series

In [23]:
type(closest["latitude"]), type(closest["ID"])

(numpy.float64, str)

In [24]:
closest.loc[["ID", "latitude"]]

ID          USC00010440
latitude       0.566719
Name: 38521, dtype: object

In [25]:
len(df_stations)

65392

## Test with Washington DC (small state)

In [26]:
df_WashingtonDC_stations = df_stations[ df_stations["state"] == "DC" ]
df_WashingtonDC_stations

Unnamed: 0,ID,latitude,longitude,elevation,state,name
8467,US1DCDC0001,38.9171,-76.9626,17.1,DC,WASHINGTON 3.0 ENE
8468,US1DCDC0002,38.9574,-77.0337,85.0,DC,WASHINGTON 3.7 NNW
8469,US1DCDC0003,38.9377,-77.0991,91.1,DC,WASHINGTON 5.0 WNW
8470,US1DCDC0006,38.8928,-76.9834,21.9,DC,WASHINGTON 2.0 ESE
8471,US1DCDC0007,38.8943,-76.9982,25.0,DC,WASHINGTON 1.2 SE
8472,US1DCDC0008,38.8901,-76.9854,27.1,DC,WASHINGTON 2.0 ESE
8473,US1DCDC0009,38.8766,-77.0224,3.0,DC,WASHINGTON 2.0 SSW
8474,US1DCDC0010,38.9773,-77.0245,88.1,DC,WASHINGTON 5.0 N
8475,US1DCDC0011,38.9303,-77.0831,81.4,DC,WASHINGTON 4.0 WNW
8476,US1DCDC0014,38.9581,-77.0817,106.4,DC,WASHINGTON 5.1 NW


In [27]:
# Mean position of the DC stations, as a sanity check against the Gazetteer
# coordinates of the District. The columns are cast to float before averaging.
average_lat = np.average(df_WashingtonDC_stations["latitude"].astype(float))
average_long = np.average(df_WashingtonDC_stations["longitude"].astype(float))
average_lat, average_long

(38.92026, -77.02996)

In [28]:
df_WashingtonDC_gazeeter = gazeeter_counties[ gazeeter_counties["state"] == "DC"]
df_WashingtonDC_gazeeter

Unnamed: 0,state,county,fips,latitude,longitude
320,DC,District of Columbia,11001,38.904247,-77.016517


In [29]:
county = df_WashingtonDC_gazeeter.iloc[0]
i_closest, closest = station_distance.closest_station( float(county["latitude"]), float(county["longitude"]) )


In [30]:
closest

ID                         US1DCDC0007
latitude                      0.678834
longitude                     -1.34387
elevation                           25
state                               DC
name                 WASHINGTON 1.2 SE
latitude_degrees               38.8943
longitude_degrees             -76.9982
cos_latitude                  0.778306
delta_lat_term             7.53493e-09
delta_long_term            2.55507e-08
a                          2.30104e-08
sqrt_a                     0.000151692
sqrt_1_a                             1
angle                      0.000151692
Name: 8471, dtype: object

## All county fips

In [31]:
def func_closest_station(df_row):
    """Return the descriptive fields of the station nearest to one county row.

    df_row must provide "latitude" and "longitude"; the lookup is delegated
    to the module-level station_distance object.

    NOTE: DistanceToStation.__init__ overwrites the station "latitude" and
    "longitude" columns with radians, so these two returned fields are in
    radians (the degree values live in latitude_degrees / longitude_degrees).
    """
    station_fields = ["ID", "latitude", "longitude","elevation","state","name"]
    _, station = station_distance.closest_station( df_row["latitude"], df_row["longitude"] )
    return station.loc[station_fields]

# One nearest-station lookup per county row, then persist the mapping.
res = gazeeter_counties.apply(func_closest_station, axis = 1)
res.to_csv("county_station.csv")
    

In [32]:
gazeeter_counties.head()

Unnamed: 0,state,county,fips,latitude,longitude
0,AL,Autauga County,1001,32.532237,-86.64644
1,AL,Baldwin County,1003,30.659218,-87.746067
2,AL,Barbour County,1005,31.870253,-85.405104
3,AL,Bibb County,1007,33.015893,-87.127148
4,AL,Blount County,1009,33.977358,-86.56644


In [33]:
gazeeter_counties.to_csv("counties.csv")

In [34]:
len(station_distance.df_stations)

65392

In [35]:
fips_puertorico = gazeeter_counties[gazeeter_counties["state"] == 'PR']
fips_puertorico

Unnamed: 0,state,county,fips,latitude,longitude
3143,PR,Adjuntas Municipio,72001,18.181611,-66.758165
3144,PR,Aguada Municipio,72003,18.375673,-67.185745
3145,PR,Aguadilla Municipio,72005,18.480191,-67.143762
3146,PR,Aguas Buenas Municipio,72007,18.256524,-66.128496
3147,PR,Aibonito Municipio,72009,18.130723,-66.26446
...,...,...,...,...,...
3216,PR,Vega Baja Municipio,72145,18.455128,-66.397883
3217,PR,Vieques Municipio,72147,18.125418,-65.432474
3218,PR,Villalba Municipio,72149,18.130718,-66.472244
3219,PR,Yabucoa Municipio,72151,18.059858,-65.859871


In [36]:
res = fips_puertorico.apply(func_closest_station, axis = 1)

In [37]:
res.to_csv("county_station_withPuertoRico.csv")

# Map NY Times locations to Gazetteer locations

Check that all FIPS codes are unique in the Gazetteer data

In [38]:
gaz_counts = gazeeter_counties["fips"].value_counts()
len( gaz_counts[gaz_counts>1]) # 0 : all fips are unique !

0

Check whether all fips are unique in New York Times data

In [39]:
fips_counts = nytimes_counties["fips"].value_counts()
fips_counts[ fips_counts >1 ] # several empty fips

    56
Name: fips, dtype: int64

Remove empty fips so that we can join with a one-to-one match

In [40]:
nytimes_counties_withfips = nytimes_counties[ nytimes_counties["fips"] != '']
nytimes_counties_nofips = nytimes_counties[nytimes_counties["fips"] == '']
len(nytimes_counties), len(nytimes_counties_withfips), len(nytimes_counties_nofips)

(3274, 3218, 56)

In [41]:
# Inner join on fips: only NY Times counties that also appear in the Gazetteer
# table are kept. validate="one_to_one" makes pandas raise if a fips value is
# duplicated on either side, which is why empty fips were filtered out first.
nytimes_geo = pd.merge(nytimes_counties_withfips, gazeeter_counties, how = "inner", on = "fips", validate = "one_to_one")
# Compare row counts to see how many NY Times counties found no match.
len(nytimes_geo), len(nytimes_counties_withfips)

(3212, 3218)

In [42]:
unknown_fips = set(nytimes_counties_withfips["fips"]) - set(nytimes_geo["fips"])
print("Unknown fips in NYTimes data")
unknown_fips

Unknown fips in NYTimes data


{'02261', '69110', '69120', '78010', '78020', '78030'}

In [43]:
nytimes_unknown_fips = nytimes_counties_withfips[ nytimes_counties_withfips["fips"].isin(unknown_fips)]

In [44]:
nytimes_unknown_fips

Unnamed: 0,state,county,fips
2493,Virgin Islands,St. Croix,78010
2494,Virgin Islands,St. John,78020
2495,Virgin Islands,St. Thomas,78030
2990,Alaska,Valdez-Cordova Census Area,2261
3218,Northern Mariana Islands,Saipan,69110
3219,Northern Mariana Islands,Tinian,69120


In [45]:
nytimes_counties_nofips.sort_values("state")

Unnamed: 0,state,county,fips
2229,Alaska,Unknown,
1228,Arizona,Unknown,
772,Arkansas,Unknown,
1034,California,Unknown,
1035,Colorado,Unknown,
1987,Connecticut,Unknown,
2618,Delaware,Unknown,
1040,Florida,Unknown,
798,Georgia,Unknown,
388,Guam,Unknown,


- New York City: covers New York, Kings, Queens, Bronx and Richmond counties; coordinates 40.712740° N, 74.005974° W
- Kansas City (Missouri): the city can belong to Cass, Clay, Jackson and Platte counties; latitude 39.099724, longitude -94.578331
- Joplin (Missouri): city; coordinates 37.0842° N, 94.5133° W
- Puerto Rico

In [46]:
gazeeter_counties["state"].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR'], dtype=object)