In [29]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType
from collections import OrderedDict
import pandas as pd
import numpy as np

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [3]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [4]:
@udf(MapType( StringType(), StringType()))
def ParseGazetteerUDF(line):
    l_str = line.split()
    l = len(l_str)
    l_headers = 10
    n_words = l - l_headers + 1
    county = " ".join( l_str[3:3+n_words] )

    return{
        "state": l_str[0],
        "county" : county ,        
        "fips" : l_str[1], 
        "latitude" : l_str[-2], 
        "longitude" : l_str[-1] 
    }

In [5]:
# def main():
spark = create_spark_session()

nyt_covid = "us-counties.csv"
covid_daily_perfips = spark.read.csv( os.path.join(project_path, "DATA", "COVID",nyt_covid), header = True)
#covid_daily_perfips.printSchema()

# load gazetteer (geographic coordinates for all county fips)
fields = OrderedDict( [
    ( "state" , "string"),
    ("county" , "string"),
    ("fips" , "int"),
    ( "latitude" , "float"), 
    ("longitude" , "float") 
] )

exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

gazetteer = spark.read.csv(os.path.join(project_path, "DATA", "2020_Gaz_counties_national.txt"))\
.withColumn("parsed", ParseGazetteerUDF("_c0")).selectExpr( *exprs)

#gazetteer.printSchema()


In [6]:
nyt_locations = covid_daily_perfips.select("state", "county", "fips").distinct()
fips_multi_locations = nyt_locations.groupby("fips").count().where("count > 1")
fips_multi_locations.show(10)
nyt_locations_with_fips = nyt_locations.where(col("fips").isNotNull())

+----+-----+
|fips|count|
+----+-----+
|null|   56|
+----+-----+



In [7]:
nyt_locations_with_fips.count()

3218

In [8]:
nyt_locations_with_geography = nyt_locations_with_fips.join( gazetteer, ["fips"])

In [9]:
nyt_locations_with_geography.show(10)

+-----+-------------+---------------+-----+---------------+---------+-----------+
| fips|        state|         county|state|         county| latitude|  longitude|
+-----+-------------+---------------+-----+---------------+---------+-----------+
|12117|      Florida|       Seminole|   FL|Seminole County|28.690065|  -81.13197|
|51510|     Virginia|Alexandria city|   VA|Alexandria city|38.819252|  -77.08367|
|54055|West Virginia|         Mercer|   WV|  Mercer County|37.403446|  -81.10645|
|13175|      Georgia|        Laurens|   GA| Laurens County| 32.39322| -82.926315|
|21209|     Kentucky|          Scott|   KY|   Scott County| 38.28571|  -84.57834|
|36063|     New York|        Niagara|   NY| Niagara County| 43.27267|  -78.81294|
|29101|     Missouri|        Johnson|   MO| Johnson County|38.741528|  -93.81187|
|13033|      Georgia|          Burke|   GA|   Burke County| 33.06018|  -82.00016|
|35015|   New Mexico|           Eddy|   NM|    Eddy County|32.457836|-104.306435|
|38051| North Da

In [10]:
nyt_locations_with_geography.count()

3210

In [11]:
nyt_locations.count()

3274

In [12]:
nyt_notmatched = nyt_locations.join(nyt_locations_with_geography, ["fips"], how = "left_anti")

In [13]:
df_nyt_notmatched = nyt_notmatched.toPandas()
df_nyt_notmatched

Unnamed: 0,fips,state,county
0,78030,Virgin Islands,St. Thomas
1,,Massachusetts,Unknown
2,,Minnesota,Unknown
3,,Missouri,Kansas City
4,,Guam,Unknown
...,...,...,...
59,69120,Northern Mariana Islands,Tinian
60,02997,Alaska,Bristol Bay plus Lake and Peninsula
61,69110,Northern Mariana Islands,Saipan
62,02261,Alaska,Valdez-Cordova Census Area


In [14]:
states_gps = pd.read_csv( os.path.join(project_path, "DATA", "US_states_GPS.csv"), sep = ",")
states_gps.loc[states_gps["State"] == "Washington State", "state"]= "Washington"
states_gps

Unnamed: 0,State,Latitude,Longitude,state
0,Wisconsin,44.5,-89.5,
1,West Virginia,39.0,-80.5,
2,Vermont,44.0,-72.699997,
3,Texas,31.0,-100.0,
4,South Dakota,44.5,-100.0,
5,Rhode Island,41.700001,-71.5,
6,Oregon,44.0,-120.5,
7,New York,43.0,-75.0,
8,New Hampshire,44.0,-71.5,
9,Nebraska,41.5,-100.0,


In [23]:
gps_no_county = df_nyt_notmatched.loc[df_nyt_notmatched["county"] =="Unknown"].merge( states_gps, how = "left", left_on = "state", right_on = "State")
gps_no_county = gps_no_county[ ["fips", "county", "state_x", "Latitude", "Longitude"] ]\
    .rename(columns = {"state_x" : "state"})
gps_no_county[ gps_no_county["Latitude"].isna() | gps_no_county["Longitude"].isna()]


Unnamed: 0,fips,county,state,Latitude,Longitude
2,,Unknown,Guam,,
13,,Unknown,Virgin Islands,,
30,,Unknown,Washington,,
35,,Unknown,Puerto Rico,,
38,,Unknown,Northern Mariana Islands,,


In [22]:
print( len(df_nyt_notmatched), len(gps_no_county))

64 53


In [27]:
gps_cities = df_nyt_notmatched[ df_nyt_notmatched["county"] != "Unknown"].copy()
gps_cities

Unnamed: 0,fips,state,county
0,78030.0,Virgin Islands,St. Thomas
3,,Missouri,Kansas City
23,,New York,New York City
32,,Missouri,Joplin
57,78020.0,Virgin Islands,St. John
58,78010.0,Virgin Islands,St. Croix
59,69120.0,Northern Mariana Islands,Tinian
60,2997.0,Alaska,Bristol Bay plus Lake and Peninsula
61,69110.0,Northern Mariana Islands,Saipan
62,2261.0,Alaska,Valdez-Cordova Census Area


In [26]:
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]== "Puerto Rico") , "Latitude"] = 18.2223
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Puerto Rico"), "Longitude"] = -66.4303
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Virgin Islands"), "Latitude"] = 18.34
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Virgin Islands"), "Longitude"] = -64.90
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Guam"), "Latitude"] = 13.4440
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Guam"), "Longitude"] = 144.7671
# GPS coordinates of Saipan 15°11′N 145°45′E
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Northern Mariana Islands"), "Latitude"] = 15.16
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Northern Mariana Islands"), "Longitude"] = 145.7


In [19]:
df_nyt_notmatched

Unnamed: 0,fips,state,county
0,78030,Virgin Islands,St. Thomas
1,,Massachusetts,Unknown
2,,Minnesota,Unknown
3,,Missouri,Kansas City
4,,Guam,Unknown
...,...,...,...
59,69120,Northern Mariana Islands,Tinian
60,02997,Alaska,Bristol Bay plus Lake and Peninsula
61,69110,Northern Mariana Islands,Saipan
62,02261,Alaska,Valdez-Cordova Census Area


In [30]:
gps_cities["latitude"] = np.nan
gps_cities["longitude"] = np.nan
# Cities and metropolitan areas
gps_cities.loc[ gps_cities["county"] == "New York City", "latitude"] = 40.712740
gps_cities.loc[ gps_cities["county"] == "New York City", "longitude"] = -74.005974
gps_cities.loc[ gps_cities["county"] == "Kansas City", "latitude"] = 39.099724
gps_cities.loc[ gps_cities["county"] == "Kansas City", "longitude"] = -94.578331
gps_cities.loc[ gps_cities["county"] == "Joplin", "latitude"] = 37.0842
gps_cities.loc[ gps_cities["county"] == "Joplin", "longitude"] = -94.5133
# FIPS unknown in Census Bureau gazetteer
gps_cities.loc[ gps_cities["county"] == "St. Croix", "latitude"] =17.73
gps_cities.loc[ gps_cities["county"] == "St. Croix", "longitude"] = -64.78
# GPS for Cruz Bay (main city) 18.329936603847486, -64.79413842601294
gps_cities.loc[ gps_cities["county"] == "St. John", "latitude"] = 18.33
gps_cities.loc[ gps_cities["county"] == "St. John", "longitude"] = -64.794
# GPS for Charlotte Amalie (main city) 18.341684050871354, -64.93175706594377
gps_cities.loc[ gps_cities["county"] == "St. Thomas", "latitude"] = 18.34
gps_cities.loc[ gps_cities["county"] == "St. Thomas", "longitude"] = -64.93
gps_cities.loc[ gps_cities["county"] == "Valdez-Cordova Census Area", "latitude"] = 61.129050
# GPS of Valdez
gps_cities.loc[ gps_cities["county"] == "Valdez-Cordova Census Area", "longitude"] = -146.360130
# Saipan 15.216501472234945, 145.72103373823464
gps_cities.loc[ gps_cities["county"] == "Saipan", "latitude"] = 15.27
gps_cities.loc[ gps_cities["county"] == "Saipan", "longitude"] = 145.72
# Tinian 14.978910978711687, 145.63629283555494
gps_cities.loc[ gps_cities["county"] == "Tinian", "latitude"] = 14.98
gps_cities.loc[ gps_cities["county"] == "Tinian", "longitude"] = 145.636