In [29]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType
from collections import OrderedDict
import pandas as pd
import numpy as np

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [3]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [4]:
@udf(MapType( StringType(), StringType()))
def ParseGazetteerUDF(line):
    l_str = line.split()
    l = len(l_str)
    l_headers = 10
    n_words = l - l_headers + 1
    county = " ".join( l_str[3:3+n_words] )

    return{
        "state": l_str[0],
        "county" : county ,        
        "fips" : l_str[1], 
        "latitude" : l_str[-2], 
        "longitude" : l_str[-1] 
    }

In [5]:
# def main():
spark = create_spark_session()

nyt_covid = "us-counties.csv"
covid_daily_perfips = spark.read.csv( os.path.join(project_path, "DATA", "COVID",nyt_covid), header = True)
#covid_daily_perfips.printSchema()

# load gazetteer (geographic coordinates for all county fips)
fields = OrderedDict( [
    ( "state" , "string"),
    ("county" , "string"),
    ("fips" , "int"),
    ( "latitude" , "float"), 
    ("longitude" , "float") 
] )

exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

gazetteer = spark.read.csv(os.path.join(project_path, "DATA", "2020_Gaz_counties_national.txt"))\
.withColumn("parsed", ParseGazetteerUDF("_c0")).selectExpr( *exprs)

#gazetteer.printSchema()


In [67]:
nyt_locations = covid_daily_perfips.select("state", "county", "fips").distinct()
fips_multi_locations = nyt_locations.groupby("fips").count().where("count > 1")
fips_multi_locations.show(10)
nyt_locations_with_fips = nyt_locations.where(col("fips").isNotNull())

+----+-----+
|fips|count|
+----+-----+
|null|   56|
+----+-----+



In [7]:
nyt_locations_with_fips.count()

3218

In [72]:
nyt_locations_with_geography = nyt_locations_with_fips.alias("a").join( gazetteer.alias("b"), ["fips"])\
    .select("a.*", "b.latitude", "b.longitude" )

+-----+-------------+---------------+---------+----------+
| fips|        state|         county| latitude| longitude|
+-----+-------------+---------------+---------+----------+
|12117|      Florida|       Seminole|28.690065| -81.13197|
|51510|     Virginia|Alexandria city|38.819252| -77.08367|
|54055|West Virginia|         Mercer|37.403446| -81.10645|
|13175|      Georgia|        Laurens| 32.39322|-82.926315|
|21209|     Kentucky|          Scott| 38.28571| -84.57834|
+-----+-------------+---------------+---------+----------+
only showing top 5 rows



In [12]:
nyt_notmatched = nyt_locations.join(nyt_locations_with_geography, ["fips"], how = "left_anti")

In [13]:
df_nyt_notmatched = nyt_notmatched.toPandas()

Unnamed: 0,fips,state,county
0,78030,Virgin Islands,St. Thomas
1,,Massachusetts,Unknown
2,,Minnesota,Unknown
3,,Missouri,Kansas City
4,,Guam,Unknown
...,...,...,...
59,69120,Northern Mariana Islands,Tinian
60,02997,Alaska,Bristol Bay plus Lake and Peninsula
61,69110,Northern Mariana Islands,Saipan
62,02261,Alaska,Valdez-Cordova Census Area


In [48]:
states_gps = pd.read_csv( os.path.join(project_path, "DATA", "US_states_GPS.csv"), sep = ",")
states_gps.columns = states_gps.columns.str.lower()
states_gps.loc[states_gps["state"] == "Washington State", "state"]= "Washington"

In [54]:
gps_no_county = df_nyt_notmatched.loc[df_nyt_notmatched["county"] =="Unknown"].merge( states_gps, how = "left", on = "state")
gps_no_county[ gps_no_county["latitude"].isna() | gps_no_county["longitude"].isna()]

Unnamed: 0,fips,state,county,latitude,longitude
2,,Guam,Unknown,,
13,,Virgin Islands,Unknown,,
35,,Puerto Rico,Unknown,,
38,,Northern Mariana Islands,Unknown,,


In [55]:
gps_cities = df_nyt_notmatched[ df_nyt_notmatched["county"] != "Unknown"].copy()

In [56]:
# GPS of unincorporated territories
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]== "Puerto Rico") , "latitude"] = 18.2223
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Puerto Rico"), "longitude"] = -66.4303
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Virgin Islands"), "latitude"] = 18.34
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Virgin Islands"), "longitude"] = -64.90
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Guam"), "latitude"] = 13.4440
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Guam"), "longitude"] = 144.7671
# GPS coordinates of Saipan 15°11′N 145°45′E
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Northern Mariana Islands"), "latitude"] = 15.16
gps_no_county.loc[ (gps_no_county["county"] == "Unknown") & (gps_no_county["state"]=="Northern Mariana Islands"), "longitude"] = 145.7


In [57]:
gps_cities["latitude"] = np.nan
gps_cities["longitude"] = np.nan
# Cities and metropolitan areas
gps_cities.loc[ gps_cities["county"] == "New York City", "latitude"] = 40.712740
gps_cities.loc[ gps_cities["county"] == "New York City", "longitude"] = -74.005974
gps_cities.loc[ gps_cities["county"] == "Kansas City", "latitude"] = 39.099724
gps_cities.loc[ gps_cities["county"] == "Kansas City", "longitude"] = -94.578331
gps_cities.loc[ gps_cities["county"] == "Joplin", "latitude"] = 37.0842
gps_cities.loc[ gps_cities["county"] == "Joplin", "longitude"] = -94.5133
# FIPS unknown in Census Bureau gazetteer
gps_cities.loc[ gps_cities["county"] == "St. Croix", "latitude"] =17.73
gps_cities.loc[ gps_cities["county"] == "St. Croix", "longitude"] = -64.78
# GPS for Cruz Bay (main city) 18.329936603847486, -64.79413842601294
gps_cities.loc[ gps_cities["county"] == "St. John", "latitude"] = 18.33
gps_cities.loc[ gps_cities["county"] == "St. John", "longitude"] = -64.794
# GPS for Charlotte Amalie (main city) 18.341684050871354, -64.93175706594377
gps_cities.loc[ gps_cities["county"] == "St. Thomas", "latitude"] = 18.34
gps_cities.loc[ gps_cities["county"] == "St. Thomas", "longitude"] = -64.93
gps_cities.loc[ gps_cities["county"] == "Valdez-Cordova Census Area", "latitude"] = 61.129050
# GPS of Valdez
gps_cities.loc[ gps_cities["county"] == "Valdez-Cordova Census Area", "longitude"] = -146.360130
# Saipan 15.216501472234945, 145.72103373823464
gps_cities.loc[ gps_cities["county"] == "Saipan", "latitude"] = 15.27
gps_cities.loc[ gps_cities["county"] == "Saipan", "longitude"] = 145.72
# Tinian 14.978910978711687, 145.63629283555494
gps_cities.loc[ gps_cities["county"] == "Tinian", "latitude"] = 14.98
gps_cities.loc[ gps_cities["county"] == "Tinian", "longitude"] = 145.636

In [58]:
df_fixed_gps = pd.concat( [gps_no_county, gps_cities] )
len(df_fixed_gps)

64

In [75]:
fixed_gps = spark.createDataFrame(df_fixed_gps)
fixed_gps.printSchema()

root
 |-- fips: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [76]:
nyt_locations_with_geography.printSchema()

root
 |-- fips: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)



In [80]:
nyt_geography = nyt_locations_with_geography.union(fixed_gps)

In [83]:
nyt_geography.count()

3274

In [88]:
nyt_geography.write.partitionBy("state").format("parquet").save(out_path)


NameError: name 'out_path' is not defined