In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import MapType, StringType
from collections import OrderedDict
import pandas as pd

In [2]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

os.chdir(config["PATH"]["project"])
project_path = config["PATH"]["project"]


In [3]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .appName("covid_DB") \
        .getOrCreate()
    
    return spark

In [4]:
@udf(MapType( StringType(), StringType()))
def ParseGazetteerUDF(line):
    l_str = line.split()
    l = len(l_str)
    l_headers = 10
    n_words = l - l_headers + 1
    county = " ".join( l_str[3:3+n_words] )

    return{
        "state": l_str[0],
        "county" : county ,        
        "fips" : l_str[1], 
        "latitude" : l_str[-2], 
        "longitude" : l_str[-1] 
    }

In [5]:
# def main():
spark = create_spark_session()

nyt_covid = "us-counties.csv"
covid_daily_perfips = spark.read.csv( os.path.join(project_path, "DATA", "COVID",nyt_covid), header = True)
#covid_daily_perfips.printSchema()

# load gazetteer (geographic coordinates for all county fips)
fields = OrderedDict( [
    ( "state" , "string"),
    ("county" , "string"),
    ("fips" , "int"),
    ( "latitude" , "float"), 
    ("longitude" , "float") 
] )

exprs = [ f"CAST(parsed['{field}'] AS {fld_type}) AS {field}" for field, fld_type in fields.items() ]

gazetteer = spark.read.csv(os.path.join(project_path, "DATA", "2020_Gaz_counties_national.txt"))\
.withColumn("parsed", ParseGazetteerUDF("_c0")).selectExpr( *exprs)

#gazetteer.printSchema()


In [6]:
nyt_locations = covid_daily_perfips.select("state", "county", "fips").distinct()
fips_multi_locations = nyt_locations.groupby("fips").count().where("count > 1")
fips_multi_locations.show(10)
nyt_locations_with_fips = nyt_locations.where(col("fips").isNotNull())

+----+-----+
|fips|count|
+----+-----+
|null|   56|
+----+-----+



In [7]:
nyt_locations_with_fips.count()

3218

In [8]:
nyt_locations_with_geography = nyt_locations_with_fips.join( gazetteer, ["fips"])

In [9]:
nyt_locations_with_geography.show(10)

+-----+-------------+---------------+-----+---------------+---------+-----------+
| fips|        state|         county|state|         county| latitude|  longitude|
+-----+-------------+---------------+-----+---------------+---------+-----------+
|12117|      Florida|       Seminole|   FL|Seminole County|28.690065|  -81.13197|
|51510|     Virginia|Alexandria city|   VA|Alexandria city|38.819252|  -77.08367|
|54055|West Virginia|         Mercer|   WV|  Mercer County|37.403446|  -81.10645|
|13175|      Georgia|        Laurens|   GA| Laurens County| 32.39322| -82.926315|
|21209|     Kentucky|          Scott|   KY|   Scott County| 38.28571|  -84.57834|
|36063|     New York|        Niagara|   NY| Niagara County| 43.27267|  -78.81294|
|29101|     Missouri|        Johnson|   MO| Johnson County|38.741528|  -93.81187|
|13033|      Georgia|          Burke|   GA|   Burke County| 33.06018|  -82.00016|
|35015|   New Mexico|           Eddy|   NM|    Eddy County|32.457836|-104.306435|
|38051| North Da

In [10]:
nyt_locations_with_geography.count()

3210

In [11]:
nyt_locations.count()

3274

In [12]:
nyt_notmatched = nyt_locations.join(nyt_locations_with_geography, ["fips"], how = "left_anti")

In [None]:
df_nyt_notmatched = nyt_notmatched.toPandas()
df_nyt_notmatched

In [14]:
nyt_notmatched =nyt_notmatched.withColumn("latitude", lit(None).cast(StringType())).withColumn( "longitude", lit(None).cast(StringType()))

In [15]:
nyt_notmatched.show()

+-----+--------------+-----------+--------+---------+
| fips|         state|     county|latitude|longitude|
+-----+--------------+-----------+--------+---------+
|78030|Virgin Islands| St. Thomas|    null|     null|
| null| Massachusetts|    Unknown|    null|     null|
| null|     Minnesota|    Unknown|    null|     null|
| null|      Missouri|Kansas City|    null|     null|
| null|          Guam|    Unknown|    null|     null|
| null|      Maryland|    Unknown|    null|     null|
| null|         Maine|    Unknown|    null|     null|
| null|   Connecticut|    Unknown|    null|     null|
| null|    New Jersey|    Unknown|    null|     null|
| null|North Carolina|    Unknown|    null|     null|
| null|      Colorado|    Unknown|    null|     null|
| null|       Vermont|    Unknown|    null|     null|
| null|      Oklahoma|    Unknown|    null|     null|
| null|        Oregon|    Unknown|    null|     null|
| null|  South Dakota|    Unknown|    null|     null|
| null|Virgin Islands|    Un

In [16]:
nyt_notmatched.printSchema()

root
 |-- fips: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [17]:
states_gps = spark.read.csv( os.path.join(project_path, "DATA", "US_states_GPS.csv"), sep = ",", header = True)#\
    #.rename(columns ={'Unnamed: 0': "state"})

In [18]:
states_gps.show()

+-------------+---------+-----------+
|        State| Latitude|  Longitude|
+-------------+---------+-----------+
|    Wisconsin|     44.5|      -89.5|
|West Virginia|       39|      -80.5|
|      Vermont|       44| -72.699997|
|        Texas|       31|       -100|
| South Dakota|     44.5|       -100|
| Rhode Island|41.700001|      -71.5|
|       Oregon|       44|     -120.5|
|     New York|       43|        -75|
|New Hampshire|       44|      -71.5|
|     Nebraska|     41.5|       -100|
|       Kansas|     38.5|        -98|
|  Mississippi|       33|        -90|
|     Illinois|       40|        -89|
|     Delaware|       39|      -75.5|
|  Connecticut|41.599998| -72.699997|
|     Arkansas|34.799999| -92.199997|
|      Indiana|40.273502| -86.126976|
|     Missouri|38.573936|  -92.60376|
|      Florida|27.994402| -81.760254|
|       Nevada|39.876019|-117.224121|
+-------------+---------+-----------+
only showing top 20 rows



In [19]:
nyt_notmatched.where(toto.state.contains("ashington")).show()

NameError: name 'toto' is not defined

In [None]:
states_gps.where(states_gps.State.contains("ashington")).show()

In [None]:
fix_state = udf( lambda x : "Washington" if x == "Washington State" else x )
states_gps = states_gps.withColumn("State", fix_state(states_gps.State))

In [None]:
nyt_notmatched_fix1 = nyt_notmatched.join(states_gps, (nyt_notmatched["state"] == states_gps["State"]) & (nyt_notmatched["county"] == "Unknown"), "left") 

In [None]:
nyt_notmatched_fix1.show()

In [None]:
nyt_notmatched_fix1.count()