In [1]:
import numpy as np
import pandas as pd
import os
import configparser

In [2]:
# Load the project settings from capstone.cfg (path is relative to the current working directory).
config = configparser.ConfigParser()
# read() returns the list of files it managed to parse; as the last expression of the cell
# that list is displayed, which shows whether the file was found.
config.read("capstone.cfg")

['capstone.cfg']

In [3]:
# Project root directory, taken from the PROJECT key of the [PATH] section of the config file.
project_path = config["PATH"]["PROJECT"]
print(project_path)

/home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT


# Parsing functions

## US counties geographic info from the Gazetteer

In [120]:
def counties_lat_long_(name):
    """Parse a Gazetteer counties text file into a list of location rows.

    The first line is read as the header; only its number of
    whitespace-separated fields is used (and printed). Every following
    non-blank line is split on whitespace and reduced to
    ``[state, county, fips, latitude, longitude]``.

    Parameters
    ----------
    name : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of list
        One ``[state, county, fips, latitude, longitude]`` entry per data
        line. ``fips`` is converted to ``int``; the other values are kept as
        the strings found in the file.

    Raises
    ------
    ValueError
        If the second field of a data line cannot be converted to ``int``.
    """
    with open(name) as f:
        n_header_fields = len(f.readline().split())
        print("len of headers : ", n_header_fields )
        rows = []
        for line in f:
            fields = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file) hold
            # no record; indexing into them would raise IndexError.
            if not fields:
                continue
            state, fips = fields[0], fields[1]
            latitude, longitude = fields[-2], fields[-1]
            # A county name may span several words, so a data line can have
            # more fields than the header. The surplus gives the number of
            # words of the name, which starts at index 3.
            n_words = len(fields) - n_header_fields + 1
            county = " ".join(fields[3:3 + n_words])
            rows.append([state, county, int(fips), latitude, longitude])
        return rows


In [121]:
def counties_lat_long(name):
    """Load the Gazetteer counties file at ``name`` into a DataFrame.

    The rows come from ``counties_lat_long_``; the resulting frame has the
    columns ``state``, ``county``, ``fips``, ``latitude`` and ``longitude``.
    """
    column_names = ["state", "county", "fips", "latitude", "longitude"]
    rows = counties_lat_long_(name)
    return pd.DataFrame(rows, columns=column_names)

# Read data

In [122]:
# Directory holding the input data files, located under the project root.
data_root = os.path.join(project_path, 'DATA/')

In [123]:
# Load the New York Times per-county COVID data.
nyt_path = os.path.join(data_root, "us-counties.txt")
# fips and deaths are cast to the nullable integer dtype so they stay integers
# even where values are missing.
nullable_int_columns = {"fips": 'Int64', "deaths": 'Int64'}
nyt = pd.read_csv(nyt_path).astype(nullable_int_columns)
nyt.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [124]:
# Column dtypes and non-null counts of the NYT data.
nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988671 entries, 0 to 988670
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    988671 non-null  object
 1   county  988671 non-null  object
 2   state   988671 non-null  object
 3   fips    979486 non-null  Int64 
 4   cases   988671 non-null  int64 
 5   deaths  967414 non-null  Int64 
dtypes: Int64(2), int64(1), object(3)
memory usage: 47.1+ MB


In [125]:
# Load the county coordinates from the 2020 national Gazetteer file.
gazetteer_path = os.path.join(data_root, "2020_Gaz_counties_national.txt")
gazeeter_counties = counties_lat_long(gazetteer_path)
print("number of counties from Gazeeter : ", len(gazeeter_counties))
gazeeter_counties.head()

len of headers :  10
number of counties from Gazeeter :  3221


Unnamed: 0,state,county,fips,latitude,longitude
0,AL,Autauga County,1001,32.532237,-86.64644
1,AL,Baldwin County,1003,30.659218,-87.746067
2,AL,Barbour County,1005,31.870253,-85.405104
3,AL,Bibb County,1007,33.015893,-87.127148
4,AL,Blount County,1009,33.977358,-86.56644


In [126]:
# Column dtypes and non-null counts of the Gazetteer counties.
gazeeter_counties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   state      3221 non-null   object
 1   county     3221 non-null   object
 2   fips       3221 non-null   int64 
 3   latitude   3221 non-null   object
 4   longitude  3221 non-null   object
dtypes: int64(1), object(4)
memory usage: 125.9+ KB


# Map NY Times locations to Gazetteer locations

Check that all fips are unique in the Gazetteer

In [127]:
# Number of occurrences of each fips code in the Gazetteer.
gaz_counts = gazeeter_counties["fips"].value_counts()
# Codes appearing more than once; a length of 0 means every fips is unique.
duplicated_fips = gaz_counts[gaz_counts > 1]
len(duplicated_fips)

0

In [128]:
# Distinct (state, county, fips) combinations present in the NYT data.
location_columns = ["state", "county", "fips"]
nyt_location = nyt.loc[:, location_columns].drop_duplicates()
len(nyt_location)

3274

unique fips in New York Times data

In [129]:
# Number of distinct fips values; unique() keeps the missing value as one of the distinct entries.
len(nyt_location["fips"].unique())

3219

There are fewer unique fips than distinct locations, so some locations must share the same fips value.
Check that the only duplicated fips value is the missing value.

In [130]:
# Count the distinct locations per fips, keeping the missing fips as its own group (dropna=False),
# and show the fips shared by the most locations first.
nyt_location.groupby("fips", dropna = False).count().sort_values( ["state", "county"], ascending = False).head()

Unnamed: 0_level_0,state,county
fips,Unnamed: 1_level_1,Unnamed: 2_level_1
,56,56
1001.0,1,1
1003.0,1,1
1005.0,1,1
1007.0,1,1


Remove the locations with a missing fips so that the join is a one-to-one match

In [131]:
# Split the distinct NYT locations on whether a fips code is present.
# `fips` is a nullable Int64 column: a missing code is <NA>, never the empty string,
# so presence is tested with notna()/isna() rather than by comparing to ''.
nytimes_counties_withfips = nyt_location[nyt_location["fips"].notna()]
nytimes_counties_nofips = nyt_location[nyt_location["fips"].isna()]
f"NYT : total={len(nyt_location)} with fips= {len(nytimes_counties_withfips)}  no fips = {len(nytimes_counties_nofips)}"

'NYT : total=3274 with fips= 3218  no fips = 56'

In [132]:
# Attach the Gazetteer coordinates to the NYT locations through their fips code.
# validate="one_to_one" makes the merge fail if a fips is duplicated on either side.
nytimes_geo = nytimes_counties_withfips.merge(
    gazeeter_counties,
    how="inner",
    on="fips",
    validate="one_to_one",
)
len(nytimes_geo), len(nytimes_counties_withfips)

(3212, 3218)

## Process locations that do not map to gazetteer

### Fips in NYT that are not found in the Gazetteer

In [133]:
# fips codes present in the NYT data that did not survive the inner join with the Gazetteer.
matched_fips = set(nytimes_geo["fips"])
unknown_fips = set(nytimes_counties_withfips["fips"]) - matched_fips
has_unknown_fips = nytimes_counties_withfips["fips"].isin(unknown_fips)
nytimes_unknown_fips = nytimes_counties_withfips[has_unknown_fips]
nytimes_unknown_fips

Unnamed: 0,state,county,fips
37818,Virgin Islands,St. Croix,78010
37819,Virgin Islands,St. John,78020
37820,Virgin Islands,St. Thomas,78030
121367,Alaska,Valdez-Cordova Census Area,2261
333508,Northern Mariana Islands,Saipan,69110
333509,Northern Mariana Islands,Tinian,69120


### Locations in NYT that have no associated fips

In [18]:
# Locations without a fips code, ordered by state.
nytimes_counties_nofips.sort_values("state")

Unnamed: 0,state,county,fips
26170,Alaska,Unknown,
9521,Arizona,Unknown,
5223,Arkansas,Unknown,
7215,California,Unknown,
7246,Colorado,Unknown,
19893,Connecticut,Unknown,
46082,Delaware,Unknown,
7304,Florida,Unknown,
5385,Georgia,Unknown,
2422,Guam,Unknown,


In [134]:
# Gather every NYT location that could not be matched to the Gazetteer:
# those without a fips code and those whose fips code was not found there.
no_matching_fips = pd.concat([nytimes_counties_nofips, nytimes_unknown_fips], axis = 0)

There are 2 cases:
- the county is not known -> only the state is known
- the statistics are reported per city or metropolitan area instead of per county

## Only state is known : enter GPS coordinates directly

### Read gps coordinates for all states
GPS coordinates from https://www.latlong.net/category/states-236-14.html

In [101]:
# Load the GPS coordinates of the states; the unnamed first column holds the state name.
states_gps_path = os.path.join(project_path, "DATA", "US_states_GPS.csv")
states_gps = pd.read_csv(states_gps_path, sep=";").rename(columns={'Unnamed: 0': "state"})
# The state of Washington is listed as "Washington State" here; rename it so that
# it matches the name used in the state abbreviation dataset.
is_washington_state = states_gps["state"] == "Washington State"
states_gps.loc[is_washington_state, "state"] = "Washington"
print(states_gps.shape)
states_gps.head()

(50, 3)


Unnamed: 0,state,Latitude,Longitude
0,Wisconsin,44.5,-89.5
1,West Virginia,39.0,-80.5
2,Vermont,44.0,-72.699997
3,Texas,31.0,-100.0
4,South Dakota,44.5,-100.0


Map states to state abbreviation

In [91]:
# Load the mapping from full state name to its abbreviation.
states_abbr_path = os.path.join(project_path, "DATA", "US_states_abbr.csv")
states_abbr = pd.read_csv(states_abbr_path)
print(states_abbr.shape)
states_abbr.head()

(50, 2)


Unnamed: 0,US STATE,ABBREVIATION
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


Join the full state name with its abbreviation

In [102]:
# Add the abbreviation to each state's GPS coordinates by matching on the full state name.
states_gps_abbr = states_gps.merge(
    states_abbr,
    left_on="state",
    right_on="US STATE",
)
print(states_gps_abbr.shape)
states_gps_abbr.head()

(50, 5)


Unnamed: 0,state,Latitude,Longitude,US STATE,ABBREVIATION
0,Wisconsin,44.5,-89.5,Wisconsin,WI
1,West Virginia,39.0,-80.5,West Virginia,WV
2,Vermont,44.0,-72.699997,Vermont,VT
3,Texas,31.0,-100.0,Texas,TX
4,South Dakota,44.5,-100.0,South Dakota,SD


### Join GPS coordinates for locations with state info only

This will update only locations within US states, not locations in unincorporated territories.

In [147]:
# Locations for which only the state is known (county reported as "Unknown").
unknown_county_locations = no_matching_fips.loc[no_matching_fips["county"] == "Unknown"]
# Left join: keep exactly these locations and add the state coordinates where the
# state is found. An outer join would also add a row (with no county / fips) for
# every state that has no "Unknown" location.
no_county_gps = unknown_county_locations.merge(states_gps_abbr, on="state", how="left")
# Locations still lacking coordinates, i.e. whose state is absent from the states GPS table.
no_county_gps[ no_county_gps["Latitude"].isna() | no_county_gps["Longitude"].isna()]

Unnamed: 0,state,county,fips,Latitude,Longitude,US STATE,ABBREVIATION


Manually enter GPS coordinates for unincorporated territories

In [150]:
# Coordinates entered by hand, as (Latitude, Longitude), for the territories
# that are not part of the states GPS table.
territory_gps = {
    "Puerto Rico": (18.2223, -66.4303),
    "Virgin Islands": (18.34, -64.90),
    "Guam": (13.4440, 144.7671),
    # GPS coordinates of Saipan 15°11′N 145°45′E
    "Northern Mariana Islands": (15.16, 145.7),
}
for territory, (territory_latitude, territory_longitude) in territory_gps.items():
    # Only the "Unknown" county entry of the territory receives the coordinates.
    is_unknown_in_territory = (no_county_gps["county"] == "Unknown") & (no_county_gps["state"] == territory)
    no_county_gps.loc[is_unknown_in_territory, "Latitude"] = territory_latitude
    no_county_gps.loc[is_unknown_in_territory, "Longitude"] = territory_longitude



## Enter GPS coordinates for cities or metropolitan areas

* New York City: covers New York, Kings, Queens, Bronx and Richmond counties — 40.712740° N, 74.005974° W
* Kansas City (Missouri): the city can belong to Cass, Clay, Jackson and Platte counties — latitude 39.099724, longitude -94.578331
* Joplin (Missouri): city — 37.0842° N, 94.5133° W
* Puerto Rico

# Check importance of non matched locations

In [48]:
#nb of cases and death per location
# the data are cumulative, so the max of data for one location is the total number of cases/deaths for this locations
# Number of cases and deaths per location.
# The data are cumulative, so the maximum over one location is its total number of cases / deaths.
location_keys = ["state", "county", "fips"]
nb_covid_per_location = nyt.groupby(location_keys, dropna=False).max().reset_index()
total_cases = nb_covid_per_location["cases"].sum()
total_deaths = nb_covid_per_location["deaths"].sum()
print(f"total cases : {total_cases}; total deaths : {total_deaths}")
# Rank of each location (1 = highest count; ties share the smallest rank).
for metric in ("cases", "deaths"):
    metric_rank = nb_covid_per_location[metric].rank(method='min', ascending=False)
    nb_covid_per_location[f"rank_{metric}"] = metric_rank.astype('Int64')
# Share of the overall total carried by each location.
for metric, metric_total in (("cases", total_cases), ("deaths", total_deaths)):
    nb_covid_per_location[f"ratio_{metric}"] = nb_covid_per_location[metric] / metric_total
nb_covid_per_location.sort_values("deaths", ascending=False).head(20)

total cases : 26410616; total deaths : 445856


Unnamed: 0,state,county,fips,date,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
1887,New York,New York City,,2021-02-01,613480,27226,2,1,0.023229,0.061065
205,California,Los Angeles,6037.0,2021-02-01,1121349,16854,1,2,0.042458,0.037801
620,Illinois,Cook,17031.0,2021-02-01,454485,9433,4,3,0.017208,0.021157
102,Arizona,Maricopa,4013.0,2021-02-01,475261,7462,3,4,0.017995,0.016736
367,Florida,Miami-Dade,12086.0,2021-02-01,373422,4905,5,5,0.014139,0.011001
2743,Texas,Harris,48201.0,2021-02-01,319800,4264,6,6,0.012109,0.009564
1332,Michigan,Wayne,26163.0,2021-02-01,97048,3984,29,7,0.003675,0.008936
1776,Nevada,Clark,32003.0,2021-02-01,214780,3277,13,8,0.008132,0.00735
219,California,Riverside,6065.0,2021-02-01,275872,3171,7,9,0.010445,0.007112
1243,Massachusetts,Middlesex,25017.0,2021-02-01,102399,3134,26,10,0.003877,0.007029


Check the rank of locations not in gazetteer

In [50]:
# Attach the per-location totals and ranks to the locations that could not be
# matched to the Gazetteer (`no_matching_fips`), ordered by their rank for deaths.
no_gazeeter_fips_rank = pd.merge(nb_covid_per_location, no_matching_fips, on =["state", "county", "fips"])\
    .sort_values("rank_deaths")
no_gazeeter_fips_rank.head(20)

Unnamed: 0,state,county,fips,date,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
34,New York,New York City,,2021-02-01,613480,27226,2,1,0.023229,0.061065
45,Puerto Rico,Unknown,,2021-02-01,4321,1836,952,35,0.000164,0.004118
35,New York,Unknown,,2020-04-18,0,1170,3270,65,0.0,0.002624
16,Kansas,Unknown,,2021-02-01,0,730,3270,102,0.0,0.001637
26,Missouri,Kansas City,,2021-02-01,35700,440,138,181,0.001352,0.000987
13,Illinois,Unknown,,2021-02-01,12048,292,431,277,0.000456,0.000655
18,Louisiana,Unknown,,2021-02-01,6957,234,662,343,0.000263,0.000525
46,Rhode Island,Unknown,,2021-02-01,16867,209,306,378,0.000639,0.000469
49,Tennessee,Unknown,,2021-02-01,13775,154,374,497,0.000522,0.000345
20,Maryland,Unknown,,2021-02-01,34,135,3236,556,1e-06,0.000303


In [37]:
# Ranking of the unmatched locations of the Virgin Islands.
no_gazeeter_fips_rank[ no_gazeeter_fips_rank["state"] == "Virgin Islands"]

Unnamed: 0,state,county,fips,date,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
55,Virgin Islands,St. Thomas,78030.0,2021-02-01,1196,15,2133.0,2330.5,4.5e-05,3.4e-05
53,Virgin Islands,St. Croix,78010.0,2021-02-01,1021,8,2304.0,2671.5,3.9e-05,1.8e-05
56,Virgin Islands,Unknown,,2020-07-22,42,7,3230.0,2726.5,2e-06,1.6e-05
54,Virgin Islands,St. John,78020.0,2021-02-01,204,1,3096.5,3082.5,8e-06,2e-06


In [38]:
# Ranking of the unmatched locations of the Northern Mariana Islands.
no_gazeeter_fips_rank[ no_gazeeter_fips_rank["state"] == "Northern Mariana Islands"]

Unnamed: 0,state,county,fips,date,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
40,Northern Mariana Islands,Unknown,,2020-11-22,33,2,3238.0,3018.0,1.249498e-06,4e-06
38,Northern Mariana Islands,Saipan,69110.0,2021-02-01,129,2,3155.5,3018.0,4.8844e-06,4e-06
39,Northern Mariana Islands,Tinian,69120.0,2021-02-01,2,0,3264.5,3156.5,7.572712e-08,0.0


Manual processing of missing fips: select only the most significant locations (rank < 500 for cases or deaths)

In [56]:
# Keep the unmatched locations whose rank is below 500 for deaths or for cases.
no_gazeeter_fips_rank[ (no_gazeeter_fips_rank["rank_deaths"]<500) | (no_gazeeter_fips_rank["rank_cases"]<500)]

Unnamed: 0,state,county,fips,date,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
34,New York,New York City,,2021-02-01,613480,27226,2,1,0.023229,0.061065
45,Puerto Rico,Unknown,,2021-02-01,4321,1836,952,35,0.000164,0.004118
35,New York,Unknown,,2020-04-18,0,1170,3270,65,0.0,0.002624
16,Kansas,Unknown,,2021-02-01,0,730,3270,102,0.0,0.001637
26,Missouri,Kansas City,,2021-02-01,35700,440,138,181,0.001352,0.000987
13,Illinois,Unknown,,2021-02-01,12048,292,431,277,0.000456,0.000655
18,Louisiana,Unknown,,2021-02-01,6957,234,662,343,0.000263,0.000525
46,Rhode Island,Unknown,,2021-02-01,16867,209,306,378,0.000639,0.000469
49,Tennessee,Unknown,,2021-02-01,13775,154,374,497,0.000522,0.000345
21,Massachusetts,Unknown,,2021-02-01,27487,103,193,702,0.001041,0.000231


In [58]:
# Display the NYT locations that could not be matched to the Gazetteer.
no_matching_fips

Unnamed: 0,state,county,fips
416,New York,New York City,
418,Rhode Island,Unknown,
1511,New Jersey,Unknown,
1858,Puerto Rico,Unknown,
2267,Virgin Islands,Unknown,
...,...,...,...
37819,Virgin Islands,St. John,78020
37820,Virgin Islands,St. Thomas,78030
121367,Alaska,Valdez-Cordova Census Area,2261
333508,Northern Mariana Islands,Saipan,69110


In [59]:
# Display the NYT locations matched to the Gazetteer, with their coordinates.
nytimes_geo

Unnamed: 0,state_x,county_x,fips,state_y,county_y,latitude,longitude
0,Washington,Snohomish,53061,WA,Snohomish County,48.054913,-121.765038
1,Illinois,Cook,17031,IL,Cook County,41.894294,-87.645455
2,California,Orange,6059,CA,Orange County,33.675687,-117.777207
3,Arizona,Maricopa,4013,AZ,Maricopa County,33.345176,-112.49893
4,California,Los Angeles,6037,CA,Los Angeles County,34.196398,-118.261862
...,...,...,...,...,...,...,...
3207,Texas,King,48269,TX,King County,33.614268,-100.245345
3208,Alaska,Skagway Municipality,2230,AK,Skagway Municipality,59.560379,-135.338279
3209,Nevada,Esmeralda,32009,NV,Esmeralda County,37.778966,-117.632382
3210,Texas,Loving,48301,TX,Loving County,31.844936,-103.561229


Read gps coordinates for all states (from https://www.latlong.net/category/states-236-14.html)