In [4]:
import numpy as np
import pandas as pd
import os
import configparser

In [5]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [6]:
project_path = config["PATH"]["PROJECT"]
print(project_path)

/home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT


# Parsing functions

## US counties geographic info from Gazeeter

In [73]:
def counties_lat_long_(name):
    with open(name) as f :
        headers = f.readline().split()
        l_headers = len(headers)
        print("len of headers : ", l_headers )
        l_res = []
        for line in f :
            l_str = line.split()
            state = l_str[0]
            fips = l_str[1]
            latitude = l_str[-2]
            longitude = l_str[-1]
            l = len(l_str)
            n_words = l - l_headers + 1
            county = " ".join( l_str[3:3+n_words] )
            #d_res.setdefault(state, []).append( [ state, county, fips, latitude, longitude ])
            l_res.append( [state, county, int(fips), latitude, longitude ])
        return l_res


In [74]:
def counties_lat_long(name):
    l_gazeeter = counties_lat_long_(name)
    df_gazeeter = pd.DataFrame(l_gazeeter, columns = ["state", "county", "fips", "latitude", "longitude"])
    return df_gazeeter

# Read data

In [7]:
data_root = os.path.join(project_path, 'DATA/')

In [30]:
nyt = pd.read_csv( os.path.join(data_root, "us-counties.txt") )
nyt = nyt.astype( {"fips" : 'Int64', "deaths": 'Int64'})
nyt.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [71]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988671 entries, 0 to 988670
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    988671 non-null  object
 1   county  988671 non-null  object
 2   state   988671 non-null  object
 3   fips    979486 non-null  Int64 
 4   cases   988671 non-null  int64 
 5   deaths  967414 non-null  Int64 
dtypes: Int64(2), int64(1), object(3)
memory usage: 47.1+ MB


In [75]:
gazeeter_counties= counties_lat_long( os.path.join(data_root,"2020_Gaz_counties_national.txt"))
print("number of counties from Gazeeter : ", len(gazeeter_counties))
gazeeter_counties.head()

len of headers :  10
number of counties from Gazeeter :  3221


Unnamed: 0,state,county,fips,latitude,longitude
0,AL,Autauga County,1001,32.532237,-86.64644
1,AL,Baldwin County,1003,30.659218,-87.746067
2,AL,Barbour County,1005,31.870253,-85.405104
3,AL,Bibb County,1007,33.015893,-87.127148
4,AL,Blount County,1009,33.977358,-86.56644


In [76]:
gazeeter_counties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   state      3221 non-null   object
 1   county     3221 non-null   object
 2   fips       3221 non-null   int64 
 3   latitude   3221 non-null   object
 4   longitude  3221 non-null   object
dtypes: int64(1), object(4)
memory usage: 125.9+ KB


# Map NY Times location to Gazeeter locations

Check that all fips are unique in Gazeeter

In [25]:
gaz_counts = gazeeter_counties["fips"].value_counts()
len( gaz_counts[gaz_counts>1]) # 0 : all fips are unique !

0

In [42]:
nyt_location = nyt[ ["state", "county", "fips"] ].drop_duplicates()
len(nyt_location)

3274

unique fips in New York Times data

In [51]:
len(nyt_location["fips"].unique())

3219

There are fewer unique fips than total fips => some must be duplicates. 
Check that duplicate fips are missing values

In [65]:
nyt_location.groupby("fips", dropna = False).count().sort_values( ["state", "county"], ascending = False).head()

Unnamed: 0_level_0,state,county
fips,Unnamed: 1_level_1,Unnamed: 2_level_1
,56,56
1001.0,1,1
1003.0,1,1
1005.0,1,1
1007.0,1,1


Remove empty fips so that we can join with a one-to-one match

In [90]:
nytimes_counties_withfips = nyt_location[ nyt_location["fips"] != '']
nytimes_counties_nofips = nyt_location[nyt_location["fips"].isna() ]
f"NYT : total={len(nyt_location)} with fips= {len(nytimes_counties_withfips)}  no fips = {len(nytimes_counties_nofips)}"

'NYT : total=3274 with fips= 3218  no fips = 56'

In [77]:
nytimes_geo = pd.merge(nytimes_counties_withfips, gazeeter_counties, how = "inner", on = "fips", validate = "one_to_one")
len(nytimes_geo), len(nytimes_counties_withfips)

(3212, 3218)

Fips in NYT that are not found in gazeeter

In [97]:
unknown_fips = set(nytimes_counties_withfips["fips"]) - set(nytimes_geo["fips"])
nytimes_unknown_fips = nytimes_counties_withfips[ nytimes_counties_withfips["fips"].isin(unknown_fips)]
nytimes_unknown_fips

Unnamed: 0,state,county,fips
37818,Virgin Islands,St. Croix,78010
37819,Virgin Islands,St. John,78020
37820,Virgin Islands,St. Thomas,78030
121367,Alaska,Valdez-Cordova Census Area,2261
333508,Northern Mariana Islands,Saipan,69110
333509,Northern Mariana Islands,Tinian,69120


Locations in NYT that do not have any fips associated with

In [96]:
nytimes_counties_nofips.sort_values("state")

Unnamed: 0,state,county,fips
26170,Alaska,Unknown,
9521,Arizona,Unknown,
5223,Arkansas,Unknown,
7215,California,Unknown,
7246,Colorado,Unknown,
19893,Connecticut,Unknown,
46082,Delaware,Unknown,
7304,Florida,Unknown,
5385,Georgia,Unknown,
2422,Guam,Unknown,


New York : New York, Kings, Queens, Bronx and Richmond  :40.712740°N 74.005974°W
Kansas City (Missouri) : Kansas City town, can belong to Cass, Clay, Jackson and Platte county :  39.099724, and the longitude is -94.578331
Joplin (Missouri) : city :37.0842° N, 94.5133° W
Puerto Rico

In [128]:
#nb of cases and death per location
nb_covid_per_location = nyt.groupby(["state", "county", "fips"], dropna=False).sum().reset_index()
total_cases = nb_covid_per_location["cases"].sum()
total_deaths = nb_covid_per_location["deaths"].sum()
# rank of location for number of deaths
nb_covid_per_location["rank_cases"] = nb_covid_per_location["cases"].rank(ascending = False)
nb_covid_per_location["rank_deaths"] = nb_covid_per_location["deaths"].rank(ascending = False)
nb_covid_per_location["ratio_cases"] = nb_covid_per_location["cases"]/total_cases
nb_covid_per_location["ratio_deaths"] = nb_covid_per_location["deaths"]/total_deaths
nb_covid_per_location.sort_values("deaths", ascending = False).head(20)

Unnamed: 0,state,county,fips,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
1887,New York,New York City,,82848124,6807794,2.0,1.0,0.033358,0.115245
205,California,Los Angeles,6037.0,91472090,1769205,1.0,2.0,0.03683,0.02995
620,Illinois,Cook,17031.0,53123048,1554822,3.0,3.0,0.02139,0.026321
1332,Michigan,Wayne,26163.0,12389253,855651,21.0,4.0,0.004988,0.014485
102,Arizona,Maricopa,4013.0,43315757,812281,5.0,5.0,0.017441,0.013751
367,Florida,Miami-Dade,12086.0,43932399,727032,4.0,6.0,0.017689,0.012307
1886,New York,Nassau,36059.0,16260737,702205,16.0,7.0,0.006547,0.011887
1243,Massachusetts,Middlesex,25017.0,10792692,601823,27.0,8.0,0.004346,0.010188
1809,New Jersey,Essex,34013.0,7958006,593908,45.0,9.0,0.003204,0.010054
1906,New York,Suffolk,36103.0,16621905,591168,15.0,10.0,0.006693,0.010008


Check the rank of locations not in gazetteer

In [131]:
no_gazeeter_fips = pd.concat([nytimes_counties_nofips, nytimes_unknown_fips], axis = 0)
#no_gazeeter_fips

In [132]:
no_gazeeter_fips_rank = pd.merge(nb_covid_per_location, no_gazeeter_fips, on =["state", "county", "fips"])\
    .sort_values("rank_deaths")
no_gazeeter_fips_rank

Unnamed: 0,state,county,fips,cases,deaths,rank_cases,rank_deaths,ratio_cases,ratio_deaths
34,New York,New York City,,82848124,6807794,2.0,1.0,3.335808e-02,0.115245
45,Puerto Rico,Unknown,,507300,188560,752.0,56.0,2.042599e-04,0.003192
26,Missouri,Kansas City,,3650832,44136,134.0,206.0,1.469976e-03,0.000747
13,Illinois,Unknown,,310908,33265,1102.0,261.0,1.251844e-04,0.000563
16,Kansas,Unknown,,0,30776,3272.0,284.0,0.000000e+00,0.000521
...,...,...,...,...,...,...,...,...,...
12,Idaho,Unknown,,1219,0,3243.0,3195.5,4.908197e-07,0.000000
27,Missouri,Unknown,,425,0,3254.0,3195.5,1.711226e-07,0.000000
36,North Carolina,Unknown,,998,0,3248.0,3195.5,4.018360e-07,0.000000
43,Oregon,Unknown,,1,0,3269.0,3195.5,4.026413e-10,0.000000
