In [5]:
import pandas as pd

In [6]:
pop_df = pd.read_parquet("../00_data/population/population.parquet")
pop_df = pop_df[(pop_df["Year"] > 2002) & (pop_df["Year"] < 2016)]
pop_df.head()

Unnamed: 0,FIPS_CODE,REGION,DIVISION,ST_NAME,CTY_NAME,Year,Population
9309,1001,3,6,Alabama,Autauga County,2003,46800
9310,1003,3,6,Alabama,Baldwin County,2003,151509
9311,1005,3,6,Alabama,Barbour County,2003,28594
9312,1007,3,6,Alabama,Bibb County,2003,21399
9313,1009,3,6,Alabama,Blount County,2003,53457


In [8]:
mort_df = pd.read_csv("../00_data/all_death_causes_2003_2015.csv")
# subset to appropriate states
mort_df = mort_df[mort_df["State"] != "AK"]
mort_df = mort_df[mort_df["State"] != "CT"]
mort_df.head()

Unnamed: 0,County,State,FIPS_CODE,Year,Deaths
0,Baldwin County,AL,1003,2003,10.0
1,Jefferson County,AL,1073,2003,37.0
2,Jefferson County,AL,1073,2003,32.0
3,Mobile County,AL,1097,2003,26.0
6,Cochise County,AZ,4003,2003,11.0


In [10]:
pop_df.dtypes

FIPS_CODE     object
REGION         int64
DIVISION       int64
ST_NAME       object
CTY_NAME      object
Year           int64
Population     int64
dtype: object

In [12]:
# make strings so can appropriately be used as keys
mort_df["FIPS_CODE"] = mort_df["FIPS_CODE"].astype(str)
pop_df["FIPS_CODE"] = pop_df["FIPS_CODE"].astype(str)

In [14]:
merged_df = pd.merge(
    pop_df,
    mort_df,
    how="left",
    on=["FIPS_CODE", "Year"],
    indicator=True,
)

merged_df.head()

Unnamed: 0,FIPS_CODE,REGION,DIVISION,ST_NAME,CTY_NAME,Year,Population,County,State,Deaths,_merge
0,1001,3,6,Alabama,Autauga County,2003,46800,,,,left_only
1,1003,3,6,Alabama,Baldwin County,2003,151509,,,,left_only
2,1005,3,6,Alabama,Barbour County,2003,28594,,,,left_only
3,1007,3,6,Alabama,Bibb County,2003,21399,,,,left_only
4,1009,3,6,Alabama,Blount County,2003,53457,,,,left_only


In [15]:
merged_df["_merge"].value_counts()

_merge
left_only     30926
both           7032
right_only        0
Name: count, dtype: int64

In [18]:
bad_merge = merged_df[merged_df["_merge"] == "left_only"]
bad_merge.head()

Unnamed: 0,FIPS_CODE,REGION,DIVISION,ST_NAME,CTY_NAME,Year,Population,County,State,Deaths,_merge
0,1001,3,6,Alabama,Autauga County,2003,46800,,,,left_only
1,1003,3,6,Alabama,Baldwin County,2003,151509,,,,left_only
2,1005,3,6,Alabama,Barbour County,2003,28594,,,,left_only
3,1007,3,6,Alabama,Bibb County,2003,21399,,,,left_only
4,1009,3,6,Alabama,Blount County,2003,53457,,,,left_only


seems like a consistent issue across states and years

In [22]:
# bad_merge["Year"].value_counts()
# bad_merge["ST_NAME"].value_counts()
bad_merge["CTY_NAME"].value_counts()

CTY_NAME
Washington County    283
Lincoln County       267
Franklin County      249
Jefferson County     238
Jackson County       217
                    ... 
Licking County         1
LaSalle County         1
Cabarrus County        1
Cowlitz County         1
Okaloosa County        1
Name: count, Length: 1704, dtype: int64

In [23]:
pop_df.head()

Unnamed: 0,FIPS_CODE,REGION,DIVISION,ST_NAME,CTY_NAME,Year,Population
9309,1001,3,6,Alabama,Autauga County,2003,46800
9310,1003,3,6,Alabama,Baldwin County,2003,151509
9311,1005,3,6,Alabama,Barbour County,2003,28594
9312,1007,3,6,Alabama,Bibb County,2003,21399
9313,1009,3,6,Alabama,Blount County,2003,53457


In [40]:
washington_pop_df = pop_df[(pop_df["ST_NAME"] == "Washington")]
washington_mort_df = mort_df[mort_df["State"] == "WA"]

In [41]:
counties_df1 = set(washington_pop_df["CTY_NAME"])
counties_df2 = set(washington_mort_df["County"])

# Find counties only in df1
only_in_df1 = counties_df1 - counties_df2

# Find counties only in df2
only_in_df2 = counties_df2 - counties_df1

# Counties present in either DataFrame but not both
not_in_both = only_in_df1.union(only_in_df2)

# Output the results
print("Counties only in Population:")
print(len(only_in_df1))
for county in only_in_df1:
    print(county)

print("\nCounties only in Mortality:")
print(len(only_in_df2))
for county in only_in_df2:
    print(county)

# print("\nCounties not in both:")
# for county in not_in_both:
#     print(county)

Counties only in Population:
18
Garfield County
Franklin County
Wahkiakum County
Whitman County
Jefferson County
San Juan County
Lincoln County
Pacific County
Asotin County
Pend Oreille County
Okanogan County
Douglas County
Columbia County
Skamania County
Ferry County
Kittitas County
Klickitat County
Adams County

Counties only in Mortality:
0


In [42]:
florida_pop_df = pop_df[(pop_df["ST_NAME"] == "Florida")]
florida_mort_df = mort_df[mort_df["State"] == "FL"]

In [43]:
counties_df1 = set(florida_pop_df["CTY_NAME"])
counties_df2 = set(florida_mort_df["County"])

# Find counties only in df1
only_in_df1 = counties_df1 - counties_df2

# Find counties only in df2
only_in_df2 = counties_df2 - counties_df1

# Counties present in either DataFrame but not both
not_in_both = only_in_df1.union(only_in_df2)

# Output the results
print("Counties only in Population:")
print(len(only_in_df1))
for county in only_in_df1:
    print(county)

print("\nCounties only in Mortality:")
print(len(only_in_df2))
for county in only_in_df2:
    print(county)

# print("\nCounties not in both:")
# for county in not_in_both:
#     print(county)

Counties only in Population:
24
Glades County
Lafayette County
Hardee County
Union County
Holmes County
Suwannee County
Washington County
Dixie County
Madison County
Jackson County
Jefferson County
Hendry County
Bradford County
Gadsden County
Gilchrist County
Calhoun County
Wakulla County
Baker County
Franklin County
Taylor County
DeSoto County
Liberty County
Hamilton County
Gulf County

Counties only in Mortality:
0
