## Combining population and mortality data

First, I read in the cleaned mortality and population files.

In [73]:
import pandas as pd
import numpy as np

Read in data for both population and mortality

In [74]:
pop = pd.read_parquet("../20_intermediate_files/population_clean.parquet")
mortality = pd.read_parquet("../20_intermediate_files/df_clean_mortality.parquet")


Mortality files states are in abbreviation format, so use below to convert to names (after stripping extra space from state abbreviations column).

In [75]:
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))
mortality["State"] = mortality["State"].str.strip()
mortality["State"] = mortality["State"].map(abbrev_to_us_state)


After merging initially, I checked for rows that did not have population values. 

I noticed that there were different names for some counties, which I changed in the mortality dataset.

"La Porte" county is "LaPorte" in the population dataset, so I changed it in the mortality. 

Also, Dona Ana county is Doña Ana County.

Continually, in the mortality McKean county is "Mc Kean", which I fixed.

In [76]:
mortality.loc[:, "County"][
    mortality.loc[:, "County"] == "La Porte County"
] = "LaPorte County"
mortality.loc[:, "County"][
    mortality.loc[:, "County"] == "Dona Ana County"
] = "Doña Ana County"
mortality.loc[:, "County"][
    mortality.loc[:, "County"] == "Mc Kean County"
] = "McKean County"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mortality.loc[:, "County"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mortality.loc[:, "County"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mortality.loc[:, "County"][


Renamed columns in mortality dataset to match population dataset.

In [77]:
mortality.rename(
    {"State": "state", "County": "county", "Year": "year"}, axis=1, inplace=True
)


Conducted outer merge.

In [78]:
mortality_pop = mortality.merge(
    pop, how="outer", on=["state", "county", "year"], sort=True, validate="1:1"
)


Checked for any missing values in the population column.
Yay, there are none!

Now, looked at the counties that did not have observations for all 13 years.

Because none of them had observations for mortality, I decided to drop those states (AK, LA, SD).

I dropped them becasue none are our policy change, and the changes in borders could screw with population.

In [79]:
temp = [
    "Petersburg Borough",
    "Wade Hampton Census Area",
    "La Salle Parish",
    "LaSalle Parish",
    "Oglala Lakota County",
    "Shannon County",
    "Bedford city",
]
mortality_pop[mortality_pop["county"].isin(temp)]


Unnamed: 0,state,county,year,Deaths,population
1124,Alaska,Petersburg Borough,2010,,3219
1125,Alaska,Petersburg Borough,2011,,3255
1126,Alaska,Petersburg Borough,2012,,3277
1127,Alaska,Petersburg Borough,2013,,3291
1128,Alaska,Petersburg Borough,2014,,3261
1129,Alaska,Petersburg Borough,2015,,3253
1202,Alaska,Wade Hampton Census Area,2003,,7201
1203,Alaska,Wade Hampton Census Area,2004,,7330
1204,Alaska,Wade Hampton Census Area,2005,,7313
1205,Alaska,Wade Hampton Census Area,2006,,7292


In [80]:
drop_states = ["Alaska, Louisiana", "South Dakota", "Virginia"]
mortality_pop = mortality_pop[~mortality_pop["state"].isin(["Alaska", "Louisiana", "South Dakota", "Virginia"])]


Filled NA values with 0, because these indicate counties with deaths <10, and that's important to include.

Created column with opioid overdose deaths per 100,000 residents.

In [81]:
mortality_pop["deaths_per_100k"] = (
    100000 * mortality_pop["Deaths"] / mortality_pop["population"]
)


In [82]:
mortality_pop["replaced"] = np.nan


def replace_values(state, year):
    mortality_pop.loc[
        (mortality_pop.loc[:, "state"] == state)
        & (mortality_pop.loc[:, "Deaths"].isnull())
        & (mortality_pop.loc[:, "year"] == year),
        "deaths_per_100k",
    ] = mortality_pop.loc[(mortality_pop["state"] == state) & (mortality_pop["year"] == year), "deaths_per_100k"].mean()

years = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
for i in us_state_to_abbrev.keys():
    for j in years:
        replace_values(i, j)


In [83]:
mortality_pop.loc[mortality_pop["Deaths"].isnull(), "Deaths"] = (
    mortality_pop.loc[mortality_pop["Deaths"].isnull(), "deaths_per_100k"]
    * mortality_pop.loc[mortality_pop["Deaths"].isnull(), "population"]
    / 1000000
)


In [84]:
mortality_pop.shape

(37050, 7)

In [85]:
null_states = list(
    mortality_pop[mortality_pop["Deaths"].isnull()].state.value_counts().index
)


def replace_values2(state):
    mortality_pop.loc[
        (mortality_pop.loc[:, "state"] == state)
        & (mortality_pop.loc[:, "Deaths"].isnull()),
        "deaths_per_100k",
    ] = mortality_pop.loc[
        (mortality_pop["state"] == state),
        "deaths_per_100k",
    ].mean()

for i in null_states:
    replace_values2(i)

mortality_pop.loc[mortality_pop["Deaths"].isnull(), "Deaths"] = (
    mortality_pop.loc[mortality_pop["Deaths"].isnull(), "deaths_per_100k"]
    * mortality_pop.loc[mortality_pop["Deaths"].isnull(), "population"]
    / 1000000
)


In [86]:
mortality_pop["Deaths"].describe()

count    37050.000000
mean        10.403281
std         39.825859
min          0.000454
25%          0.140965
50%          0.374510
75%          1.166445
max        862.000000
Name: Deaths, dtype: float64

In [91]:
mortality_pop["deaths_per_100k"].describe()

count    37050.000000
mean        14.170513
std          8.041547
min          0.811790
25%          9.432771
50%         12.330625
75%         16.533004
max        126.552242
Name: deaths_per_100k, dtype: float64

Conducted asserts to make sure data is in reasonable bounds.

population greater than 0 and less than 10.2 mil (most populous county in the time period)

mortality rate is non-negative and less than everyone in the county dying

same number of observations for each year

In [96]:
assert (
    (mortality_pop["population"] >= 0) & (mortality_pop["population"] <= 10200000)
).all()
assert (
    (mortality_pop["deaths_per_100k"]) > 0).all()
assert (mortality_pop.groupby(["year"])["county"].count() == 2850).all()


output to parquet.

In [97]:
mortality_pop.to_parquet(
    "../20_intermediate_files/moratality_population.parquet", engine="fastparquet"
)
