# Combining Population and Shipment Data

First, I read in the data.

In [24]:
import pandas as pd

In [25]:
# Load the two intermediate inputs: annual opioid shipment totals and the
# cleaned population table.
opioid = pd.read_parquet("../20_intermediate_files/opioid_annual_total.parquet")
pop = pd.read_parquet("../20_intermediate_files/population_clean.parquet")

In [26]:
# Full name -> two-letter postal code for the 50 states, the District of
# Columbia, and the U.S. territories listed below.
us_state_to_abbrev = {
    # States
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # Federal district and territories
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Inverse lookup: two-letter postal code -> full name.
abbrev_to_us_state = {
    abbrev: full_name for full_name, abbrev in us_state_to_abbrev.items()
}


Here, I converted state abbreviations in the shipment data to the full state name.

After checking for null values, I found that Palau is included in this data, so I dropped those values.

In [27]:
# Swap each two-letter code for the full state name; codes missing from the
# lookup come back as NaN.
opioid["BUYER_STATE"] = opioid["BUYER_STATE"].map(abbrev_to_us_state)

# Display the rows whose code could not be translated.
opioid.loc[opioid["BUYER_STATE"].isna()]


Unnamed: 0,TRANSACTION_YEAR,BUYER_COUNTY,BUYER_STATE,CALC_BASE_WT_IN_GM
2151,2006-01-01,PALAU,,52.9725
5241,2007-01-01,PALAU,,88.237045
8319,2008-01-01,PALAU,,22.702499
11394,2009-01-01,PALAU,,41.62125


In [28]:
# Keep only rows whose state code was translated (this removes the Palau rows
# displayed above). `.copy()` makes the result an independent frame, so the
# column assignments in later cells do not act on a slice of the original and
# do not raise SettingWithCopyWarning.
opioid = opioid[opioid["BUYER_STATE"].notna()].copy()


Next, I extracted the year as an integer, renamed the columns to match the population data for merging, and sorted the rows for easier comparison.

In [29]:
# Build the merge-ready shipment table without mutating `opioid` in place:
#   1. derive the calendar year from the TRANSACTION_YEAR timestamp,
#   2. rename the buyer/weight columns to the names used as merge keys,
#   3. keep only the four columns needed downstream (this also discards
#      TRANSACTION_YEAR, so no separate drop is required),
#   4. sort by state, county, year for easier comparison.
opioid = (
    opioid.assign(year=opioid["TRANSACTION_YEAR"].dt.year)
    .rename(
        columns={
            "BUYER_COUNTY": "county",
            "BUYER_STATE": "state",
            "CALC_BASE_WT_IN_GM": "shipment",
        }
    )
    .loc[:, ["state", "county", "year", "shipment"]]
    .sort_values(["state", "county", "year"])
)


Here, I do all the data cleaning necessary to make sure that the two sets merge appropriately on county values.

The two datasets often refer to counties differently, with extra spaces, or little differences like "saint" vs "st.".

The general strategy is to make everything lowercase and delete all spaces.

There are also specific changes that had to be made.

In [30]:
# Normalise county names in both frames so they can be used as merge keys.
#
# The regex patterns are raw strings: "\." and "\w" inside ordinary string
# literals are invalid escape sequences, which Python warns about.
# Literal replacements pass regex=False explicitly so that the result does not
# depend on the pandas version's default for `regex`.
saint_prefix_pattern = r"^[Ss][Tt][Ee]?(\.)? "  # leading "St ", "St. ", "Ste ", "Ste. "
punctuation_pattern = r"[^\w\s]"  # anything that is not a word character or whitespace

# Shipment data. The order of the steps matters: the prefix and "SAINTE"
# rewrites run before lower-casing, and spaces are removed before punctuation.
opioid["county"] = (
    opioid["county"]
    .str.replace(saint_prefix_pattern, "saint", regex=True)
    .str.replace("SAINTE", "saint", regex=False)
    .str.lower()
    .str.replace(" ", "", regex=False)
    .str.replace(punctuation_pattern, "", regex=True)
)

# Population data: same general strategy, plus removal of the " county" and
# "parish" suffixes and three specific "...city" names that are spelled
# without "city" after this step.
pop["county"] = (
    pop["county"]
    .str.replace(saint_prefix_pattern, "saint", regex=True)
    .str.replace("ñ", "n", regex=False)
    .str.lower()
    .str.replace(" county", "", regex=False)
    .str.replace(" ", "", regex=False)
    .str.replace(punctuation_pattern, "", regex=True)
    .str.replace("parish", "", regex=False)
    .str.replace("bristolcity", "bristol", regex=False)
    .str.replace("radfordcity", "radford", regex=False)
    .str.replace("salemcity", "salem", regex=False)
)


Here, I merge the two datasets and then remove Alaska, per Nick's instructions.

I also remove U.S. territories to limit this analysis to States and D.C.

In [31]:
# Outer-join shipments with population on state/county/year. validate="1:1"
# makes pandas raise if the key combination is duplicated on either side.
opioid_pop = pd.merge(
    opioid,
    pop,
    how="outer",
    on=["state", "county", "year"],
    sort=True,
    validate="1:1",
)

# Exclude Alaska and the listed U.S. territories.
opioid_pop = opioid_pop[
    ~opioid_pop["state"].isin(
        [
            "Alaska",
            "Guam",
            "Northern Mariana Islands",
            "Puerto Rico",
            "U.S. Virgin Islands",
        ]
    )
]


Here, I check to make sure there are no null values for population.

In [32]:
# Display any merged rows that have no population value.
opioid_pop.loc[opioid_pop["population"].isna()]


Unnamed: 0,state,county,year,shipment,population


Limit the data to relevant years.

In [33]:
# Keep observations from 2006 through 2012 (both endpoints included).
opioid_pop = opioid_pop[opioid_pop["year"].between(2006, 2012)]

Check for number of missing values. There are 914, and they are spread fairly evenly across states.

In [34]:
# Display the county-years that have a population but no shipment record.
opioid_pop.loc[opioid_pop["shipment"].isna()]


Unnamed: 0,state,county,year,shipment,population
1733,Arkansas,cleveland,2009,,8649.0
1734,Arkansas,cleveland,2010,,8677.0
1735,Arkansas,cleveland,2011,,8674.0
1736,Arkansas,cleveland,2012,,8609.0
2563,California,alpine,2007,,1274.0
...,...,...,...,...,...
40825,Wisconsin,menominee,2008,,4214.0
40826,Wisconsin,menominee,2009,,4191.0
40827,Wisconsin,menominee,2010,,4269.0
40828,Wisconsin,menominee,2011,,4372.0


In [35]:
# Count, per state, the county-years with a missing shipment value.
(
    opioid_pop.loc[opioid_pop["shipment"].isna(), ["state", "county", "year"]]
    .groupby(["state"])
    .count()
)

Unnamed: 0_level_0,county,year
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Arkansas,4,4
California,1,1
Colorado,38,38
Florida,3,3
Georgia,50,50
Hawaii,7,7
Idaho,24,24
Illinois,5,5
Indiana,7,7
Kansas,35,35


Filled NAs with 0s, assuming that's what they are.

Created column for opioid shipments per 1,000 people.

In [36]:
# Treat a missing shipment as zero shipments (the assumption stated above).
# Only the "shipment" column is filled: zero-filling every column would turn a
# missing population into 0 and put a zero in the denominator of the rate.
# Both steps return new frames instead of mutating a filtered slice in place.
opioid_pop = opioid_pop.fillna({"shipment": 0}).assign(
    # Shipments per 1,000 people.
    shipment_per_1k=lambda df: 1000 * df["shipment"] / df["population"]
)


Checked for counties that changed names between census years.

In this case, Louisiana's two counties were merged, because of the string manipulation done. The population seems fine so it's okay to keep.

However, I will still drop South Dakota and Virginia.



In [37]:
# Normalised county names to inspect for name changes between census years.
temp = ["lasalle", "oglalalakota", "shannon", "bedfordcity"]

# Display every row whose county is on the list.
opioid_pop.loc[opioid_pop["county"].isin(temp)]


Unnamed: 0,state,county,year,shipment,population,shipment_per_1k
8510,Illinois,lasalle,2006,17582.316406,113139.0,155.404555
8511,Illinois,lasalle,2007,18146.105469,113794.0,159.464524
8512,Illinois,lasalle,2008,20522.978516,114171.0,179.756488
8513,Illinois,lasalle,2009,21063.107422,113999.0,184.765726
8514,Illinois,lasalle,2010,21869.603516,113823.0,192.136949
8515,Illinois,lasalle,2011,23316.505859,113533.0,205.372059
8516,Illinois,lasalle,2012,24882.023438,112918.0,220.354806
14984,Louisiana,lasalle,2006,3062.153809,14519.0,210.906657
14985,Louisiana,lasalle,2007,3859.90625,14570.0,264.9215
14986,Louisiana,lasalle,2008,4231.744141,14667.0,288.521443


In [38]:
# States removed from the analysis entirely.
drop = ["South Dakota", "Virginia"]
opioid_pop = opioid_pop.loc[~opioid_pop["state"].isin(drop)]

Next, I run assertions to make sure the data is reasonable:

population greater than 0 and less than 10 mil (most populous county in the time period)

Shipment rate is non-negative

same number of observations for each year


In [39]:
# Population must be strictly positive, because it is the denominator of
# shipment_per_1k, and at most 10 million.
assert (
    (opioid_pop["population"] > 0) & (opioid_pop["population"] <= 10_000_000)
).all(), "population outside the range (0, 10,000,000]"

# The shipment rate can never be negative.
assert (opioid_pop["shipment_per_1k"] >= 0).all(), "negative shipment_per_1k"

# Every year must contribute the same number of county observations.
assert (
    opioid_pop.groupby(["year"])["county"].count() == 2914
).all(), "expected 2914 county observations in every year"


In [40]:
# Save the merged shipment/population table for later steps.
opioid_pop.to_parquet(
    path="../20_intermediate_files/opioid_population.parquet",
    engine="fastparquet",
)
