In [1]:
import pandas as pd


In [2]:
# Read the two intermediate parquet inputs in one pass: the cleaned population
# table first, then the monthly opioid shipment totals.
pop, opioid = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../20_intermediate_files/population_clean.parquet",
        "../20_intermediate_files/opioid_monthly_total.parquet",
    )
)


In [3]:
# Full name -> two-letter postal code. Used below (in inverted form) to expand
# the abbreviations stored in the shipment table's BUYER_STATE column.
us_state_to_abbrev = {
    # --- the fifty states ---
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # --- federal district and territories ---
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Inverse lookup: postal code -> full name. Every code above is unique, so the
# inversion loses nothing.
abbrev_to_us_state = {
    abbrev: full_name for full_name, abbrev in us_state_to_abbrev.items()
}


In [4]:
# Expand two-letter buyer-state codes to full state names; codes missing from
# the lookup become NaN. Not idempotent: re-running this cell maps the already
# expanded names to NaN as well.
opioid = opioid.assign(BUYER_STATE=opioid["BUYER_STATE"].map(abbrev_to_us_state))


In [5]:
# Discard shipment rows whose buyer state is null (e.g. codes that did not map
# to a name in the lookup).
opioid = opioid.dropna(subset=["BUYER_STATE"])


In [6]:
# Build the DatetimeIndex once and pull both calendar components from it, so
# they can serve as integer merge keys later on.
transaction_dates = pd.DatetimeIndex(opioid["TRANSACTION_MONTH"])
opioid["year"] = transaction_dates.year.astype(int)
opioid["month"] = transaction_dates.month.astype(int)


In [7]:
# Switch the shipment table to the short lower-case column names used for the
# merge with the population table.
opioid = opioid.rename(
    columns={
        "TRANSACTION_MONTH": "date",
        "BUYER_COUNTY": "county",
        "BUYER_STATE": "state",
        "shipment_quantity": "shipment",
    }
)

In [8]:
# Keep only the columns needed downstream, in a fixed order, and sort the rows
# by state, then county, then date.
ordered_columns = ["date", "year", "month", "state", "county", "shipment"]
opioid = opioid.loc[:, ordered_columns].sort_values(by=["state", "county", "date"])

In [9]:
# Normalise county names in both tables so they can be used as merge keys.
#
# The regex patterns are raw strings: in a plain string literal "\." and "\w"
# are invalid escape sequences, which recent Python versions warn about.
# Every str.replace call also states `regex=` explicitly, because the pandas
# default for that argument changed in 2.0. The literal substitutions contain
# no regex metacharacters, so the result is the same under either default.

# A leading "St ", "St. ", "Ste " or "Ste. " in any letter case.
SAINT_PREFIX = r"^[Ss][Tt][Ee]?(\.)? "
# Any character that is neither a word character nor whitespace (punctuation).
NON_WORD = r"[^\w\s]"

# The order of the steps matters (e.g. " county" must be removed before spaces
# are stripped), so each chain keeps the original sequence for its frame.
opioid["county"] = (
    opioid["county"]
    .str.replace(SAINT_PREFIX, "saint", regex=True)
    .str.replace("SAINTE", "saint", regex=False)
    .str.lower()
    .str.replace(" ", "", regex=False)
    .str.replace(NON_WORD, "", regex=True)
)

pop["county"] = (
    pop["county"]
    .str.replace(SAINT_PREFIX, "saint", regex=True)
    .str.replace("ñ", "n", regex=False)
    .str.lower()
    .str.replace(" county", "", regex=False)
    .str.replace(" ", "", regex=False)
    .str.replace(NON_WORD, "", regex=True)
    .str.replace("parish", "", regex=False)
    # Population-side names that carry a "city" suffix; strip it for these
    # three so they take the shorter form (presumably how the shipment table
    # spells them -- verify against the data).
    .str.replace("bristolcity", "bristol", regex=False)
    .str.replace("radfordcity", "radford", regex=False)
    .str.replace("salemcity", "salem", regex=False)
)

In [10]:
# Repeat the population table once per calendar month (1-12), tagging each copy
# with its month number, so it can be joined to the monthly shipment data on
# (state, county, year, month).
#
# All twelve copies are concatenated in a single call. Growing the frame inside
# a loop re-copies the accumulated rows on every iteration, and seeding the
# concat with an empty frame is deprecated in recent pandas and can affect the
# resulting column dtypes. As before, the original `pop` index is repeated, not
# reset.
final_pop = pd.concat([pop.assign(month=month) for month in range(1, 13)])


In [11]:
# Outer-join shipments with the month-expanded population table, so that rows
# present on only one side are kept (with nulls in the other side's columns).
# validate="1:1" makes pandas raise if the key combination is duplicated on
# either side.
opioid_pop = pd.merge(
    opioid,
    final_pop,
    how="outer",
    on=["state", "month", "county", "year"],
    sort=True,
    validate="1:1",
)

In [12]:
# Remove every row belonging to one of these five jurisdictions.
excluded_states = [
    "Alaska",
    "Guam",
    "Northern Mariana Islands",
    "Puerto Rico",
    "U.S. Virgin Islands",
]
opioid_pop = opioid_pop[~opioid_pop["state"].isin(excluded_states)]

In [13]:
# Show merged rows that have no population value (shipment rows with no
# matching population record).
opioid_pop.loc[opioid_pop["population"].isna()]

Unnamed: 0,date,year,month,state,county,shipment,population


In [14]:
# Keep only the years 2006 through 2012, both ends inclusive.
opioid_pop = opioid_pop[opioid_pop["year"].between(2006, 2012)]


In [15]:
# Show county-months that have a population record but no shipment value.
opioid_pop.loc[opioid_pop["shipment"].isna()]


Unnamed: 0,date,year,month,state,county,shipment,population
18829,NaT,2009,1,Arkansas,cleveland,,8649
18830,NaT,2010,1,Arkansas,cleveland,,8677
18831,NaT,2011,1,Arkansas,cleveland,,8674
18832,NaT,2012,1,Arkansas,cleveland,,8609
19804,NaT,2009,2,Arkansas,cleveland,,8649
...,...,...,...,...,...,...,...
494256,NaT,2008,12,Wisconsin,menominee,,4214
494257,NaT,2009,12,Wisconsin,menominee,,4191
494258,NaT,2010,12,Wisconsin,menominee,,4269
494259,NaT,2011,12,Wisconsin,menominee,,4372


In [16]:
# Per state, count the rows that have no shipment value.
opioid_pop.loc[
    opioid_pop["shipment"].isna(), ["state", "county", "year"]
].groupby(["state"]).count()

Unnamed: 0_level_0,county,year
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Arkansas,50,50
California,73,73
Colorado,648,648
Florida,41,41
Georgia,664,664
Hawaii,84,84
Idaho,358,358
Illinois,83,83
Indiana,108,108
Iowa,1,1


In [17]:
# Treat a missing shipment value as zero shipments, then compute shipments per
# resident.
#
# Only `shipment` is filled. A frame-wide fillna(0) would also write integer 0
# into the datetime `date` column and would turn any missing population into a
# zero denominator instead of leaving it visible as missing. `assign` evaluates
# its keyword arguments in order, so the ratio uses the filled `shipment`.
opioid_pop = opioid_pop.assign(
    shipment=lambda frame: frame["shipment"].fillna(0),
    shipment_per_resident=lambda frame: frame["shipment"] / frame["population"],
)

In [18]:
# County names singled out for inspection.
temp = ["lasalle", "oglalalakota", "shannon", "bedfordcity"]
# NOTE(review): this selection is not the cell's last expression, so its value
# is neither displayed nor stored -- confirm whether it is still needed.
opioid_pop.loc[opioid_pop["county"].isin(temp)]

# Drop both states entirely (presumably because of the county names above --
# confirm with the author).
drop = ["South Dakota", "Virginia"]
opioid_pop = opioid_pop.loc[~opioid_pop["state"].isin(drop)]

In [19]:
# Sanity checks on the merged table before it is written out.
# Population lies within [0, 10,000,000].
assert opioid_pop["population"].between(0, 10000000).all()
# The per-resident ratio is never negative.
assert opioid_pop["shipment_per_resident"].ge(0).all()
# Every year has exactly 34968 rows (= 2914 x 12; presumably counties x months
# -- TODO confirm).
assert opioid_pop.groupby(["year"])["county"].count().eq(34968).all()


In [20]:
# Rebuild `date` for every row from its year and month, parsed from
# "<year>/<month>" strings (no day component is supplied).
year_text = opioid_pop["year"].astype(int).astype(str)
month_text = opioid_pop["month"].astype(int).astype(str)
opioid_pop["date"] = pd.to_datetime(year_text + "/" + month_text)

# Drop the integer `year` and `month` columns and rename `date` to `year`.
# Subsequent scripts already use `year` as the name of the date column, so this
# keeps the changes they need to a minimum.
opioid_pop = opioid_pop.drop(columns=["year", "month"])
opioid_pop = opioid_pop.rename(columns={"date": "year"})

# Save the merged monthly table.
opioid_pop.to_parquet(
    "../20_intermediate_files/opioid_population_monthly.parquet",
    engine="fastparquet",
)