In [1]:
import numpy as np
import pandas as pd


In [33]:
# Load the Florida mortality data (`test`) and the pooled "other" mortality
# data (`control`, presumably the comparison states) from the intermediate
# parquet files.
test = pd.read_parquet('../20_intermediate_files/florida_mortality.parquet')
control = pd.read_parquet('../20_intermediate_files/other_mortality.parquet')

In [None]:
# Calculate the slope of deaths_per_100k over the pre-period.

# Name of the column holding the "pre"/"post" label for each row.
# It is kept in a variable so the code below can be reused for other states
# by changing only this value.
pre_post_indicator = "category_flo"

# calculate slope in the pre period by county
from scipy import stats


def reg(df, x, y):
    """Return the least-squares slope of column ``y`` regressed on column ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns (here: the rows of one county).
    x, y : str
        Names of the predictor and response columns.

    Returns
    -------
    float
        Slope reported by ``scipy.stats.linregress``; the intercept and the
        fit statistics are discarded.
    """
    fit = stats.linregress(df[x], df[y])
    return fit.slope


# Fit one pre-period trend per county: keep only the rows labelled "pre",
# then regress deaths_per_100k on year within each (state, county) group.
# reset_index(name='slope') turns the result into a flat frame with the
# columns state, county, slope.
slopes = (
    test.loc[test[pre_post_indicator] == "pre"]
    .groupby(["state", "county"])
    .apply(reg, "year", "deaths_per_100k")
    .reset_index(name='slope')
)

# Attach each county's slope to every one of its rows.
# `slopes` already carries state/county as ordinary columns, so it is merged
# as-is: resetting its index a second time would add a meaningless integer
# `index` column to `test`.
# how='left' keeps every row of `test`; indicator=True adds a `_merge` column
# in which counties without a pre-period slope show up as "left_only".
test = pd.merge(test, slopes, on=['state', 'county'], how='left', indicator=True)


In [53]:

def read_in(state, kind):
    """Read the intermediate parquet files for one state and for the pooled "other" group.

    Parameters
    ----------
    state : str
        File-name prefix of the state file, used exactly as given
        (``"{state}_{kind}.parquet"``).
        NOTE(review): the first cell reads ``florida_mortality.parquet`` in
        lower case while callers pass ``'Florida'``; confirm the on-disk
        file-name casing on case-sensitive file systems.
    kind : str
        Data set suffix, e.g. ``"mortality"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(state_data, control_data)``.
    """
    state_path = f"../20_intermediate_files/{state}_{kind}.parquet"
    control_path = f"../20_intermediate_files/other_{kind}.parquet"
    return pd.read_parquet(state_path), pd.read_parquet(control_path)

def prepare_data(state_data, control_data, state):
    """Label the two frames and split them into pre- and post-policy frames.

    Side effects (both inputs are modified in place):
      * ``control_data`` gains/overwrites the ``category_<abc>`` column
        ("pre" when ``year`` is before the state's cut year, else "post")
        and a ``type`` column set to "Control States";
      * ``state_data`` gains/overwrites a ``type`` column holding the state name.

    Parameters
    ----------
    state_data : pandas.DataFrame
        Rows of the treated state; must already contain ``category_<abc>``.
    control_data : pandas.DataFrame
        Rows of the control states; must contain ``year``.
    state : str
        One of "Florida", "Texas", "Washington" (anything else raises KeyError).

    Returns
    -------
    (pre, post, cut_year)
        ``pre`` stacks state rows then control rows; ``post`` stacks control
        rows then state rows; ``cut_year`` is the first "post" year.
    """
    policy_years = {"Florida": 2010, "Texas": 2007, "Washington": 2011}
    cut_year = policy_years[state]
    # e.g. "Florida" -> "category_flo"
    period_col = f"category_{state[0:3].lower()}"

    # Tag the control rows relative to this state's policy year.
    is_post = control_data["year"] >= cut_year
    control_data[period_col] = np.where(is_post, "post", "pre")
    control_data["type"] = "Control States"
    state_data["type"] = f"{state}"

    def _rows_in(frame, label):
        # rows of `frame` whose period label equals `label`
        return frame[frame[period_col] == label]

    pre = pd.concat([_rows_in(state_data, "pre"), _rows_in(control_data, "pre")])
    post = pd.concat([_rows_in(control_data, "post"), _rows_in(state_data, "post")])
    return pre, post, cut_year

# Load the Florida and pooled "other" mortality files, then split them into
# pre- and post-policy frames (the cut year for Florida is 2010).
# Note: prepare_data also adds columns to state_data / control_data in place.
state_data, control_data = read_in('Florida', 'mortality')
pre, post, cut_year = prepare_data(state_data, control_data, 'Florida')


In [71]:
# inputs: state_pre, control_pre
# Keep only the pre-policy rows of the treated-state and control frames.
# NOTE(review): 'category_flo' is hard-coded, so this cell is Florida-specific.
state_pre = state_data[state_data['category_flo'] == "pre"]
control_pre = control_data[control_data['category_flo'] == "pre"]
# Step 1: calculate slope for each county in state_pre, control_pre

# Step 2: for each county in state_pre, choose the most similar control counties
# (the matching code below takes the top 4, or the top n in the function version);
# candidates are ranked by 1) slope difference, then 2) average-population difference


In [142]:
def grab_encodings(kind):
    """Return the labels and metric column name associated with a data kind.

    Parameters
    ----------
    kind : str
        Either ``"shipment"`` or ``"mortality"``.

    Returns
    -------
    list of str
        Three entries, in this order:
        ``[0]`` a descriptive title, ``[1]`` the name of the per-100k metric
        column, ``[2]`` a label for that metric.

    Raises
    ------
    KeyError
        If ``kind`` is not one of the two supported values.
    """
    kind_coding = {
        "shipment": [
            "Opioid Shipments per County per Year",
            "shipment_kg_per_100k",
            "Opioid Shipments per 100,000 Residents (Kg)",
        ],
        "mortality": [
            "Opioid Mortality per County per Year",
            "deaths_per_100k",
            # the label previously ended with a stray, unbalanced ")"
            "Opioid Mortality per 100,000 Residents",
        ],
    }
    encodings = kind_coding[kind]
    return encodings

In [143]:
# calculate slope
def calulate_slope_by_county(df_pre, kind):
    """Attach each county's linear trend of the key metric as a ``slope`` column.

    For every (state, county) group the key metric of ``kind``
    (``shipment_kg_per_100k`` or ``deaths_per_100k``) is regressed on ``year``
    and the fitted slope is merged back onto every row of that county.

    The function is safe to call on a frame that already has a ``slope``
    column (for example one returned by an earlier call): the old column is
    replaced. Without this the merge would produce ``slope_x`` / ``slope_y``
    and later ``["slope"]`` selections would fail.

    Parameters
    ----------
    df_pre : pandas.DataFrame
        Must contain ``state``, ``county``, ``year`` and the key metric column.
        It is not modified.
    kind : str
        Data kind understood by ``grab_encodings``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index) with the extra ``slope`` column.
    """
    # get the key column name (shipment_kg_per_100k or deaths_per_100k)
    key_metric_column_name = grab_encodings(kind)[1]

    # helper function for running regression on one county's rows
    def reg(df, x, y):
        return stats.linregress(df[x], df[y]).slope

    # drop a slope left over from a previous call so the merge cannot clash
    df_pre = df_pre.drop(columns=["slope"], errors="ignore")

    # nothing to regress on: return the empty frame with an empty slope column
    if df_pre.empty:
        return df_pre.assign(slope=float("nan"))

    # calculate slopes for each county
    slopes = (
        df_pre.groupby(["state", "county"])
        .apply(reg, "year", key_metric_column_name)
        .reset_index(name="slope")
    )
    return pd.merge(df_pre, slopes, on=["state", "county"], how="left")

In [144]:
# `kind` picks the metric the slopes are fitted on. It is otherwise only
# assigned in a later cell of this notebook, so it is defined here as well;
# without it this cell raises NameError on a fresh kernel run from the top.
kind = 'mortality'

# add a per-county `slope` column to the pre-period frames
state_pre = calulate_slope_by_county(state_pre, kind)
control_pre = calulate_slope_by_county(control_pre, kind)

In [145]:
def calculate_avg_population(df_pre):
    """Attach each county's mean population as an ``avg_population`` column.

    The mean of ``population`` is taken over all rows of a (state, county)
    group and merged back onto every row of that county.

    The function is safe to call on a frame that already has an
    ``avg_population`` column (for example one returned by an earlier call):
    the old column is replaced. Without this the merge would produce
    ``avg_population_x`` / ``avg_population_y`` instead.

    Parameters
    ----------
    df_pre : pandas.DataFrame
        Must contain ``state``, ``county`` and ``population``. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index) with the extra ``avg_population`` column.
    """
    keys = ["state", "county"]
    # drop a value left over from a previous call so the merge cannot clash
    df_pre = df_pre.drop(columns=["avg_population"], errors="ignore")
    avg_pops = (
        df_pre.groupby(keys)["population"].mean().reset_index(name="avg_population")
    )
    return pd.merge(df_pre, avg_pops, on=keys, how="left")

In [146]:
# add a per-county `avg_population` column (mean population over the rows of
# each county) to the pre-period frames
state_pre = calculate_avg_population(state_pre)
control_pre = calculate_avg_population(control_pre)

In [None]:
# find similar counties in chunks below

In [95]:
# pull out these columns and dedupe to compare each test county to control counties
# (one row per county, tagged with where it came from).
# The frames are named *_filtered_columns because that is the name the
# matching loop in the next cell reads; the cell previously created
# *_for_comparison and then assigned into the undefined *_filtered_columns.
state_pre_filtered_columns = (
    state_pre[['state', 'county', 'slope', 'avg_population']]
    .drop_duplicates()
    .assign(source='state')
)
control_pre_filtered_columns = (
    control_pre[['state', 'county', 'slope', 'avg_population']]
    .drop_duplicates()
    .assign(source='control')
)

In [133]:
# Draft of the control-matching loop: for every treated county take the 4
# control counties with the closest slope (average population breaks ties)
# and collect all of their rows from control_data.
key_columns = ['state', 'county']

# (state, county) key of every control_data row. Matching on the pair avoids
# picking up unselected state/county cross combinations, which happens when
# state and county are tested separately and county names repeat across states.
control_row_keys = pd.MultiIndex.from_frame(control_data[key_columns])

matched_control_frames = []
# visit each treated county once (not once per year-row), so a match is not
# repeated for every year of the treated county
for current in state_pre_filtered_columns.drop_duplicates(subset=key_columns).itertuples(index=False):
    # calculate differences between current county and all control counties
    ranked_controls = control_pre_filtered_columns.assign(
        slope_diff=(control_pre_filtered_columns['slope'] - current.slope).abs(),
        avg_pop_diff=(control_pre_filtered_columns['avg_population'] - current.avg_population).abs(),
    ).sort_values(['slope_diff', 'avg_pop_diff'])
    # sort by difference, then select the top 4 counties
    selected_control_counties = ranked_controls[key_columns].iloc[0:4]
    # add data for the 4 selected counties
    is_selected = control_row_keys.isin(pd.MultiIndex.from_frame(selected_control_counties))
    matched_control_frames.append(control_data[is_selected])

# a control county matched to several treated counties appears once per match
selected_control = pd.concat(matched_control_frames) if matched_control_frames else pd.DataFrame()

In [147]:
def grab_control_counties(state_pre, control_pre, control_data, n):
    """Collect the ``control_data`` rows of the control counties that best match each treated county.

    For every treated county, control counties are ranked by the absolute
    difference in ``slope`` and, to break ties, by the absolute difference in
    ``avg_population``; the ``n`` closest are selected and all of their rows
    are taken from ``control_data``.

    Compared with a row-by-row loop over ``state_pre`` this:
      * visits each treated county once, so a match is not repeated for every
        year-row of the treated county;
      * selects control rows by the (state, county) pair, so counties sharing
        a name across states are not pulled in by accident;
      * concatenates once at the end instead of growing a frame in the loop.

    Parameters
    ----------
    state_pre, control_pre : pandas.DataFrame
        Pre-period rows with ``state``, ``county``, ``slope`` and
        ``avg_population`` columns. Neither frame is modified.
    control_data : pandas.DataFrame
        Full control data with ``state`` and ``county`` columns.
    n : int
        Number of control counties to keep per treated county.

    Returns
    -------
    pandas.DataFrame
        Rows of ``control_data`` (original index and column layout), grouped
        in treated-county order. A control county matched to several treated
        counties appears once per match. Empty (with ``control_data``'s
        columns) when ``state_pre`` has no rows.
    """
    key_columns = ["state", "county"]
    comparison_columns = key_columns + ["slope", "avg_population"]

    # pull out these columns and dedupe: one row per county
    state_pre_for_comparison = state_pre[comparison_columns].drop_duplicates()
    control_pre_for_comparison = control_pre[comparison_columns].drop_duplicates()

    # (state, county) key of every control_data row, built once outside the loop
    control_row_keys = pd.MultiIndex.from_frame(control_data[key_columns])

    matched_frames = []
    for treated in state_pre_for_comparison.itertuples(index=False):
        # calculate differences between current county and all control counties
        ranked_controls = control_pre_for_comparison.assign(
            slope_diff=(control_pre_for_comparison["slope"] - treated.slope).abs(),
            avg_pop_diff=(
                control_pre_for_comparison["avg_population"] - treated.avg_population
            ).abs(),
        ).sort_values(["slope_diff", "avg_pop_diff"])

        # sort by difference, then select the top n counties
        selected_control_counties = ranked_controls[key_columns].iloc[0:n]

        # keep the control_data rows whose (state, county) pair was selected
        is_selected = control_row_keys.isin(
            pd.MultiIndex.from_frame(selected_control_counties)
        )
        matched_frames.append(control_data[is_selected])

    if not matched_frames:
        return control_data.iloc[0:0]
    return pd.concat(matched_frames)

In [149]:
# data kind used by the calls in this notebook
kind = 'mortality'


def find_controls(kind, state_pre, control_pre, control_data, n=4):
    """Run the full control-matching pipeline and return the matched control rows.

    Both pre-period frames are enriched with a per-county slope (for the
    metric of ``kind``) and a per-county average population, and the result
    is handed to ``grab_control_counties``.

    Parameters
    ----------
    kind : str
        Data kind forwarded to ``calulate_slope_by_county``.
    state_pre, control_pre : pandas.DataFrame
        Pre-period rows of the treated state and of the control states.
    control_data : pandas.DataFrame
        Full control data from which the matched counties' rows are taken.
    n : int, default 4
        Number of control counties to select per treated county.

    Returns
    -------
    Whatever ``grab_control_counties`` returns for the enriched frames.
    """

    def _add_matching_features(df_pre):
        # slope first, then average population
        return calculate_avg_population(calulate_slope_by_county(df_pre, kind))

    return grab_control_counties(
        _add_matching_features(state_pre),
        _add_matching_features(control_pre),
        control_data,
        n,
    )

In [151]:
# Select the 2 closest control counties per treated county and display the
# resulting control rows (the bare last expression is rendered as the cell output).
find_controls(kind, state_pre, control_pre, control_data, n=2)

Unnamed: 0_level_0,state,county,year,Deaths,population,deaths_per_100k,Deaths_by_pop,category_flo,category_tex,category_was,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2236,Arkansas,Saline County,2003,0.0,88627,0.000000,0.000000,pre,pre,pre,Control States
2237,Arkansas,Saline County,2004,0.0,90961,0.000000,0.000000,pre,pre,pre,Control States
2238,Arkansas,Saline County,2005,10.0,93493,10.695988,0.000107,pre,pre,pre,Control States
2239,Arkansas,Saline County,2006,10.0,96661,10.345434,0.000103,pre,pre,pre,Control States
2240,Arkansas,Saline County,2007,0.0,100043,0.000000,0.000000,pre,post,pre,Control States
...,...,...,...,...,...,...,...,...,...,...,...
18728,Mississippi,Leake County,2011,0.0,23256,0.000000,0.000000,post,post,pre,Control States
18729,Mississippi,Leake County,2012,0.0,23195,0.000000,0.000000,post,post,post,Control States
18730,Mississippi,Leake County,2013,0.0,23267,0.000000,0.000000,post,post,post,Control States
18731,Mississippi,Leake County,2014,0.0,23181,0.000000,0.000000,post,post,post,Control States
