In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, inspect
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float

# Load the raw state-level records (one row per state / year / drug category)
df = pd.read_csv('cdc_data/states_mcd_per_year.csv')

# Pivot so that every drug category gets its own deaths column;
# combinations absent from the file are filled with 0
states_year = pd.pivot_table(
    df,
    index=['state', 'state-code', 'year', 'population'],
    columns='multiple-cause-of-death',
    values='deaths',
    aggfunc='max',
    fill_value=0,
).reset_index()

# Positional rename: the four index columns first, then the pivoted drug
# columns in the order they come out of the pivot
deaths_cols = ['heroin_deaths', 'methadone_deaths', 'other_opioids_deaths', 'other_synthetics_deaths']
states_year.columns = ['state', 'state_code', 'year', 'population'] + deaths_cols

# Suppressed counts become 0 so the deaths columns can be stored as integers
states_year = states_year.replace('Suppressed', 0)
states_year[deaths_cols] = states_year[deaths_cols].astype('int64')

# Deaths per 100,000 population for each drug, rounded to one decimal place
drugs = ['heroin', 'other_opioids', 'methadone', 'other_synthetics']
for col in drugs:
    states_year[f'{col}_death_rate'] = (
        states_year[f'{col}_deaths'].div(states_year['population']).mul(100000).round(1)
    )

# Final column layout: identifiers, death counts, population, death rates
states_year = states_year[
    ['state_code', 'state', 'year']
    + [f'{drug}_deaths' for drug in drugs]
    + ['population']
    + [f'{drug}_death_rate' for drug in drugs]
]


# Read in county data (the file name indicates the 2006-2017 range)
c_df = pd.read_csv('cdc_data/counties_mcd_2006-2017.csv')

# Use pivot_table() to create columns for each type of drug
counties_range = c_df.pivot_table(values='deaths', index=['county-code', 'county', 'state', 'population'],
                                  columns='multiple-cause-of-death', fill_value=0, aggfunc='max').reset_index()

# Rename columns (positional: four index columns, then the pivoted drug columns)
counties_range.columns = ['county_code','county','state','population','heroin_deaths','methadone_deaths',
                          'other_opioids_deaths','other_synthetics_deaths']

# Replace any instances of suppressed or missing data with 0
counties_range = counties_range.replace(['Suppressed','Missing'],0)

# Set data types in population and deaths columns as integers
c_deaths_cols = ['population','heroin_deaths','methadone_deaths','other_opioids_deaths','other_synthetics_deaths']

counties_range[c_deaths_cols] = counties_range[c_deaths_cols].astype('int64')

# A population of 0 is the placeholder written above for a 'Missing' value.
# Dividing by it would produce inf (or NaN for 0 deaths), so mask it and let
# the death rate of such a county be NaN instead.
county_population = counties_range['population'].replace(0, np.nan)

# Calculate the death rate per 100,000 population for each drug
for col in ['heroin', 'other_opioids', 'methadone', 'other_synthetics']:
    counties_range[col + '_death_rate'] = (counties_range[col + '_deaths'] / county_population * 100000).round(1)

# Reorder columns
counties_range = counties_range[['county_code','county','state','heroin_deaths','other_opioids_deaths',
                                 'methadone_deaths','other_synthetics_deaths','population',
                                 'heroin_death_rate','other_opioids_death_rate','methadone_death_rate',
                                 'other_synthetics_death_rate']]


# Function that scrapes prescribing rate data for each state
def state_prescribing_rate():
    """Scrape the CDC state prescribing-rate tables for 2006-2017 into one long dataframe.

    For each year the first HTML table found at
    ``https://www.cdc.gov/drugoverdose/maps/rxstate{year}.html`` is read. The
    yearly tables are inner-joined on state name and abbreviation, so only
    states present in every year are kept.

    Returns:
        pandas.DataFrame with columns ``state``, ``year`` (int64) and
        ``prescribing_rate`` (float64): one row per state and year, ordered by
        state (in scraped order) and then by year. Values published as '–'
        are returned as 0.

    Raises:
        Any exception raised by ``pd.read_html`` or ``pd.merge`` propagates to
        the caller; nothing is silently swallowed.
    """
    years = list(range(2006, 2018))

    # Build one wide table: state, abbr, then one rate column per year
    wide = None

    for year in years:
        url = f'https://www.cdc.gov/drugoverdose/maps/rxstate{year}.html'

        table = pd.read_html(url)[0]

        # Label the rate column with the year itself so that it becomes the
        # value of the 'year' column after reshaping
        table.columns = ['state', 'abbr', year]

        if wide is None:
            wide = table
        else:
            wide = pd.merge(wide, table, how='inner', on=['state', 'abbr'])

    wide = wide.drop(columns='abbr')

    # melt() stacks the data year by year; remember each state's row position
    # so the result can be put back in state-major order
    wide.insert(0, '_row', range(len(wide)))

    states_prescribe_df = wide.melt(id_vars=['_row', 'state'], var_name='year',
                                    value_name='prescribing_rate')

    states_prescribe_df['year'] = states_prescribe_df['year'].astype('int64')

    states_prescribe_df = states_prescribe_df.sort_values(['_row', 'year'])\
                                             .drop(columns='_row')\
                                             .reset_index(drop=True)

    # Replace any instances of missing data with 0 and store the rates as floats
    # NOTE(review): zeros standing in for missing values lower any mean
    # computed from this column - confirm that this is intended
    states_prescribe_df['prescribing_rate'] = states_prescribe_df['prescribing_rate']\
                                                .replace('–', 0).astype('float64')

    return states_prescribe_df

# Create state prescribing rate dataframe using function
states_prescribe_df = state_prescribing_rate()

# Merge death rate and prescribing rate dataframes; the outer join keeps
# state/year rows that appear in only one of the two sources
states_each_year = pd.merge(states_year,states_prescribe_df,how='outer',on=['state','year'])

# Create dataframe with the mean prescribing rate for each state across all years
# (rows without a prescribing rate are NaN and are skipped by the mean)
states_prescribing_rate = states_each_year.groupby('state')['prescribing_rate'].agg('mean').round(1)\
                            .to_frame(name='mean_prescribing_rate').reset_index()

# Create dataframe with the mean death rate for each state from 2006 onward.
# A boolean filter is used instead of dropping hard-coded year labels, which
# raises KeyError as soon as one of the listed years is absent from the data.
states_death_rate = states_each_year.loc[states_each_year['year'] >= 2006]\
                                        .groupby('state')['other_opioids_death_rate'].agg('mean').round(1)\
                                        .to_frame(name='mean_opioids_death_rate').reset_index()

# Merge mean prescribing rate and mean death rate into one dataframe
states_rates = pd.merge(states_death_rate,states_prescribing_rate,how='inner',on='state')


# Function that scrapes prescribing rate data for each county
def county_prescribing_rate():
    """Scrape the CDC county prescribing-rate tables for 2006-2017 into one wide dataframe.

    For each year the first HTML table found at
    ``https://www.cdc.gov/drugoverdose/maps/rxcounty{year}.html`` is read.
    The 2006-2016 tables are inner-joined on county, state and county code;
    the 2017 table is joined on county code only.

    Returns:
        pandas.DataFrame with columns ``county``, ``county_code``,
        ``prescribing_rate_2006`` ... ``prescribing_rate_2017`` (float64) and
        ``mean_prescribing_rate`` (row mean of the yearly rates, rounded to
        one decimal place). Values published as '–' are counted as 0.

    Raises:
        Any exception raised by ``pd.read_html`` or ``pd.merge`` propagates to
        the caller; nothing is silently swallowed.
    """
    years = list(range(2006,2017))

    merge_keys = ['county','state','county_code']

    counties_prescribe_df = None

    for year in years:
        url = f'https://www.cdc.gov/drugoverdose/maps/rxcounty{year}.html'

        table = pd.read_html(url)[0]

        table.columns = ['county','state','county_code',f'prescribing_rate_{year}']

        # Keep only the text before the first ', ' in the county label
        table['county'] = table['county'].str.split(', ',n=1,expand=True)[0]

        # Merge each year's dataframe into one
        if counties_prescribe_df is None:
            counties_prescribe_df = table
        else:
            counties_prescribe_df = pd.merge(counties_prescribe_df,table,how='inner',on=merge_keys)

    # Format changed for 2017, scrape separately
    url_2017 = 'https://www.cdc.gov/drugoverdose/maps/rxcounty2017.html'

    table_2017 = pd.read_html(url_2017)[0]

    table_2017.columns = ['county','state','county_code','prescribing_rate_2017']

    table_2017 = table_2017.drop(columns=['county','state'])

    # Merge 2017 data with other years
    counties_prescribe_df = pd.merge(counties_prescribe_df,table_2017,how='inner',on='county_code')

    counties_prescribe_df = counties_prescribe_df.drop(columns='state')

    # Replace any instances of missing data with 0 and store the rates as floats
    # NOTE(review): zeros standing in for missing values lower the mean
    # computed below - confirm that this is intended
    prescribe_cols = [f'prescribing_rate_{x}' for x in range(2006, 2018)]

    counties_prescribe_df[prescribe_cols] = counties_prescribe_df[prescribe_cols].replace('–',0)\
                                                                                 .astype('float64')

    # Create column with the mean prescribing rate for each county across all years
    counties_prescribe_df['mean_prescribing_rate'] = counties_prescribe_df[prescribe_cols].mean(axis=1).round(1)

    return counties_prescribe_df

# Scrape the county prescribing rates
counties_prescribe_df = county_prescribing_rate()

# Join death rates and prescribing rates on the county code. The county name
# is taken from the death data, so the scraped name is dropped before joining.
counties_all_years = pd.merge(
    counties_range,
    counties_prescribe_df.drop(columns='county'),
    how='inner',
    on='county_code',
)

# Write every result table to its own csv file, without the row index
for frame, csv_name in [
    (states_each_year, 'states_each_year.csv'),
    (states_prescribing_rate, 'states_prescribing_rate.csv'),
    (states_rates, 'states_rates.csv'),
    (counties_all_years, 'counties_all_years.csv'),
]:
    frame.to_csv(csv_name, index=False)

In [2]:
# Preview the first rows of the merged per-state, per-year table
states_each_year.head()

Unnamed: 0,state_code,state,year,heroin_deaths,other_opioids_deaths,methadone_deaths,other_synthetics_deaths,population,heroin_death_rate,other_opioids_death_rate,methadone_death_rate,other_synthetics_death_rate,prescribing_rate
0,1,Alabama,1999,0,10,16,10,4430141,0.0,0.2,0.4,0.2,
1,1,Alabama,2000,0,26,12,11,4447100,0.0,0.6,0.3,0.2,
2,1,Alabama,2001,0,22,26,0,4467634,0.0,0.5,0.6,0.0,
3,1,Alabama,2002,0,30,33,0,4480089,0.0,0.7,0.7,0.0,
4,1,Alabama,2003,0,22,25,0,4503491,0.0,0.5,0.6,0.0,


In [3]:
# Preview the first rows of the per-state mean prescribing rate table
states_prescribing_rate.head()

Unnamed: 0,state,mean_prescribing_rate
0,Alabama,128.3
1,Alaska,63.9
2,Arizona,78.9
3,Arkansas,114.5
4,California,51.9


In [4]:
# Preview the first rows of the per-state mean death rate / mean prescribing rate table
states_rates.head()

Unnamed: 0,state,mean_opioids_death_rate,mean_prescribing_rate
0,Alabama,1.7,128.3
1,Alaska,5.4,63.9
2,Arizona,4.4,78.9
3,Arkansas,3.8,114.5
4,California,2.7,51.9


In [5]:
# Preview the first rows of the merged county table (death rates plus yearly prescribing rates)
counties_all_years.head()

Unnamed: 0,county_code,county,state,heroin_deaths,other_opioids_deaths,methadone_deaths,other_synthetics_deaths,population,heroin_death_rate,other_opioids_death_rate,...,prescribing_rate_2009,prescribing_rate_2010,prescribing_rate_2011,prescribing_rate_2012,prescribing_rate_2013,prescribing_rate_2014,prescribing_rate_2015,prescribing_rate_2016,prescribing_rate_2017,mean_prescribing_rate
0,1001,Autauga County,Alabama,0,0,0,0,653405,0.0,0.0,...,147.5,151.7,144.1,157.8,166.7,145.3,129.9,129.6,106.6,141.2
1,1003,Baldwin County,Alabama,0,75,61,37,2276081,0.0,3.3,...,143.9,143.8,150.0,156.4,154.3,143.5,132.1,123.8,106.7,137.8
2,1005,Barbour County,Alabama,0,0,0,0,324547,0.0,0.0,...,88.5,97.0,99.4,118.3,107.5,102.0,93.3,92.7,90.7,93.3
3,1007,Bibb County,Alabama,0,0,0,0,271373,0.0,0.0,...,109.6,58.9,57.7,69.2,70.5,75.8,69.4,97.2,80.6,86.6
4,1009,Blount County,Alabama,15,10,17,13,687927,2.2,1.5,...,56.2,60.1,64.8,66.6,65.9,63.2,57.9,56.9,48.9,57.0
