In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

A few days ago, I watched a documentary called 'The Pharmacist'. It is a dark but riveting account of how a father (Dan Schneider) lost his son due to addiction and then made it his personal mission to take down pill mills in Louisiana so that young lives would not be wrongfully taken. He was at the forefront of combating the nation's devastating opioid epidemic. He inspired me and made me want to get a glimpse of how he saw the opioid epidemic before anyone else did.

In this exploratory data analysis, my aim is to: 
* Understand the trend behind the opioid-related deaths and the prescriptions dispensed by US retailers, averaged across the states
* Know which state is at the highest risk, as measured by the crude rate
* Understand trend or growth in the amount of prescription dispensed by US retailers from 1999 to 2014
* Generate and understand the Opioid Prescribing Rate per person across the states
* Determine how many states are at a high risk of high OPR 
* Generate a heatmap based on the opioid prescribing rate

It took me some time to find a suitable dataset mainly because of the following reasons:
* There are a couple of state-only prescription-related data, but it was not easy to find the entire nation's data
* I managed to find data for the entire nation, but it was not available as CSV, only in SPSS, SAS, R and similar formats, which I do not know how to convert, nor do I have the tools to open those data sources at the moment
* There was a good data source, however, it was already in Kaggle and I wanted to explore extracting data that is not in Kaggle at the moment. 

I found the following dataset from the open source data.world and further cleaned it up so that we can explore and work on some useful analysis from it.

In [None]:
# Load the original (uncleaned) CSV export and keep it as the working frame `df`
raw_csv_path = "../input/opioid-overdose-deaths/Multiple Cause of Death 1999-2014 v1.1.csv"

dataset = pd.read_csv(raw_csv_path)
df = pd.DataFrame(dataset)
df

Firstly, I want to drill down to just knowing the states, year, deaths, population, crude rate and prescriptions dispensed by US retailers in that year (millions). 

In [None]:
# Give the long prescriptions column a shorter, more readable name
column_renames = {
    "Prescriptions Dispensed by US Retailers in that year (millions)": "Prescriptions (mils)",
}
updated_df = df.rename(columns=column_renames)

# Remove the two crude-rate confidence-interval columns
confidence_interval_cols = [
    'Crude Rate Lower 95% Confidence Interval',
    'Crude Rate Upper 95% Confidence Interval',
]
df = updated_df.drop(columns=confidence_interval_cols)
df

Some of the fields only contain 'Suppressed' or 'Unreliable' instead of a number, and keeping such rows would not help in getting a descriptive understanding of the trend. Hence, I deleted every row that contains either of these terms.

In [None]:
# Remove every row whose Deaths or Crude Rate cell holds a placeholder
# ('Suppressed' / 'Unreliable') rather than a number
placeholder_values = ['Suppressed', 'Unreliable']
has_placeholder = df['Deaths'].isin(placeholder_values) | df['Crude Rate'].isin(placeholder_values)

drop_rows = df[has_placeholder].index
df.drop(drop_rows, inplace=True)
df

Now let's take a look at how opioid deaths are distributed over these 15 years. This gives a broad understanding of the general trend.
The data table is organised by state and then split into years, so it cannot be read as a single series. To understand the general trend of opioid deaths over the 15-year period, I need to group the rows by state and aggregate the values in the Deaths, Population & Prescriptions columns.

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so use whichever name this installation provides
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)

# convert the measurement columns to numeric dtypes so they can be aggregated
numeric_cols = ['Deaths', 'Population', 'Crude Rate', 'Prescriptions (mils)']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# per-state mean of deaths and prescriptions across all years, highest deaths first.
# The columns are selected with a list: tuple-style selection on a groupby
# (['Deaths', 'Prescriptions (mils)'] without the inner brackets) fails on pandas 2.x
avg_deaths = (
    df.groupby('State')[['Deaths', 'Prescriptions (mils)']]
    .mean()
    .sort_values(by='Deaths', ascending=False)
)
avg_deaths.head(10) # top 10 states that has the highest average deaths due to opioid

Worryingly, Washington state took the 9th spot for the highest average opioid-related deaths in the nation over the 15 years of data. An interesting finding from these averaged values is how similar the average prescriptions of opioids are across many of the states over the 15-year period. I actually tried averaging it manually in the CSV file because I thought it was an error, but the numbers appear to be correct. This made me wonder whether the data has an issue, or where opioid users got their drugs from if the prescription distribution is roughly the same across every state.

**Exploration (1)**
I decided to plot a horizontal bar chart to understand the trend behind the average opioid-related deaths across states in the nation and the prescriptions dispensed by US retailers (in millions) over 15 years. From the chart below, it's interesting to see how the average prescriptions are very similar across the states, yet the average number of deaths differs vastly.

In [None]:
# Horizontal bar chart of the per-state averages held in avg_deaths
ax = avg_deaths.plot.barh(figsize=(20, 20))
ax.set_title('Annual Average Opioid-related Deaths Across States', fontsize=25, fontweight='bold')
ax.set_ylabel('States', fontsize=18, fontweight='bold')
ax.set_xlabel('Average Number of Deaths due to Opioid', fontsize=18, fontweight='bold')

# Write each state's rounded average death count just past the end of its bar
# reference: https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
rounded_deaths = avg_deaths['Deaths'].round()
for row_pos, death_count in enumerate(rounded_deaths):
    label_x = death_count + 3
    label_y = row_pos - .35
    ax.text(label_x, label_y, str(death_count), color='black', fontweight='regular')
plt.show()

However, although that gives me a perspective across states, it might not paint a full comparative picture: states have very different populations, which affects how large a share of each state's population is at risk. For this exploration, we will be using the crude rate data. **A crude rate is the number of new cases (or deaths) occurring in a specified population per year, usually expressed as the number of cases per 100,000 population at risk.**

**Exploration (2)**
Using each state's average crude rate data to determine which state is at the highest risk.

In [None]:
# Mean crude rate per state across all years, highest first
crude_by_state = df.groupby('State')['Crude Rate']
avg_crude = crude_by_state.mean().sort_values(ascending=False)

# Bar chart of the average crude rate for every state
crude_bar = avg_crude.plot.bar(figsize=(20, 5), legend=True)
crude_bar.set_title('Average no. of new cases/deaths per 100,000 people in each state in a year', fontsize=25, fontweight='bold')
crude_bar.set_ylabel('No. of deaths/100,000 people', fontsize=18, fontweight='bold')
crude_bar.set_xlabel('States', fontsize=18, fontweight='bold')
plt.xticks(rotation=75)
plt.show()

Surprisingly, as you can see above, West Virginia is in fact the state at highest risk, while California, although it has the highest number of opioid deaths, is far behind West Virginia.

**Exploration (3)**
Understand the trend or growth in the amount of prescriptions dispensed by US retailers from 1999 to 2014.
This would give us a glimpse into how the amount of prescriptions dispensed might have an effect on the death rates in the nation.

In [None]:
# Per-year totals of deaths and of the prescriptions column, summed over every state row.
# The columns are selected with a list: tuple-style selection on a groupby fails on pandas 2.x.
# NOTE(review): 'Prescriptions (mils)' was renamed from 'Prescriptions Dispensed by US
# Retailers in that year (millions)', which reads like one nationwide figure repeated on
# each state row; if so, summing it across states inflates the value - confirm against the data.
avg_prescrip = (
    df.groupby('Year')[['Deaths', 'Prescriptions (mils)']]
    .sum()
    .sort_values(by='Year', ascending=False)
)
prescrip_effect = avg_prescrip.plot(kind = 'line', figsize=(20,10), legend = True, linewidth=10)

# title and axis labels for the line graph
plt.title('No. of Opiate-related Deaths & Prescription (Millions) over 15 years',fontsize = 25, fontweight='bold')
plt.ylabel('No. of opiate deaths & opiate prescription',fontsize = 18, fontweight='bold')
# the dataset covers 1999-2014 (the label previously claimed 1994)
plt.xlabel('Year 1999 to 2014', fontsize = 18, fontweight='bold')
plt.xticks(rotation=75)
plt.show()

Just as the documentary explained, the rise in opiate-related deaths was very much caused, or at least started, by the rise in unnecessary prescriptions of opiates by doctors and clinics, leading to addiction and abuse of painkillers.

**Exploration (4)**
Generate and understand the Opioid Prescribing Rate (OPR) per person across the states. This takes into account each state's population and how many opiate prescriptions were dispensed relative to it.

In [None]:
# Opioid Prescribing Rate (OPR): prescriptions per person.
# 'Prescriptions (mils)' is expressed in millions, so scale it to a plain count
# before dividing by the population.
# NOTE(review): the surrounding text also calls this a rate "per 100", but no
# per-100 scaling is applied here - confirm the intended unit.
prescription_count = df['Prescriptions (mils)'] * 1000000

# store the rate, rounded to 2 decimals, in a new OPR column
df['OPR'] = (prescription_count / df['Population']).round(decimals=2)
df.sort_values(by='OPR', ascending=False)

Something looks amiss in the data above: California has the highest number of opiate-related deaths, yet it has the lowest OPR per person, whereas Wyoming did not make the top 5 for opiate-related deaths but has the highest OPR. At this point, I am wondering whether the prescriptions data is off (or my calculations are wrong), whether opiate users in some states, like California, get their opiates from other sources, whether the environmental context drives increased drug abuse, or whether populations with lower death rates are simply at lower risk of drug abuse even when their prescription rate is high. This really got me thinking about the risk of relying purely on data: it might not be wise to read a simple correlation between prescriptions and opiate abuse.

**Exploration (5)**
Determine how many states are at high risk because of a high OPR.

In [None]:
# create opr ranking system
def opr_rates(opr):
    """Map an OPR value to a prescription-level label.

    Bands (contiguous, so no numeric value falls between them):
        opr > 300          -> "Too High Prescription"
        100 <= opr <= 300  -> "High Prescription"
        50 <= opr < 100    -> "Moderate Prescription"
        opr < 50           -> "Low Prescription"

    An OPR of exactly 300 used to match no band and returned None; it is now
    rated "High Prescription".

    Returns None when `opr` is NaN, because every comparison with NaN is False.
    """
    if opr > 300:
        return "Too High Prescription"
    if opr >= 100:
        return "High Prescription"
    if opr >= 50:
        return "Moderate Prescription"
    if opr < 50:
        return "Low Prescription"
    # only reached for values that are not comparable, such as NaN
    return None
    
# Per-state mean of OPR and deaths across all years, ordered by state name (descending).
# The columns are selected with a list: tuple-style selection on a groupby
# (['OPR', 'Deaths'] without the inner brackets) fails on pandas 2.x
avg_opr = df.groupby('State')[['OPR', 'Deaths']].mean().sort_values(by = 'State', ascending=False)

# show the rating label assigned to each state's average OPR
avg_opr['OPR'].apply(opr_rates)

In [None]:
# Count how many states fall under each OPR rating (based on their average OPR)
# and show the split as a pie chart
opr_labels = avg_opr['OPR'].apply(opr_rates)
opr_pie = opr_labels.value_counts()

plt.title('Nation OPR Ratings', fontsize=18, fontweight='bold')
opr_pie.plot(kind='pie', figsize=(10, 8))

In [None]:
# Side-by-side horizontal bars of average OPR and average opiate-related deaths
# for every state, to compare the two variables directly
opr_effect = avg_opr.plot.barh(figsize=(20, 10), legend=True, linewidth=20)

# title and axis labels
opr_effect.set_title('OPR vs Opiate-Death Levels Across States', fontsize=18, fontweight='bold')
opr_effect.set_ylabel('States', fontsize=18, fontweight='bold')
opr_effect.set_xlabel('Average Opiate Prescription Per Person & Opiate-related Death Numbers ', fontsize=18, fontweight='bold')
plt.xticks(rotation=75)
plt.show()

I am also interested in seeing how the OPR compares across states and years on a heatmap.

**Exploration (6)**
Generate a heatmap based on the opioid prescribing rate.
I thought this would be a good way to sum up the exploratory data analysis!


In [None]:
# Pivot the long table into a State x Year grid of OPR values
heatmap_data = df.pivot_table(values='OPR', index=['State'], columns=['Year'])
heatmap_data

In [None]:
import seaborn as sb

# Draw the State x Year OPR grid as a heatmap on an explicitly sized axes
fig, ax = plt.subplots(figsize=(20, 15))
sb.heatmap(heatmap_data, cmap="BuGn", linewidths=.5, ax=ax)

# title and axis labels
ax.set_title('OPR Values Across States and Years', fontsize=18, fontweight='bold')
ax.set_ylabel('States', fontsize=18, fontweight='bold')
ax.set_xlabel('Years ', fontsize=18, fontweight='bold')

plt.show()

Below code is to test exporting the heatmap on the google maps. Just exploring around! 

In [None]:
# pip install opencage # also added in the console

In [None]:
# # pip installed geopy and gmplot via the console already
# import gmplot
# # For improved table display in the notebook
# from IPython.display import display

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_1 = user_secrets.get_secret("opencage") # make sure this matches the Label of your key
# key1 = secret_value_1

# from opencage.geocoder import OpenCageGeocode
# geocoder = OpenCageGeocode(key1)

# for i in df['State']:
#     query = i  
#     results = geocoder.geocode(query)
#     lat = str(results[0]['geometry']['lat'])
#     lng = str(results[0]['geometry']['lng'])
    


# gmap = gmplot.GoogleMapPlotter(34.0522, -118.2437, 10)
# gmap.draw("my_heatmap.html")