In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

sns.set(style = 'darkgrid')

In [None]:
# Read the raw mortality table, then show every column name it contains.
m = pd.read_csv('../input/mort.csv')
m.columns.tolist()

We have a lot of columns that we will not use. Rather than type out all the names, I took the full column list and deleted the ones I didn't want. We can then iterate through the column names and keep only the desired columns.

Also dropping rows that have aggregates in them already.

In [None]:
# Columns to retain: the three identifier columns plus one mortality-rate
# column per reported year.
tokeep = ['Location', 'FIPS', 'Category'] + [
    'Mortality Rate, {}*'.format(report_year)
    for report_year in (1980, 1985, 1990, 1995, 2000, 2005, 2010, 2014)
]

# Keep only rows whose Location contains a comma, and renumber the rows
# from zero afterwards. Missing locations count as "no comma".
has_comma = m['Location'].str.contains(',', regex=False, na=False)
m = m[has_comma].reset_index(drop=True)

In [None]:
# Keep only the columns listed in `tokeep`.
# A boolean column mask is used instead of dropping columns while stepping
# through them by position: after a drop, the next column slides into the
# freed slot and a position counter that then advances never examines it,
# so adjacent unwanted columns would survive.
m = m.loc[:, m.columns.isin(tokeep)].copy()
m.info()

This splits the county and state out of the Location field and adds each to a new column.

In [None]:
# Split Location on its first comma into County (text before the comma) and
# State (text after it). str.partition returns three columns: the part
# before the separator, the separator itself, and the part after it.
# Whitespace around each piece is stripped rather than assuming exactly one
# space follows the comma.
location_parts = m['Location'].str.partition(',')
m['County'] = location_parts[0].str.strip()
m['State'] = location_parts[2].str.strip()
m.info()

Now that we have all the columns we want, we need to unpivot the data into a more useful (long) format.

In [None]:
# Year-specific rate columns that get unpivoted into rows.
rate_columns = [
    'Mortality Rate, 1980*',
    'Mortality Rate, 1985*',
    'Mortality Rate, 1990*',
    'Mortality Rate, 1995*',
    'Mortality Rate, 2000*',
    'Mortality Rate, 2005*',
    'Mortality Rate, 2010*',
    'Mortality Rate, 2014*',
]

# Wide -> long: each input row yields one output row per rate column, with
# the rate stored under 'Mortality Rate'.
n = pd.melt(
    m,
    id_vars=['Location', 'FIPS', 'Category', 'County', 'State'],
    value_vars=rate_columns,
).rename(columns={'value': 'Mortality Rate'})

# Pull the four-digit year out of labels such as 'Mortality Rate, 1980*'.
# Year is stored as float, as before; later cells compare it numerically
# (e.g. Year == 2014).
n['Year'] = n['variable'].str.extract(r'(\d{4})', expand=False).astype(float)

# `columns=` keyword: the positional axis argument to drop() is not accepted
# by current pandas.
n = n.drop(columns='variable')

In [None]:
# Alphabetical state list, used to order the violins along the x axis.
y = sorted(n['State'].unique())

# One violin per state for the 2014 rates, with quartile lines inside.
plt.figure(figsize=(15, 3))
sns.violinplot(
    data=n[n['Year'] == 2014],
    x='State',
    y='Mortality Rate',
    order=y,
    inner='quartile',
    palette='Set2',
)

This shows a severe skew in the data. To visualize it a little better, we will reduce the skew with a log transform and graph the states on the y axis.

In [None]:
# Add a log(1 + rate) column to tame the skew.
n['Sk Mortality Rate'] = np.log1p(n['Mortality Rate'])

# Alphabetical state list, used to order the violins along the y axis.
y = sorted(n['State'].unique())

# Tall, narrow figure: one horizontal violin per state for 2014.
plt.figure(figsize=(3, 15))
sns.violinplot(
    data=n[n['Year'] == 2014],
    x='Sk Mortality Rate',
    y='State',
    order=y,
    inner='quartile',
    palette='Set2',
)

To look at mortality rates correlations we will use a correlation matrix. The matrix compares variables, so we need to pivot the data to look at Category as the columns.

First I am going to clean up some of the column names to make them fit better on the screen.

In [None]:
# Short display labels for the pivoted Category columns.
# NOTE(review): these are assigned by position, so this list must be in the
# same order as the columns pivot_table produces -- confirm against the raw
# Category values before editing either side.
ocols = [
    'Cardiovascular',
    'Chronic resp',
    'Chronic liver',
    'Diabetes',
    'Diarrhea',
    'Digestive diseases',
    'Non Natural',
    'HIV/AIDS and TB',
    'Maternal disorders',
    'Mental disorders',
    'Musculoskeletal disorders',
    'Tropical diseases',
    'Neonatal disorders',
    'Neoplasms',
    'Neurological disorders',
    'Nutritional deficiencies',
    'Other communicable',
    'Other non-communicable',
    'Violence',
    'Transport injuries',
    'Unintentional injuries',
]

# 2014 rates reshaped so that each Category becomes its own column.
# The aggregation is named by string: passing the np.mean callable makes
# recent pandas emit a FutureWarning.
o = pd.pivot_table(
    n[n['Year'] == 2014],
    values='Mortality Rate',
    index=['County', 'Year', 'State'],
    columns='Category',
    aggfunc='mean',
)

# Raises if the number of labels differs from the number of columns.
o.columns = ocols
o.head()

Building the correlation matrix

The matrix is mirrored across the diagonal line, so I went ahead and hid the top part to avoid confusion. Dark red squares mean that it is highly correlated.

In [None]:
# Pairwise correlation between the category columns.
corr = o.corr()
fg, ax = plt.subplots(figsize=(11, 9))

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The built-in `bool` is used as the dtype: the `np.bool` alias has been
# removed from NumPy and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask=mask, linewidths=.5, square=True, ax=ax)

Neoplasms and Cardiovascular disease are very highly correlated. Let's look at a scatter plot of just those two.

Adding a kde and bar graphs on the edges helps us understand the concentration of points in the center.

In [None]:
sns.set(style='ticks')

# Scatter of the two rates with marginal distributions, then a KDE contour
# layer drawn on the joint axes.
# x and y are passed by keyword because current seaborn no longer takes them
# positionally. `stat_func` is omitted: the option no longer exists, and
# passing None only ever disabled the annotation.
p = (
    sns.jointplot(
        data=o,
        x='Neoplasms',
        y='Cardiovascular',
        edgecolor='w',
        xlim=(0, 550),
        ylim=(0, 550),
    )
    .plot_joint(sns.kdeplot, zorder=1, levels=10)
)

Zooming back out from those two diseases, let's look at some of the relationships between other diseases.

Because the top and bottom corners are mirror images, we changed the bottom to a kde plot to get a better understanding of the density of values in the concentrations.

In [None]:
# First five category columns, selected by position. `.iloc` is used because
# the `.ix` indexer has been removed from pandas.
p = o.iloc[:, :5]

sns.set(style='darkgrid')
g = sns.PairGrid(p)
g.map_upper(plt.scatter, s=5)                 # scatter above the diagonal
g.map_diag(plt.hist, lw=0, edgecolor='w')     # histogram on the diagonal
g.map_lower(sns.kdeplot, cmap='Blues_d')      # density contours below it

# Start both axes at zero; the upper limits stay automatic.
g.set(xlim=(0, None), ylim=(0, None))

Cardiovascular Analysis

Let's make a table with only cardiovascular diseases but with multiple years. Because there are many counties in each state, we will take the average for the state per year.

In [None]:
# Mean cardiovascular mortality rate per (Year, State), flattened back to
# ordinary columns in the order State, Year, Mortality Rate.
is_cardio = n['Category'] == 'Cardiovascular diseases'
cardio = (
    n.loc[is_cardio, ['Year', 'State', 'Mortality Rate']]
    .groupby(['Year', 'State'])
    .mean()
    .reset_index()
    .loc[:, ['State', 'Year', 'Mortality Rate']]
)
cardio.head()

We can use this table to see the year over year change in mortality rates of cardiovascular disease.

We can see that mortality rates are dropping in every state.

In [None]:
# One small panel per state, five panels per row. `height` is the current
# name of the facet-size parameter (formerly `size`).
grid = sns.FacetGrid(cardio, col='State', hue='State', col_wrap=5, height=2)

# Line with circular markers: mortality rate against year.
grid.map(plt.plot, 'Year', 'Mortality Rate', ms=4, marker='o')

# Shared x range across the reported years; y axis anchored at zero.
grid.set(xlim=(1980, 2014), ylim=(0, None))
grid.set_xticklabels(rotation=45)

grid.fig.tight_layout(w_pad=1)

The last chart showed us that there was a general decline in mortality rate, but it wasn't clear how the states were doing relative to each other.

For this we will find the % change in mortality rates for cardiovascular disease from 1980 to 2014.

To do this we will need to reshape our table so that the calculation is easy.

In [None]:
# Reshape to one row per State with one column per Year, then turn State
# back into an ordinary column.
# Note: this replaces `cardio` with the wide table, so the cell cannot be
# re-run on its own output (the 'Year' and 'Mortality Rate' columns are gone).
# The aggregation is named by string: passing the np.mean callable makes
# recent pandas emit a FutureWarning.
cardio = pd.pivot_table(
    cardio,
    values='Mortality Rate',
    index=['State'],
    columns='Year',
    aggfunc='mean',
).reset_index()

In [None]:
# The pivoted year labels are floats (e.g. 1980.0), which are awkward to
# reference, so the eight year columns get short names by position:
# 'One' is the earliest year and 'Eight' the latest.
cardio.columns = ['State', 'One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight']

In [None]:
# Fractional change from the first year column ('One') to the last ('Eight'):
# (last - first) / first. Negative values mean the rate fell.
# Computed column-wise, so it does not depend on the row index labels.
cardio['Delta'] = (cardio['Eight'] - cardio['One']) / cardio['One']

cardio.head()

Now let's see which state has had the largest reduction in mortality rate.

We are looking for the lowest number (high drop in mortality).

Congrats Massachusetts!

Further analysis might try to find some regional correlation to mortality rates decline.

In [None]:
sns.set(style="darkgrid")

# Single-panel dot plot: one row per state, ordered from the most negative
# Delta to the most positive. `height` is the current name of the
# panel-size parameter (formerly `size`).
g = sns.PairGrid(
    cardio.sort_values(by='Delta', ascending=True),
    x_vars=['Delta'],
    y_vars=['State'],
    height=10,
    aspect=.4,
)

g.map(sns.stripplot, size=10, orient='h', palette='coolwarm', edgecolor='w')

g.set(xlabel='', ylabel='')

titles = ['1980 - 2014 % Change']

for ax, title in zip(g.axes.flat, titles):
    # Title each panel and show only the horizontal guide lines.
    ax.set(title=title)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)