In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import plotly #used plotly graphing library for maps
import plotly.express as px
import plotly.graph_objects as go 

### EDA / Data Viz (Output cleared because of file size. Visuals are in `visuals` folder

In [None]:
# Load the cleaned, merged dataset that every plot below is built from.
merged_csv_path = '../data/cleaned_merged_df.csv'
final_df = pd.read_csv(merged_csv_path)

In [None]:
# The unnamed first CSV column is relabelled as fips_code.
final_df = final_df.rename(columns={'Unnamed: 0': 'fips_code'})

**Function to build heatmap correlations**

In [None]:
def build_heatmap_simple(df, string):
    """Plot and save a one-column heatmap of feature correlations with ``deaths``.

    Every column of ``df`` whose name contains ``string`` is correlated with the
    ``deaths`` column. The coefficients are drawn in ascending order and the
    figure is written to ``../visuals/{string}.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and a numeric ``deaths`` column.
    string : str
        Substring used to select feature columns; also used in the plot title
        and in the output file name.

    Raises
    ------
    KeyError
        If ``df`` has no (numeric) ``deaths`` column.
    """
    selected = [col for col in df.columns if string in col]
    # Only add the target when the substring did not already match it; a
    # duplicated 'deaths' column would make the sort below ambiguous.
    if 'deaths' not in selected:
        selected.append('deaths')

    # Keep numeric/bool columns only: pandas >= 2.0 no longer drops
    # non-numeric columns silently inside DataFrame.corr().
    numeric_df = df[selected].select_dtypes(include=['number', 'bool'])
    death_corr = numeric_df.corr()[['deaths']].sort_values('deaths')

    fig, ax = plt.subplots(figsize=(8, 5))
    # Draw on the axes created above instead of relying on pyplot's current axes.
    sns.heatmap(death_corr,
                annot=True,
                cmap='twilight',
                vmin=-1,
                vmax=1,
                ax=ax)
    ax.set_title(f'{string} and deaths correlation')
    fig.tight_layout()
    fig.savefig(f'../visuals/{string}.png', dpi=100)

In [None]:
# Correlation with deaths for every column whose name contains 'grad'.
build_heatmap_simple(df=final_df, string='grad')

**Interpretation:** This graph is not particularly telling, but we could potentially infer that a higher education is correlated to lower death rate

In [None]:
# Correlation with deaths for every column whose name contains 'income'.
build_heatmap_simple(df=final_df, string='income')

**Interpretation:** lower income is more positively correlated to death rate

In [None]:
# Correlation with deaths for every column whose name contains 'employ'.
build_heatmap_simple(df=final_df, string='employ')

**Interpretation:** unemployment and working class occupations have a more positive correlation to death rate

In [None]:
# Correlation with deaths for every column whose name contains '20+'.
build_heatmap_simple(df=final_df, string='20+')

In [None]:
# Correlation with deaths for every column whose name contains '65+'.
build_heatmap_simple(df=final_df, string='65+')

In [None]:
# Show the column names available in the merged frame.
final_df.columns

In [None]:
# Show the (rows, columns) size of the merged frame.
final_df.shape

In [None]:
# from scipy import stats
# final_df[(np.abs(stats.zscore(final_df._get_numeric_data())) < 3).all(axis=1)]

# dropping outliers removes too much data

### EDA / Data Viz

**Plot death rate per county**

In [None]:
# Normalize FIPS codes to zero-padded 5-character strings
# (presumably the id format used by the county GeoJSON; verify).
fips_as_int = final_df['fips_code'].astype(int)
final_df['fips_code'] = fips_as_int.astype(str).str.zfill(5)

In [None]:
# County boundary GeoJSON from the plotly datasets repository; loaded once here
# and reused through `counties_json` by later map cells.
COUNTIES_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(COUNTIES_GEOJSON_URL) as response:
    counties_json = json.load(response)

fig = px.choropleth(final_df, geojson=counties_json, locations='fips_code', color='death_rate',
                    color_continuous_scale='portland',
                    # Series.max() skips NaN; the builtin max() is not NaN-safe.
                    range_color=(0, final_df['death_rate'].max()),
                    scope='usa',
                    # The key must match the plotted column for the label to apply.
                    labels={'death_rate': 'Death Rate'},
                    )

fig.update_layout(title_text='Covid Death Rate by US county as of June 2021',
                  coloraxis_colorbar=dict(title='Death Rate'))
fig.show();

**Same map for poverty rate**

In [None]:
# Reuses `counties_json` loaded for the death-rate map instead of downloading
# the same GeoJSON a second time.
fig = px.choropleth(final_df, geojson=counties_json, locations='fips_code', color='poverty_rate',
                    color_continuous_scale='portland',
                    # Series.max() skips NaN; the builtin max() is not NaN-safe.
                    range_color=(0, final_df['poverty_rate'].max()),
                    scope='usa',
                    # The key must match the plotted column for the label to apply.
                    labels={'poverty_rate': 'Poverty Rate'},
                    )

# NOTE(review): the title previously read 'Covid Poverty Rate ...' (copy-paste
# from the death-rate map); confirm that 'June 2021' matches the poverty data vintage.
fig.update_layout(title_text='Poverty Rate by US county as of June 2021',
                  coloraxis_colorbar=dict(title='Poverty Rate'))
fig.show();

**Ten largest correlations to Poverty Rate**

In [None]:
# Correlate numeric columns only: pandas >= 2.0 no longer drops non-numeric
# columns (e.g. the string fips_code) from DataFrame.corr() on its own.
numeric_df = final_df.select_dtypes(include=['number', 'bool'])
corr_columns = numeric_df.corr().nlargest(10, 'poverty_rate').index

# Correlation matrix of the selected features, computed once and reused below.
top_corr = numeric_df[corr_columns].corr()
n_features = len(corr_columns)

# Hide the upper triangle (diagonal included). The builtin `bool` replaces
# `np.bool`, which was removed in NumPy 1.24.
mask = np.triu(np.ones(top_corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(top_corr,
            annot=True,
            square=True,
            mask=mask,
            cmap=sns.color_palette('twilight', 7),
            annot_kws={'size': 12},
            cbar_kws={'shrink': 0.5},
            linewidth=0.1,
            yticklabels=corr_columns.values,
            xticklabels=corr_columns.values,
            vmin=-1,
            ax=ax)
# Limits follow the number of selected features rather than a hard-coded 10.
# NOTE(review): set_ylim(0, n) appears to place the first row at the bottom
# (seaborn's default is the top); kept to match the previously saved figure.
ax.set_xlim(0, n_features)
ax.set_ylim(0, n_features)
ax.tick_params(labelsize=8)
ax.set_title('Features with the 10 largest correlations to Poverty Rate', fontsize=16)
f.tight_layout()
f.savefig('../visuals/poverty_corr.png', dpi=100);

In [None]:
# Correlate numeric columns only: pandas >= 2.0 no longer drops non-numeric
# columns (e.g. the string fips_code) from DataFrame.corr() on its own.
numeric_df = final_df.select_dtypes(include=['number', 'bool'])
corr_columns = numeric_df.corr().nlargest(10, 'death_rate').index

# Correlation matrix of the selected features, computed once and reused below.
top_corr = numeric_df[corr_columns].corr()
n_features = len(corr_columns)

# Hide the upper triangle (diagonal included). The builtin `bool` replaces
# `np.bool`, which was removed in NumPy 1.24.
mask = np.triu(np.ones(top_corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(top_corr,
            annot=True,
            square=True,
            mask=mask,
            cmap=sns.color_palette('twilight', 7),
            annot_kws={'size': 12},
            cbar_kws={'shrink': 0.5},
            linewidth=0.1,
            yticklabels=corr_columns.values,
            xticklabels=corr_columns.values,
            vmin=-1,
            ax=ax)
# Limits follow the number of selected features rather than a hard-coded 10.
# NOTE(review): set_ylim(0, n) appears to place the first row at the bottom
# (seaborn's default is the top); kept to match the previously saved figure.
ax.set_xlim(0, n_features)
ax.set_ylim(0, n_features)
ax.tick_params(labelsize=8)
ax.set_title('Features with the 10 largest correlations to Death Rate', fontsize=16)
f.tight_layout()
f.savefig('../visuals/death_corr.png', dpi=100);

### Other visualizations

In [None]:
# Staffed ICU beds per county, to visualize hospital preparedness.
# Relies on `counties_json` loaded by the earlier death-rate map cell.
fig = px.choropleth(final_df, geojson=counties_json, locations='fips_code', color='staffed_icu_beds',
                    color_continuous_scale='deep',
                    # Series.max() skips NaN; the builtin max() is not NaN-safe.
                    range_color=(0, final_df['staffed_icu_beds'].max()),
                    scope='usa',
                    labels={'staffed_icu_beds': 'Staffed ICU bed count'},
                    )

fig.update_layout(title_text='Staffed ICU beds per US county as of June 2021')
fig.show()
# A Plotly figure is not a matplotlib figure: plt.savefig() would only write
# matplotlib's current (empty) canvas. Export the Plotly figure itself instead.
# Static image export requires the optional `kaleido` package.
fig.write_image('../visuals/icu_beds.png')