# Analyzing CDC deaths in California re: Covid-19

### Import Python tools and Jupyter configuration

In [50]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [52]:
import altair as alt
import altair_latimes as lat

In [53]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('latimes')

In [54]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

### Load deaths data from past years and 2020

In [55]:
# metadata: https://data.cdc.gov/NCHS/Weekly-Counts-of-Deaths-by-State-and-Select-Causes/muzy-jte6

In [56]:
url_current = "https://data.cdc.gov/api/views/muzy-jte6/rows.csv?accessType=DOWNLOAD"

In [57]:
# #metadata: https://data.cdc.gov/NCHS/Weekly-Counts-of-Deaths-by-State-and-Select-Causes/3yf8-kanr

In [58]:
url_history = "https://data.cdc.gov/api/views/3yf8-kanr/rows.csv?accessType=DOWNLOAD"

### Read the current data (2020-21) and the historical data (2014-2019)

In [59]:
df_current = pd.read_csv(
    url_current,
    parse_dates=["Week Ending Date"],
    dtype={"MMWR Year": str, "MMWR Week": str},
)

In [60]:
df_history = pd.read_csv(
    url_history,
    parse_dates=["Week Ending Date"],
    dtype={"MMWR Year": str},
)

### Clean up slight differences in two dataframes' column headers

In [61]:
df_history.rename(columns={"All  Cause": "All Cause"}, inplace=True)

### Concatenate them into one frame

In [62]:
df = pd.concat(
    [
        df_history[
            [
                "Jurisdiction of Occurrence",
                "MMWR Year",
                "MMWR Week",
                "Week Ending Date",
                "All Cause",
                "Natural Cause",
            ]
        ],
        df_current[
            [
                "Jurisdiction of Occurrence",
                "MMWR Year",
                "MMWR Week",
                "Week Ending Date",
                "All Cause",
                "Natural Cause",
            ]
        ],
    ]
)

### Strip out junk from column headers

In [63]:
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
    .str.replace("*", "", regex=False)
    .str.replace("(s)", "s", regex=False)
)

### Make a copy of the dataframe, excluding this year

In [225]:
deaths_past = df[
    (df["week_ending_date"] < "2021/01/01")
    & (df["jurisdiction_of_occurrence"] != "United States")
].copy()

---

### Get population estimates y state from U.S. Census Bureau

In [303]:
url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
pop_src = pd.read_csv(
    url,
    dtype={
        "STATE": "str",
    },
).drop(range(0, 5))

### Slim down the frame and clean up headers

In [304]:
pop_df = pop_src[
    [
        "STATE",
        "NAME",
        "POPESTIMATE2014",
        "POPESTIMATE2015",
        "POPESTIMATE2016",
        "POPESTIMATE2017",
        "POPESTIMATE2018",
        "POPESTIMATE2019",
    ]
].copy()

In [305]:
pop_df.columns = (
    pop_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("popestimate", "pop", regex=False)
)

In [306]:
pop_df["pop2020"] = pop_df["pop2019"]

### Population percentage change

In [307]:
pop_df["pop14-19chg"] = (
    ((pop_df["pop2019"] - pop_df["pop2014"]) / pop_df["pop2014"]) * 100
).round(3)

### What's the average of the population estimates from 2015-2019?

In [308]:
col_list = list(pop_df)
col_list.remove("pop14-19chg")

In [309]:
pop_df["pop2014_19_avg"] = pop_df[col_list].mean(axis=1).round()

In [310]:
pop_df.sort_values("pop2014_19_avg", ascending=False).head()

Unnamed: 0,state,name,pop2014,pop2015,pop2016,pop2017,pop2018,pop2019,pop2020,pop14-19chg,pop2014_19_avg
9,6,California,38596972,38918045,39167117,39358497,39461588,39512223,39512223,2.371,39218095.0
48,48,Texas,26964333,27470056,27914410,28295273,28628666,28995881,28995881,7.534,28180643.0
14,12,Florida,19845911,20209042,20613477,20963613,21244317,21477737,21477737,8.222,20833119.0
37,36,New York,19651049,19654666,19633428,19589572,19530351,19453561,19453561,-1.005,19566598.0
43,42,Pennsylvania,12788313,12784826,12782275,12787641,12800922,12801989,12801989,0.107,12792565.0


In [311]:
pop_df_slim = pop_df[["state", "name", "pop2019", "pop14-19chg", "pop2014_19_avg"]]

---

### Merge population with deaths dataframe

In [312]:
deaths_year = (
    deaths_past.groupby(
        [
            "jurisdiction_of_occurrence",
            "mmwr_year",
        ]
    )
    .agg({"all_cause": "sum"})
    .reset_index()
)

In [313]:
deaths_year.head(7)

Unnamed: 0,jurisdiction_of_occurrence,mmwr_year,all_cause
0,Alabama,2014,50229.0
1,Alabama,2015,50661.0
2,Alabama,2016,51130.0
3,Alabama,2017,52132.0
4,Alabama,2018,53146.0
5,Alabama,2019,53057.0
6,Alabama,2020,62480.0


In [314]:
deaths_pivot = pd.pivot_table(
    deaths_past,
    values="all_cause",
    index="jurisdiction_of_occurrence",
    columns="mmwr_year",
    aggfunc="sum",
).reset_index()

In [315]:
deaths_pivot.head()

mmwr_year,jurisdiction_of_occurrence,2014,2015,2016,2017,2018,2019,2020
0,Alabama,50229.0,50661.0,51130.0,52132.0,53146.0,53057.0,62480.0
1,Alaska,4081.0,4170.0,4305.0,4255.0,4289.0,4503.0,4950.0
2,Arizona,52761.0,54382.0,56583.0,57885.0,59495.0,60450.0,75921.0
3,Arkansas,30508.0,30830.0,30847.0,31707.0,31744.0,32183.0,37390.0
4,California,250552.0,258512.0,260595.0,267106.0,268145.0,268775.0,314966.0


### Merge deaths and population

In [316]:
deaths_merge = pd.merge(
    deaths_pivot, pop_df, left_on="jurisdiction_of_occurrence", right_on="name"
)

### Calculate crude death rates per 100,000 for each year

In [334]:
years = range(2014, 2021)

In [335]:
for y in years:
    deaths_merge[f"death_rate_{y}"] = (
        deaths_merge[f"{y}"] / deaths_merge[f"pop{y}"]
    ) * 100000

### Slim the dataframe

In [336]:
deaths_merge_slim = deaths_merge[
    [
        "state",
        "name",
        "death_rate_2014",
        "death_rate_2015",
        "death_rate_2016",
        "death_rate_2017",
        "death_rate_2018",
        "death_rate_2019",
        "death_rate_2020",
    ]
].copy()

### Death rate increase, by state, from 2019-2020

In [338]:
deaths_merge_slim["pct_increase"] = (
    deaths_merge_slim["death_rate_2020"] - deaths_merge_slim["death_rate_2019"]
) / deaths_merge_slim["death_rate_2019"]

In [341]:
deaths_merge_slim.sort_values("pct_increase", ascending=False).head()

Unnamed: 0,state,name,death_rate_2014,death_rate_2015,death_rate_2016,death_rate_2017,death_rate_2018,death_rate_2019,death_rate_2020,pct_increase
30,34,New Jersey,804.013751,798.459711,806.576433,828.189668,838.51891,831.630488,1064.275815,0.279746
2,4,Arizona,783.919204,796.26032,815.191083,821.762269,831.165137,830.503508,1043.054703,0.255931
8,11,District of Columbia,840.671087,836.097128,871.372017,880.406846,873.355598,844.776259,1043.713842,0.235491
44,48,Texas,702.935986,695.750311,691.882795,705.506535,711.807529,707.365988,864.188262,0.221699
18,22,Louisiana,966.965424,937.266594,943.388765,979.925319,987.962719,988.665017,1207.194812,0.221035


In [342]:
deaths_merge_slim.sort_values("pct_increase", ascending=False).tail()

Unnamed: 0,state,name,death_rate_2014,death_rate_2015,death_rate_2016,death_rate_2017,death_rate_2018,death_rate_2019,death_rate_2020,pct_increase
37,41,Oregon,875.444459,886.226179,872.450107,882.48816,862.003412,883.55438,946.407991,0.071137
39,72,Puerto Rico,867.187911,805.445763,855.937995,922.507117,904.033815,917.87128,972.197086,0.059187
19,23,Maine,1022.613082,1072.529365,1051.214699,1082.112254,1083.374345,1102.430271,1153.240709,0.046089
11,15,Hawaii,776.86142,779.577681,763.82132,799.287837,804.382395,823.379515,846.121683,0.027621
33,37,North Carolina,883.730984,892.625198,894.317045,911.948531,909.86807,918.21347,930.036411,0.012876


### Melt the dataframe

In [348]:
deaths_rates_melt = pd.melt(
    deaths_merge_slim,
    id_vars=["state", "name"],
    value_vars=[
        "death_rate_2014",
        "death_rate_2015",
        "death_rate_2016",
        "death_rate_2017",
        "death_rate_2018",
        "death_rate_2019",
        "death_rate_2020",
    ],
    var_name="year",
    value_name="death_rate",
)

In [356]:
deaths_rates_melt["year"] = deaths_rates_melt["year"].str.replace(
    "death_rate_", "", regex=False
)

In [422]:
viz = (
    alt.Chart(deaths_rates_melt)
    .mark_bar(width=13)
    .encode(
        x=alt.X(
            "year:T",
            title=" ",
            axis=alt.Axis(
                grid=False,
                tickSize=0,
                domainOpacity=0,
                tickCount=2,
                offset=4,
                gridWidth=0.6,
            ),
        ),
        y=alt.Y(
            "death_rate",
            title=" ",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=3,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
        ),
        color=alt.condition(
            alt.datum.year == 2020,
            alt.value("#3580b1"),
            alt.value("#82c6df"),
        ),
        facet=alt.Facet("name", columns=13, title=" "),
    )
    .properties(width=100, height=100, title="Deaths per 100,000 residents: 2014-2020")
)
viz.configure_view(strokeOpacity=0)

In [424]:
viz.configure_view(strokeOpacity=0).save(
    "visualization_state_bars_timeseries_multiples.png"
)