### Necessary packages

In [1]:
# Load the lab_black IPython extension (presumably auto-formats cells with
# Black on execution -- confirm against the extension's docs).
%load_ext lab_black

In [2]:
import requests
import urllib.request
import numpy as np

In [3]:
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [4]:
import altair as alt
import altair_grid as altgrid

# Register the altair_grid theme under the name "grid", then enable it as the
# active Altair theme.
alt.themes.register("grid", altgrid.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [5]:
import os

from datawrapper import Datawrapper

# Read the Datawrapper API token from the environment instead of committing it
# to the notebook. Set DATAWRAPPER_ACCESS_TOKEN before starting the kernel; a
# missing variable raises KeyError here rather than failing later on an API call.
# NOTE(review): the token that used to be hardcoded in this cell is exposed in
# the notebook's history and should be revoked and regenerated.
dw = Datawrapper(access_token=os.environ["DATAWRAPPER_ACCESS_TOKEN"])

In [6]:
# Disabled code: Google Sheets authorization through a service-account key
# file. No active cell in the visible notebook uses `gc`.
# scope = ['https://spreadsheets.google.com/feeds']
# credentials = ServiceAccountCredentials.from_json_keyfile_name('jupyter-integration-349314-25735d35924b.json', scope)
# gc = gspread.authorize(credentials)

In [7]:
# Disabled code: opens a Google Sheets workbook by key through the `gc` client
# (which is itself only created in commented-out code).
# spreadsheet_key = "1sCb1YbQ3-1oiL-cnK0yCkwFcQpvpf0efIngAUeC1ixo"
# book = gc.open_by_key(spreadsheet_key)

In [8]:
# Let pandas render up to 1000 columns and 1000 rows before truncating output.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

### Maternal mortality over time with CDC-limited cause of death codes

In [9]:
# Load the tab-delimited deaths-by-year-and-cause export, keeping both year
# fields as text.
year_cause_cdclimited = pd.read_csv(
    "data/raw/byyear_andcause.txt",
    sep="\t",
    dtype={"Year": str, "Year Code": str},
)

In [10]:
# Normalize the headers: lowercase, with spaces replaced by underscores.
year_cause_cdclimited.columns = [
    header.lower().replace(" ", "_") for header in year_cause_cdclimited.columns
]

### Maternal mortality over time with all obstetric cause death codes

In [11]:
# Load the tab-delimited export covering all obstetric cause-of-death codes,
# keeping both year fields as text.
year_cause_all = pd.read_csv(
    "data/raw/after42days.txt",
    sep="\t",
    dtype={"Year": str, "Year Code": str},
)

In [12]:
# Normalize the headers: lowercase, with spaces replaced by underscores.
year_cause_all.columns = [
    header.lower().replace(" ", "_") for header in year_cause_all.columns
]

#### Percent comparisons: deaths included by the CDC vs. those excluded (42 days or more after pregnancy)

In [13]:
# Keep only rows that carry a year value, and discard the free-text notes column.
yc_all_pct = year_cause_all.loc[year_cause_all["year"].notna()].drop(
    columns=["notes"]
)

In [14]:
# Cause-of-death codes labelled "after 42 days" below; the same list is reused
# further down when flagging the 2020 deaths by race.
post42 = ["O96.0", "O96.1", "O97", "O97.0", "O97.1", "O97.9"]

# Every row is either in `post42` or not, so a single np.where covers both
# labels. This replaces np.select, whose implicit integer default (0) cannot be
# combined with string choices on recent NumPy releases and raises a TypeError.
yc_all_pct["later_death"] = np.where(
    yc_all_pct["cause_of_death_code"].isin(post42),
    "after 42 days",
    "within 42 days",
)

In [15]:
# Total deaths per year within each timing category ("after 42 days" /
# "within 42 days"). The aggregation is given by name: passing the builtin
# `sum` to agg is deprecated in recent pandas, and "sum" keeps the same result.
timing = (
    yc_all_pct.groupby(["year", "later_death"])
    .agg(sum_after42=("deaths", "sum"))
    .reset_index()
)

In [16]:
# One row per year with a column per timing category; years that have no
# "after 42 days" total are dropped.
timing_comparison = pd.pivot(
    timing, index="year", columns="later_death", values="sum_after42"
).pipe(lambda wide: wide[wide["after 42 days"].notna()])

In [17]:
# Share of each year's deaths that fall in the "after 42 days" category:
# after / (after + within).
timing_comparison["pct"] = timing_comparison["after 42 days"].div(
    timing_comparison["after 42 days"].add(timing_comparison["within 42 days"])
)

In [18]:
# Move the "year" index back into a regular column.
# NOTE: this cell rebinds its own input, so re-running it on its own adds a
# stray "index" column; re-run from the pivot cell instead.
timing_comparison = timing_comparison.reset_index(drop=False)

In [19]:
# Add a datetime version of the (string) year column.
timing_comparison = timing_comparison.assign(
    yeardate=pd.to_datetime(timing_comparison["year"])
)

#### Percent of total obstetric deaths from causes 42 days or more after pregnancy, over time

In [20]:
# Line chart of the "after 42 days" share (pct) by year, with year treated as a
# nominal category.
(
    alt.Chart(timing_comparison)
    .mark_line()
    .encode(
        x=alt.X("year:N"),
        y=alt.Y("pct"),
    )
)

### 2020 Maternal mortality broken down by race and cause
##### Includes all obstetric causes, not just those the CDC counts

In [21]:
# Load the tab-delimited 2020 deaths export broken down by race and cause.
mm_byrace = pd.read_csv("data/raw/mm_2020_byrace_bycause.txt", sep="\t")

In [22]:
# Normalize the headers: lowercase, with spaces replaced by underscores.
mm_byrace.columns = [header.lower().replace(" ", "_") for header in mm_byrace.columns]

#### coding for causes included in CDC maternal mortality surveillance

In [23]:
# Flag each row by whether its cause code is in `post42` ("not recorded") or
# not ("recorded"). The two cases are exact complements, so np.where replaces
# np.select, whose implicit integer default (0) cannot be combined with string
# choices on recent NumPy releases and raises a TypeError.
mm_byrace["cdc_maternal_mortality"] = np.where(
    mm_byrace["cause_of_death_code"].isin(post42),
    "not recorded",
    "recorded",
)

#### coding race and hispanic origin combined variable

In [24]:
# Combined race / Hispanic-origin category for each deaths row. The non-Hispanic
# mask is shared by the four race-specific rules.
non_hispanic = mm_byrace["hispanic_origin"] == "Not Hispanic or Latino"

race_ho_conditions = [
    (mm_byrace["race"] == "White") & non_hispanic,
    (mm_byrace["race"] == "Black or African American") & non_hispanic,
    (mm_byrace["race"] == "Asian or Pacific Islander") & non_hispanic,
    (mm_byrace["race"] == "American Indian or Alaska Native") & non_hispanic,
    # Hispanic origin takes precedence over race.
    mm_byrace["hispanic_origin"] == "Hispanic or Latino",
    # Rows with neither race nor origin whose notes column reads "Total".
    mm_byrace["race"].isna()
    & mm_byrace["hispanic_origin"].isna()
    & (mm_byrace["notes"] == "Total"),
]

race_ho_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
    "Overall",
]

# Rows matching none of the rules get the string "0", which later cells filter
# on. The default is spelled out because np.select's implicit integer 0 cannot
# be combined with string choices on recent NumPy releases (TypeError).
mm_byrace["race_whispanicorigin"] = np.select(
    race_ho_conditions, race_ho_values, default="0"
)

In [25]:
# Deaths per race/origin category, split by the recorded / not recorded flag.
# Rows labelled "0" (no race/origin rule matched) and "Total" rows are excluded
# before summing. The aggregation is given by name because passing the builtin
# `sum` to agg is deprecated in recent pandas.
rawdeaths_byrace_cdcrec = (
    mm_byrace[
        (mm_byrace["race_whispanicorigin"] != "0") & (mm_byrace["notes"] != "Total")
    ]
    .groupby(["race_whispanicorigin", "cdc_maternal_mortality"])
    .agg(deaths=("deaths", "sum"))
    .reset_index()
)

In [26]:
# Wide layout: one row per race/origin category, one column per
# recorded / not recorded flag, holding the death counts.
rawdeaths_byrace_cdcrec_wide = pd.pivot(
    rawdeaths_byrace_cdcrec,
    index="race_whispanicorigin",
    columns="cdc_maternal_mortality",
    values="deaths",
)

### 2020 natality data by race (live births for race mortality rates)

In [27]:
# Load the tab-delimited births-by-race export, keeping both year fields as text.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so the
# file must be placed at this path before the cells below can run.
births_byrace = pd.read_csv(
    "data/raw/births_byrace_1620.txt",
    sep="\t",
    dtype={"Year": str, "Year Code": str},
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/births_byrace_1620.txt'

In [None]:
# Keep only the 2020 rows. The explicit copy makes this an independent frame:
# later cells rename its columns and add a new one, which on a bare boolean
# slice triggers SettingWithCopyWarning.
births_byrace_2020 = births_byrace[births_byrace["Year"] == "2020"].copy()

In [None]:
# Normalize the headers: lowercase, spaces to underscores, apostrophes removed.
births_byrace_2020.columns = [
    header.lower().replace(" ", "_").replace("'", "")
    for header in births_byrace_2020.columns
]

In [None]:
# Replace all eight headers positionally with short names, so the race and
# Hispanic-origin columns use the same names as the mortality table.
births_byrace_2020.columns = pd.Index(
    [
        "notes",
        "year",
        "year_code",
        "race",
        "mothers_single_race_6_code",
        "hispanic_origin",
        "mothers_hispanic_origin_code",
        "births",
    ]
)

#### Processing race categories so they match the maternal mortality categories

In [None]:
# Race values in the births data that are folded into the single "AAPI"
# category used on the mortality side.
aapi = ["Native Hawaiian or Other Pacific Islander", "Asian"]

# Masks shared by the rules below: skip "Total" rows everywhere, and require
# non-Hispanic origin for the race-specific categories.
not_total = births_byrace_2020["notes"] != "Total"
non_hispanic = births_byrace_2020["hispanic_origin"] == "Not Hispanic or Latino"

birth_conditions = [
    (births_byrace_2020["race"] == "White") & non_hispanic & not_total,
    (births_byrace_2020["race"] == "Black or African American")
    & non_hispanic
    & not_total,
    births_byrace_2020["race"].isin(aapi) & non_hispanic & not_total,
    (births_byrace_2020["race"] == "American Indian or Alaska Native")
    & non_hispanic
    & not_total,
    (births_byrace_2020["hispanic_origin"] == "Hispanic or Latino") & not_total,
]

birth_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
]

# Rows matching none of the rules get the string "0". The default is spelled
# out because np.select's implicit integer 0 cannot be combined with string
# choices on recent NumPy releases (TypeError).
births_byrace_2020["race_whispanicorigin"] = np.select(
    birth_conditions, birth_values, default="0"
)

In [None]:
# Total 2020 births per race/origin category (indexed by the category). The
# aggregation is given by name because passing the builtin `sum` to agg is
# deprecated in recent pandas.
births_only = births_byrace_2020.groupby("race_whispanicorigin").agg(
    births_2020=("births", "sum")
)

#### Merging births with 2020 CDC MM raw numbers to calculate rates per 100k live births by race
#### Comparison: deaths recorded vs. ignored by the CDC metric

In [None]:
# Join births to the wide death counts per race/origin category and derive
# deaths per 100,000 births for the recorded and not-recorded groups.
mm_byrace_cdcrec_wide = (
    births_only.merge(rawdeaths_byrace_cdcrec_wide, on="race_whispanicorigin")
    .reset_index()
    .assign(
        rate_recorded=lambda merged: 100000
        * (merged["recorded"] / merged["births_2020"]),
        rate_unrecorded=lambda merged: 100000
        * (merged["not recorded"] / merged["births_2020"]),
    )
)

In [None]:
# Long-format equivalent: one row per race/origin category and recorded flag,
# with deaths per 100,000 births in "rate".
mm_byrace_cdcrec = (
    births_only.merge(rawdeaths_byrace_cdcrec, on="race_whispanicorigin")
    .reset_index()
    .assign(rate=lambda merged: 100000 * (merged["deaths"] / merged["births_2020"]))
)

In [None]:
# Bar chart of the rate per recorded flag, faceted into one column per
# race/origin category.
(
    alt.Chart(mm_byrace_cdcrec)
    .mark_bar()
    .encode(
        x=alt.X("cdc_maternal_mortality:N"),
        y=alt.Y("rate"),
        color=alt.Color("cdc_maternal_mortality:N"),
        column=alt.Column("race_whispanicorigin:N"),
    )
    .properties(width=150, height=300)
)

### Listing specific causes of death by race (2020)

In [None]:
# Deaths per race/origin category and named cause of death. The aggregation is
# given by name because passing the builtin `sum` to agg is deprecated in
# recent pandas.
cause_byrace = (
    mm_byrace.groupby(["race_whispanicorigin", "cause_of_death"])
    .agg(deaths=("deaths", "sum"))
    .reset_index()
)

In [None]:
# Wide layout: one row per cause of death, one column per race/origin category.
cause_byrace_wide = pd.pivot(
    cause_byrace,
    index="cause_of_death",
    columns="race_whispanicorigin",
    values="deaths",
).reset_index()

In [None]:
# Rename the category columns by label rather than by position. A positional
# list raises a length-mismatch error (or silently mislabels columns) whenever
# the pivot yields a different set or order of categories; a mapping only
# touches the columns it names. "0" is the label for rows that matched no
# race/origin rule.
cause_byrace_wide = cause_byrace_wide.rename(
    columns={
        "0": "missing",
        "AAPI": "aapi",
        "American Indian or Alaska Native": "american_indian",
        "Black": "black",
        "Hispanic or Latino": "hisp_latino",
        "White": "white",
    }
).rename_axis(columns=None)  # drop the leftover "race_whispanicorigin" axis name

### International maternal mortality data

In [28]:
# Load the international maternal mortality CSV, keeping "Period" as text.
international_mm = pd.read_csv(
    "data/raw/mm_intl.csv",
    dtype=dict(Period=str),
)

In [29]:
# Normalize the headers: lowercase, with spaces replaced by underscores.
international_mm.columns = [
    header.lower().replace(" ", "_") for header in international_mm.columns
]

In [30]:
# 2017 rows of the maternal-mortality-ratio indicator, reduced to the country
# code, country name, period and numeric value.
intl_mm_rates = international_mm.loc[
    international_mm["indicator"].eq(
        "Maternal mortality ratio (per 100 000 live births)"
    )
    & international_mm["period"].eq("2017"),
    ["spatialdimvaluecode", "location", "period", "factvaluenumeric"],
]

In [31]:
# Countries whose 2017 value lies strictly between 15 and 20.
close_to_us = intl_mm_rates.loc[
    intl_mm_rates["factvaluenumeric"].gt(15)
    & intl_mm_rates["factvaluenumeric"].lt(20)
]

In [32]:
# Country codes of the selected comparison countries.
countrycodes = close_to_us.loc[:, "spatialdimvaluecode"]

In [33]:
# All rows (every indicator and period) for the selected country codes.
overtime = international_mm.loc[
    international_mm["spatialdimvaluecode"].isin(countrycodes)
]

In [34]:
# Keep only the maternal-mortality-ratio indicator for those countries.
rates_overtime = overtime.loc[
    overtime["indicator"].eq("Maternal mortality ratio (per 100 000 live births)")
]

In [35]:
# Wide layout: one row per country, one column per period.
rates_overtime = pd.pivot(
    rates_overtime.loc[:, ["location", "period", "factvaluenumeric"]],
    index="location",
    columns="period",
    values="factvaluenumeric",
)

In [36]:
# Country names (as spelled in the "location" column) kept for the
# over-time comparison chart; a later cell filters on this list.
target_countries = [
    "United States of America",
    "Russian Federation",
    "Iran (Islamic Republic of)",
    "Saudi Arabia",
    "Uruguay",
]

In [36]:
# NOTE(review): not referenced by any cell in the visible notebook, and only
# two countries are listed -- confirm whether this list is still needed.
g7_countries = [
    "United States of America",
    "France",
]

In [37]:
# Maternal-mortality-ratio rows for the hand-picked target countries, kept in
# long format for charting.
intl_rates_overtime_long = overtime.loc[
    overtime["indicator"].eq("Maternal mortality ratio (per 100 000 live births)")
    & overtime["location"].isin(target_countries)
]

In [38]:
# Keep just the three columns the chart needs.
intl_rates_overtime_long = intl_rates_overtime_long.loc[
    :, ["location", "period", "factvaluenumeric"]
]

In [39]:
# Line chart (with point markers) of the indicator value per period, one
# colored line per country.
(
    alt.Chart(intl_rates_overtime_long)
    .mark_line(point=True)
    .encode(
        x=alt.X("period:O", timeUnit="year"),
        y="factvaluenumeric",
        color="location:N",
    )
)