### Necessary packages

In [1]:
%load_ext lab_black

In [2]:
import requests
import urllib.request
import numpy as np

In [3]:
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [4]:
import altair as alt
import altair_grid as altgrid

alt.themes.register("grid", altgrid.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [5]:
from datawrapper import Datawrapper

dw = Datawrapper(
    access_token="FtIwtvFtoGLaRT9a3gjX69PLu4wSuRyKddoOz6SOPw3k9wWyNICMHTkcPhOGCR5Z"
)

In [6]:
# scope = ['https://spreadsheets.google.com/feeds']
# credentials = ServiceAccountCredentials.from_json_keyfile_name('jupyter-integration-349314-25735d35924b.json', scope)
# gc = gspread.authorize(credentials)

In [7]:
# spreadsheet_key = "1sCb1YbQ3-1oiL-cnK0yCkwFcQpvpf0efIngAUeC1ixo"
# book = gc.open_by_key(spreadsheet_key)

In [8]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

### See if you can query CDC using this package

In [9]:
%pip install cdcwonderpy

Note: you may need to restart the kernel to use updated packages.


### Maternal mortality over time with CDC-limited cause of death codes

In [10]:
year_cause_cdclimited = pd.read_table(
    "data/raw/byyear_andcause.txt", dtype={"Year": str, "Year Code": str}
)

In [11]:
year_cause_cdclimited.columns = year_cause_cdclimited.columns.str.lower().str.replace(
    " ", "_"
)

### Maternal mortality over time with all obstetric cause death codes

In [12]:
year_cause_all = pd.read_table(
    "data/raw/after42days.txt", dtype={"Year": str, "Year Code": str}
)

In [13]:
year_cause_all.columns = year_cause_all.columns.str.lower().str.replace(" ", "_")

#### Percent comparisons: deaths included by cdc and those excluded (42 days or more post pregnancy)

In [14]:
yc_all_pct = year_cause_all.dropna(subset=["year"]).drop(columns="notes")

In [15]:
yc_all_pct.value_counts("cause_of_death_code")
post42 = ["O96.0", "O96.1", "O97", "O97.0", "O97.1", "O97.9"]
conditions = [
    (yc_all_pct["cause_of_death_code"].isin(post42)),
    (~yc_all_pct["cause_of_death_code"].isin(post42)),
]

values = ["after 42 days", "within 42 days"]

yc_all_pct["later_death"] = np.select(conditions, values)

yc_all_pct.head()

Unnamed: 0,year,year_code,cause_of_death,cause_of_death_code,deaths,population,crude_rate,later_death
0,1999,1999,Tubal pregnancy,O00.1,5.0,279040168.0,Unreliable,within 42 days
1,1999,1999,"Ectopic pregnancy, unspecified",O00.9,14.0,279040168.0,Unreliable,within 42 days
2,1999,1999,"Hydatidiform mole, unspecified",O01.9,1.0,279040168.0,Unreliable,within 42 days
3,1999,1999,Blighted ovum and nonhydatidiform mole,O02.0,1.0,279040168.0,Unreliable,within 42 days
4,1999,1999,"Spontaneous abortion, complete or unspecified,...",O03.5,1.0,279040168.0,Unreliable,within 42 days


In [16]:
timing = (
    yc_all_pct.groupby(["year", "later_death"])
    .agg(sum_after42=pd.NamedAgg(column="deaths", aggfunc=sum))
    .reset_index()
)

In [17]:
timing_comparison = timing.pivot(
    index="year", columns="later_death", values="sum_after42"
).dropna(subset=["after 42 days"])

In [18]:
timing_comparison["pct"] = (timing_comparison["after 42 days"]) / (
    timing_comparison["after 42 days"] + timing_comparison["within 42 days"]
)

In [19]:
timing_comparison = timing_comparison.reset_index()

In [20]:
timing_comparison["yeardate"] = pd.to_datetime(timing_comparison["year"])

#### Percent of total obstetric deaths from causes 42 days or more after over time
#### is this right? how did it soar in 2010? it must be a difference in how they counted them? redo this analysis

In [21]:
alt.Chart(timing_comparison).mark_line().encode(x=alt.X("year:N"), y=(alt.Y("pct")))

### Maternal mortality broken down by race and cause

In [22]:
mm_byrace = pd.read_table("data/raw/mm_2020_byrace_bycause.txt")

In [23]:
mm_byrace.columns = mm_byrace.columns.str.lower().str.replace(" ", "_")

In [24]:
mm_byrace.head()

Unnamed: 0,notes,race,race_code,hispanic_origin,hispanic_origin_code,cause_of_death,cause_of_death_code,deaths,population,crude_rate
0,,American Indian or Alaska Native,1002-5,Hispanic or Latino,2135-2,Other specified pregnancy-related conditions,O26.8,2.0,2133093,Unreliable
1,Total,American Indian or Alaska Native,1002-5,Hispanic or Latino,2135-2,,,2.0,2133093,Unreliable
2,,American Indian or Alaska Native,1002-5,Not Hispanic or Latino,2186-2,"Liver disorders in pregnancy, childbirth and t...",O26.6,1.0,2766392,Unreliable
3,,American Indian or Alaska Native,1002-5,Not Hispanic or Latino,2186-2,Other specified pregnancy-related conditions,O26.8,5.0,2766392,Unreliable
4,,American Indian or Alaska Native,1002-5,Not Hispanic or Latino,2186-2,Maternal care for intrauterine death,O36.4,1.0,2766392,Unreliable


### 2020 natality data by race (live births for race mortality rates)

In [25]:
births_byrace = pd.read_table(
    "data/raw/natality_2020_race.txt", dtype={"Year": str, "Year Code": str}
)

In [26]:
births_byrace_2020 = births_byrace[births_byrace["Year"] == "2020"]

In [27]:
births_byrace_2020.columns = (
    births_byrace_2020.columns.str.lower().str.replace(" ", "_").str.replace("'", "")
)

In [28]:
births_byrace_2020.columns = [
    "notes",
    "year",
    "year_code",
    "race",
    "mothers_single_race_6_code",
    "hispanic_origin",
    "mothers_hispanic_origin_code",
    "births",
]

In [29]:
aapi = ["Native Hawaiian or Other Pacific Islander", "Asian"]

birth_conditions = [
    (
        (births_byrace_2020["race"] == "White")
        & (births_byrace_2020["hispanic_origin"] == "Not Hispanic or Latino")
        & (births_byrace_2020["notes"] != "Total")
    ),
    (
        (births_byrace_2020["race"] == "Black or African American")
        & (births_byrace_2020["hispanic_origin"] == "Not Hispanic or Latino")
        & (births_byrace_2020["notes"] != "Total")
    ),
    (
        (births_byrace_2020["race"].isin(aapi))
        & (births_byrace_2020["hispanic_origin"] == "Not Hispanic or Latino")
        & (births_byrace_2020["notes"] != "Total")
    ),
    (
        (births_byrace_2020["race"] == "American Indian or Alaska Native")
        & (births_byrace_2020["hispanic_origin"] == "Not Hispanic or Latino")
        & (births_byrace_2020["notes"] != "Total")
    ),
    (
        (births_byrace_2020["hispanic_origin"] == "Hispanic or Latino")
        & (births_byrace_2020["notes"] != "Total")
    ),
]

birth_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
]

births_byrace_2020["race_whispanicorigin"] = np.select(birth_conditions, birth_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  births_byrace_2020["race_whispanicorigin"] = np.select(birth_conditions, birth_values)


In [30]:
births_only = births_byrace_2020.groupby("race_whispanicorigin").agg(
    births_2020=pd.NamedAgg(column="births", aggfunc=sum)
)

In [31]:
notcounted = ["O96.0", "O96.1", "O96.9", "O97.0", "O97.1"]

In [32]:
conditions = [
    (mm_byrace["cause_of_death_code"].isin(notcounted)),
    (~mm_byrace["cause_of_death_code"].isin(notcounted)),
]

values = ["not recorded", "recorded"]

mm_byrace["cdc_maternal_mortality"] = np.select(conditions, values)

In [33]:
race_ho_conditions = [
    (
        (mm_byrace["race"] == "White")
        & (mm_byrace["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (mm_byrace["race"] == "Black or African American")
        & (mm_byrace["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (mm_byrace["race"] == "Asian or Pacific Islander")
        & (mm_byrace["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (mm_byrace["race"] == "American Indian or Alaska Native")
        & (mm_byrace["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (mm_byrace["hispanic_origin"] == "Hispanic or Latino"),
    (mm_byrace["race"].isna())
    & (mm_byrace["hispanic_origin"].isna())
    & (mm_byrace["notes"] == "Total"),
]

race_ho_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
    "Overall",
]

mm_byrace["race_whispanicorigin"] = np.select(race_ho_conditions, race_ho_values)

In [34]:
recorded_by_cdc = (
    mm_byrace[
        (mm_byrace["race_whispanicorigin"] != "0") & (mm_byrace["notes"] != "Total")
    ]
    .groupby(["race_whispanicorigin", "cdc_maternal_mortality"])
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc=sum))
    .reset_index()
)

In [35]:
mm_byrace[mm_byrace["race_whispanicorigin"] == "Overall"]

Unnamed: 0,notes,race,race_code,hispanic_origin,hispanic_origin_code,cause_of_death,cause_of_death_code,deaths,population,crude_rate,cdc_maternal_mortality,race_whispanicorigin
211,Total,,,,,,,1288.0,329484123,0.4,recorded,Overall


In [36]:
cdc_rec = recorded_by_cdc.pivot(
    index="race_whispanicorigin", columns="cdc_maternal_mortality", values="deaths"
)

In [37]:
race_deaths_summary = (
    mm_byrace.groupby(["race", "hispanic_origin", "cause_of_death"])
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc=sum))
    .reset_index()
)

In [38]:
race_deaths_summary["race_hisporigin"] = (
    race_deaths_summary["race"] + race_deaths_summary["hispanic_origin"]
)

In [39]:
deathcause_byrace = race_deaths_summary.pivot(
    index="cause_of_death", columns="race_hisporigin", values="deaths"
).reset_index()

In [40]:
cleaned_race_deathsummary = deathcause_byrace[
    [
        "cause_of_death",
        "American Indian or Alaska NativeNot Hispanic or Latino",
        "Asian or Pacific IslanderNot Hispanic or Latino",
        "Black or African AmericanNot Hispanic or Latino",
        "WhiteHispanic or Latino",
        "WhiteNot Hispanic or Latino",
    ]
].reset_index()

In [41]:
cleaned_race_deathsummary.columns = [
    "index",
    "cause_of_death",
    "american_indian",
    "aapi",
    "black",
    "hisp_latino",
    "white",
]

In [42]:
cleaned_race_deathsummary.fillna(0).sort_values("white", ascending=False)

Unnamed: 0,index,cause_of_death,american_indian,aapi,black,hisp_latino,white
46,46,Other specified pregnancy-related conditions,5.0,7.0,83.0,41.0,93.0
6,6,Death from direct obstetric cause occurring mo...,6.0,13.0,78.0,41.0,88.0
7,7,Death from indirect obstetric cause occurring ...,2.0,7.0,45.0,36.0,77.0
45,45,Other specified diseases and conditions compli...,2.0,3.0,28.0,11.0,49.0
15,15,Diseases of the circulatory system complicatin...,3.0,2.0,32.0,9.0,45.0
30,30,"Liver disorders in pregnancy, childbirth and t...",1.0,3.0,6.0,6.0,17.0
4,4,Cardiomyopathy in the puerperium,0.0,1.0,13.0,2.0,14.0
37,37,Obstetric death of unspecified cause,0.0,1.0,6.0,1.0,10.0
54,54,Pre-existing hypertensive heart disease compli...,0.0,1.0,13.0,9.0,8.0
50,50,"Pre-eclampsia, unspecified",0.0,1.0,8.0,4.0,8.0


In [43]:
mm_cdc_rec = births_only.merge(cdc_rec, on="race_whispanicorigin").reset_index()
mm_cdc_rec["rate_recorded"] = 100000 * (
    mm_cdc_rec["recorded"] / mm_cdc_rec["births_2020"]
)
mm_cdc_rec["rate_unrecorded"] = 100000 * (
    mm_cdc_rec["not recorded"] / mm_cdc_rec["births_2020"]
)

In [44]:
mm_cdc_rec

Unnamed: 0,race_whispanicorigin,births_2020,not recorded,recorded,rate_recorded,rate_unrecorded
0,AAPI,228694.0,22.0,36.0,15.741559,9.619841
1,American Indian or Alaska Native,26813.0,9.0,13.0,48.483944,33.565808
2,Black,529811.0,135.0,296.0,55.86898,25.480785
3,Hispanic or Latino,866713.0,80.0,158.0,18.229795,9.230276
4,White,1843432.0,180.0,358.0,19.420299,9.764396


In [45]:
# dw.add_data(chart_id="N60BA", data=mm_cdc_rec)

In [46]:
cdc_causes = pd.read_table("data/raw/mm_2020_byrace_bycause_cdcinclusion.txt")
cdc_causes.columns = cdc_causes.columns.str.lower().str.replace(" ", "_")

In [47]:
conditions = [
    (
        (cdc_causes["race"] == "White")
        & (cdc_causes["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (cdc_causes["race"] == "Black or African American")
        & (cdc_causes["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (cdc_causes["race"] == "Asian or Pacific Islander")
        & (cdc_causes["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (
        (cdc_causes["race"] == "American Indian or Alaska Native")
        & (cdc_causes["hispanic_origin"] == "Not Hispanic or Latino")
    ),
    (cdc_causes["hispanic_origin"] == "Hispanic or Latino"),
]

values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
]

cdc_causes["race_whispanicorigin"] = np.select(conditions, values)

In [48]:
births_byrace_2020[births_byrace_2020["notes"] == "Total"]

Unnamed: 0,notes,year,year_code,race,mothers_single_race_6_code,hispanic_origin,mothers_hispanic_origin_code,births,race_whispanicorigin
3,Total,2020,2020,American Indian or Alaska Native,1002-5,,,35173.0,0
7,Total,2020,2020,Asian,A,,,231789.0,0
11,Total,2020,2020,Black or African American,2054-5,,,584979.0,0
15,Total,2020,2020,Native Hawaiian or Other Pacific Islander,NHOPI,,,12782.0,0
19,Total,2020,2020,White,2106-3,,,2647430.0,0
23,Total,2020,2020,More than one race,M,,,101494.0,0
24,Total,2020,2020,,,,,3613647.0,0


### International comparison data

In [49]:
international_mm = pd.read_csv("data/raw/mm_intl.csv", dtype={"Period": str})

In [50]:
international_mm.columns = international_mm.columns.str.lower().str.replace(" ", "_")

In [51]:
intl_mm_rates = international_mm[
    (
        international_mm["indicator"]
        == "Maternal mortality ratio (per 100 000 live births)"
    )
    & (international_mm["period"] == "2017")
]
intl_mm_rates = intl_mm_rates[
    ["spatialdimvaluecode", "location", "period", "factvaluenumeric"]
]

In [52]:
close_to_us = intl_mm_rates[
    (intl_mm_rates["factvaluenumeric"] > 15) & (intl_mm_rates["factvaluenumeric"] < 20)
]

In [53]:
countrycodes = close_to_us["spatialdimvaluecode"]

In [54]:
overtime = international_mm[international_mm["spatialdimvaluecode"].isin(countrycodes)]

In [55]:
rates_overtime = overtime[
    overtime["indicator"] == "Maternal mortality ratio (per 100 000 live births)"
]

In [56]:
rates_overtime = rates_overtime[["location", "period", "factvaluenumeric"]].pivot(
    index="location", columns="period", values="factvaluenumeric"
)

In [57]:
target_countries = [
    "United States of America",
    "Russian Federation",
    "Iran (Islamic Republic of)",
    "Saudi Arabia",
    "Uruguay",
]

In [58]:
intl_rates_overtime_long = overtime[
    (overtime["indicator"] == "Maternal mortality ratio (per 100 000 live births)")
    & (overtime["location"].isin(target_countries))
]

In [59]:
intl_rates_overtime_long = intl_rates_overtime_long[
    ["location", "period", "factvaluenumeric"]
]

In [60]:
alt.Chart(intl_rates_overtime_long).mark_line(point=True).encode(
    x=alt.X("period:O", timeUnit="year"),
    y=alt.Y("factvaluenumeric"),
    color=alt.Color("location:N"),
)

In [61]:
hl_one = mm_byrace[mm_byrace["race_whispanicorigin"] == "Hispanic or Latino"]

In [64]:
cleaned_race_deathsummary.to_csv("data/processed/test.csv")

In [65]:
cdc_rec

cdc_maternal_mortality,not recorded,recorded
race_whispanicorigin,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPI,22.0,36.0
American Indian or Alaska Native,9.0,13.0
Black,135.0,296.0
Hispanic or Latino,80.0,158.0
White,180.0,358.0
