# Transform CDC lifetables to 5y age groups for Markov model calibration

In [12]:
import os

import numpy as np 
import pandas as pd

# Repository root. Set the LYNCH_SYNDROME_DIR environment variable to run this
# notebook from another checkout; without it the original local path is used.
PARENT_DIR = os.environ.get(
    "LYNCH_SYNDROME_DIR", "/Users/sophiewagner/repos/lynch-syndrome"
)
# All input workbooks and generated tables live under <root>/data.
DATA_DIR = os.path.join(PARENT_DIR, "data")

In [13]:
# Life-table workbook; the male and female tables sit on separate sheets.
FILE_PATH = os.path.join(DATA_DIR, "cdc_lifetable_2022.xlsx")
male, female = (
    pd.read_excel(FILE_PATH, sheet_name=sheet)
    for sheet in ("Table 2 Male", "Table 3 Female")
)

In [14]:
# Preview the top of the raw male sheet (header rows plus the first ages).
male.head(n=5)

Unnamed: 0,"Table 2. Life table for males: United States, 2022",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Age (years),Probability of dying between ages x and x + 1,Number surviving to age x,Number dying between ages x and x + 1,Person-years lived between ages x and x + 1,Total number of person-years lived above age x,Expectation of life at age x
1,,qx,lx,dx,Lx,Tx,ex
2,0–1,0.006067,100000,606.708008,99474.585938,7480832,74.808319
3,1–2,0.000489,99393.289062,48.606049,99368.984375,7381357.5,74.264145
4,2–3,0.000323,99344.679688,32.062302,99328.648438,7281988.5,73.30024


In [15]:
# Inspect the bottom of the raw male sheet (oldest ages and the trailing footnote row).
male.tail(5)

Unnamed: 0,"Table 2. Life table for males: United States, 2022",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
99,97–98,0.317536,2629.987549,835.115417,2212.429688,6114.749512,2.325011
100,98–99,0.340553,1794.87207,611.249817,1489.247192,3902.319824,2.174149
101,99–100,0.363502,1183.622314,430.24942,968.49762,2413.072754,2.038718
102,100 and over,1.0,753.372925,753.372925,1444.575073,1444.575073,1.917477
103,"SOURCE: National Center for Health Statistics,...",,,,,,


In [16]:
def transform_to_5y_age_groups(df):
    """Derive yearly and monthly death probabilities from a raw life-table sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet as read by ``pd.read_excel``: two header rows on top, one row per
        single year of age (labels such as "0–1" ... "100 and over"), and one
        trailing footnote row. The first four columns are used, in the order
        age, qx, lx, dx. The input frame is not modified.

    Returns
    -------
    tuple of pandas.Series
        ``(prob_death_1y_y, prob_death_1y_m, prob_death_5y_y, prob_death_5y_m)``

        * ``prob_death_1y_y`` - qx for each single year of age (yearly).
        * ``prob_death_1y_m`` - the same probability converted to a monthly one.
        * ``prob_death_5y_y`` - yearly probability of death for each 5-year age
          group, indexed by the group's starting age.
        * ``prob_death_5y_m`` - monthly probability of death for each 5-year
          age group, indexed by the group's starting age.

    Notes
    -----
    ``deaths / survivors`` over a group is the probability of dying at some
    point during the *whole* group (e.g. five years), not within one year. It
    is therefore annualised over the number of single-year rows in the group,
    ``1 - (1 - q_group) ** (1 / n_years)``, before the 1/12 monthly conversion.
    Both conversions assume a constant hazard within the interval.
    """
    # Work on a copy: the slice would otherwise be assigned into below, which
    # triggers pandas' chained-assignment warnings.
    table = df.iloc[2:-1, 0:4].copy()
    table.columns = ["age", "prob_death_x_x1", "n_survive_to_x", "n_deaths_x_x1"]

    # The starting age is the leading integer of the label:
    # "0–1" -> 0, "99–100" -> 99, "100 and over" -> 100.
    age_text = table["age"].astype(str)
    table["age"] = age_text.str.extract(r"^\s*(\d+)", expand=False).astype(int)

    # The raw sheet columns also hold header text, so pandas reads them as
    # object dtype; cast explicitly so the arithmetic below is done on floats.
    for col in ("prob_death_x_x1", "n_survive_to_x", "n_deaths_x_x1"):
        table[col] = table[col].astype(float)

    prob_death_1y_y = table["prob_death_x_x1"]
    prob_death_1y_m = 1 - (1 - prob_death_1y_y) ** (1 / 12)  # yearly -> monthly

    table["age_5y"] = (table["age"] // 5) * 5
    by_group = table.groupby("age_5y")
    deaths_5y = by_group["n_deaths_x_x1"].sum()  # N deaths in the interval
    surv_5y = by_group["n_survive_to_x"].max()  # N alive at the start of the interval
    n_years = by_group.size()  # single-year rows per group (1 for the open-ended last row)

    # Probability of dying anywhere in the interval; clip guards against
    # rounding pushing the ratio marginally outside [0, 1].
    prob_death_interval = (deaths_5y / surv_5y).clip(lower=0.0, upper=1.0)

    prob_death_5y_y = 1 - (1 - prob_death_interval) ** (1 / n_years)  # interval -> yearly
    prob_death_5y_m = 1 - (1 - prob_death_5y_y) ** (1 / 12)  # yearly -> monthly

    return prob_death_1y_y, prob_death_1y_m, prob_death_5y_y, prob_death_5y_m

In [17]:
# Each call yields four series: 1-year yearly, 1-year monthly, 5-year yearly, 5-year monthly.
px_1y_m_y, px_1y_m_m, px_5y_m_y, px_5y_m_m = transform_to_5y_age_groups(male)
px_1y_f_y, px_1y_f_m, px_5y_f_y, px_5y_f_m = transform_to_5y_age_groups(female)

# Table keyed by the starting age of each 5-year group (0, 5, ..., 100).
lifetable_5y = pd.DataFrame(
    dict(
        age=np.arange(0, 105, 5),
        prob_death_male_y=px_5y_m_y,
        prob_death_female_y=px_5y_f_y,
        prob_death_male_m=px_5y_m_m,
        prob_death_female_m=px_5y_f_m,
    )
)

# Table keyed by single year of age (0, 1, ..., 100).
lifetable_1y = pd.DataFrame(
    dict(
        age=np.arange(0, 101, 1),
        prob_death_male_y=px_1y_m_y,
        prob_death_female_y=px_1y_f_y,
        prob_death_male_m=px_1y_m_m,
        prob_death_female_m=px_1y_f_m,
    )
)


In [18]:
# Write both life tables to CSV under <DATA_DIR>/lifetables.
LT_DIR = os.path.join(DATA_DIR, "lifetables")
# to_csv does not create missing parent directories, so make sure the output
# folder exists (no-op when it is already there).
os.makedirs(LT_DIR, exist_ok=True)
lifetable_5y.to_csv(os.path.join(LT_DIR, "lifetable_5y.csv"), index=False, header=True)
lifetable_1y.to_csv(os.path.join(LT_DIR, "lifetable_1y.csv"), index=False, header=True)