# Life expectancy: loading and interpolation

In [None]:
    # # 1. life expectancy
    # # -------------------
    
    # # load worldbank and unwpp life expectancy data 
    # meta, worldbank = load_worldbank_data()
    # meta, unwpp = load_unwpp_data()

    # # unpack values
    # df_countries, df_regions = meta
    # df_worldbank_country, df_worldbank_region = worldbank
    # df_unwpp_country, df_unwpp_region = unwpp

    # # manipulate unwpp data to get life expectancy values at birth, interpolate for each year
    # df_birthyears, df_life_expectancy_5 = get_life_expectancies(
    #     df_unwpp_country
    # )
    

In [None]:

# update to UN WPP 2020 

def load_unwpp_data():
    """Load UN WPP2019 life expectancy at age 5, labelled with World Bank metadata.

    Input data is extracted from the UNWPP2019 single age life tables, which up to
    age 100 for both sexes combined provide a set of values showing the mortality
    experience of a hypothetical group of infants born at the same time and subject
    throughout their lifetime to the specific mortality rates of a given year, from
    1950 to 2020.

    ex: expectation of life at age x, i.e. the average number of years lived
    subsequent to age x by those reaching age x.
    source: https://population.un.org/wpp/Download/Standard/Mortality/
    see also: https://population.un.org/wpp/Download/Standard/CSV/

    Life expectancy at age 5, defined as years left to live (ex), is expressed for
    5-year brackets (1950-1955, 1955-1960 ... 2015-2020). Here each value is
    assigned to the central year of its bracket (1952, 1957 ... 2017). Note that
    the output is complete for countries, but missing for some regions.

    Input:
        Excel file from the UNWPP2019 life tables (only life expectancy e(x) at
        age 5 as years left to live), preprocessed to match the format of the
        World Bank data (see .m script). Country and region names are taken from
        the World Bank workbook, so the rows of both workbooks are assumed to be
        in the same order.

    Returns:
        meta:  (df, df)   metadata on countries (df_countries) and on regions
                          (df_regions), both indexed by name
        unwpp: (df, df)   life expectancy expressed as years left to live at age 5
                          per country (df_unwpp_country) and per region
                          (df_unwpp_region), every 5 years between 1952 and 2017
                          (years as index, names as columns)
    """
    regions_sheet = 'world regions'

    # get metadata from WorldBank data; both sheets are read in a single pass
    # (sheet_name as a list returns a dict keyed by the requested sheets)
    worldbank_sheets = pd.read_excel(
        os.path.join(pkg_dir, 'data/world_bank/world_bank_life_expectancy_by_country.xls'),
        sheet_name=[0, regions_sheet],
        header=None,
    )
    worldbank_country_meta = worldbank_sheets[0].iloc[:, :4].values
    worldbank_region_meta = worldbank_sheets[regions_sheet].iloc[:, :2].values

    # convert metadata in usable dataframes
    df_countries = pd.DataFrame(
        worldbank_country_meta,
        columns=['name', 'abbreviation', 'region', 'incomegroup'],
    ).set_index('name')
    df_regions = pd.DataFrame(
        worldbank_region_meta,
        columns=['name', 'abbreviation'],
    ).set_index('name')

    # load United Nations life expectancy at age 5 data, defined as years left to live
    # assume block is 5 instead of reported 6 years to avoid overlap and take the
    # middle of that 5-year block (so 1952 for period 1950-1955).
    # Note: 5 still has to be subtracted to get the birth year of the 5-year old
    # (a 5-year old in 1952 was born in 1947 and we need the latter). This is done
    # in a later function.
    # xlsx file preprocessed from 'WPP2019_MORT_F16_1_LIFE_EXPECTANCY_BY_AGE_BOTH_SEXES_orig.xls'
    unwpp_years = np.arange(1952, 2017 + 5, 5)  # 1952, 1957, ..., 2017
    unwpp_sheets = pd.read_excel(
        os.path.join(pkg_dir, 'data/UN_WPP/WPP2019_MORT_F16_1_LIFE_EXPECTANCY_BY_AGE_BOTH_SEXES.xlsx'),
        sheet_name=[0, regions_sheet],
        header=None,
    )

    # skip the leading metadata columns (4 for countries, 2 for regions) and
    # transpose so that years become the index and names become the columns
    unwpp_country_data = unwpp_sheets[0].values[:, 4:]
    df_unwpp_country = pd.DataFrame(
        data=unwpp_country_data.transpose(),
        index=unwpp_years,
        columns=worldbank_country_meta[:, 0],
    )

    unwpp_region_data = unwpp_sheets[regions_sheet].values[:, 2:]
    df_unwpp_region = pd.DataFrame(
        data=unwpp_region_data.transpose(),
        index=unwpp_years,
        columns=worldbank_region_meta[:, 0],
    )

    # manually adjust country names with accent problems (mis-decoded UTF-8);
    # the keys must stay exactly as they appear in the source workbook
    correct_names = {
        "CÃ´te d'Ivoire": "Côte d'Ivoire",
        'SÃ£o TomÃ© and Principe': 'São Tomé and Principe',
    }
    df_unwpp_country = df_unwpp_country.rename(columns=correct_names)
    df_countries = df_countries.rename(index=correct_names)

    # bundle for communication
    meta = (df_countries, df_regions)
    unwpp = (df_unwpp_country, df_unwpp_region)

    return meta, unwpp